From a82cebf7022316a35ee16a52bc1477ec3eabf5fd Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Fri, 20 Jan 2017 12:43:27 -0500 Subject: [PATCH 1/9] Adds a couple of metrics computed on UMIs --- src/main/java/picard/sam/UmiMetrics.java | 90 ++++++++++++++++ .../UmiAwareDuplicateSetIterator.java | 120 +++++++++++++++++++-- .../UmiAwareMarkDuplicatesWithMateCigar.java | 24 ++++- .../UmiAwareMarkDuplicatesWithMateCigarTest.java | 87 +++++++++++++++ .../UmiAwareMarkDuplicatesWithMateCigarTester.java | 40 +++++++ 5 files changed, 349 insertions(+), 12 deletions(-) create mode 100644 src/main/java/picard/sam/UmiMetrics.java diff --git a/src/main/java/picard/sam/UmiMetrics.java b/src/main/java/picard/sam/UmiMetrics.java new file mode 100644 index 000000000..45d817d78 --- /dev/null +++ b/src/main/java/picard/sam/UmiMetrics.java @@ -0,0 +1,90 @@ +/* + * The MIT License + * + * Copyright (c) 2017 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.sam; + +import htsjdk.samtools.metrics.MetricBase; + +/** + * Metrics that are calculated during the process of marking duplicates + * within a stream of SAMRecords using the UmiAwareDuplicateSetIterator. + */ +public class UmiMetrics extends MetricBase { + // Number of bases in each UMI + public int UMI_LENGTH; + + // Number of different UMI sequences observed + public long OBSERVED_UNIQUE_UMIS = 0; + + // Number of different inferred UMI sequences derived + public long INFERRED_UNIQUE_UMIS = 0; + + // Number of errors inferred by comparing the observed and inferred UMIs + public long OBSERVED_BASE_ERRORS = 0; + + // Number of duplicate sets found before taking UMIs into account + public long DUPLICATE_SETS_WITHOUT_UMI = 0; + + // Number of duplicate sets found after taking UMIs into account + public long DUPLICATE_SETS_WITH_UMI = 0; + + // This is a measure of entropy (in base 4) to indicate the amount of + // information (given as effective bases) provided by each observed UMI + public double EFFECTIVE_LENGTH_OF_OBSERVED_UMIS = 0; + + public double EFFECTIVE_LENGTH_OF_INFERRED_UMIS = 0; + + // Estimation of Phred scaled quality scores for UMIs + public double ESTIMATED_BASE_QUALITY_OF_UMIS; + + // MLE estimation of reads that will be falsely labeled as being part of a duplicate set due to UMI collisions + public double EXPECTED_READS_WITH_UMI_COLLISION; + + // Phred scale of MLE estimate of collision rate + public double UMI_COLLISION_Q; + + public void estimateBaseQualities(final int observedUmiBases) { + ESTIMATED_BASE_QUALITY_OF_UMIS = -10.0*Math.log10((double) OBSERVED_BASE_ERRORS / (double) observedUmiBases); + } + + public UmiMetrics() {} + + public UmiMetrics(final int length, final int observedUniqueUmis, final int inferredUniqueUmis, + final int observedBaseErrors, final int duplicateSetsWithoutUmi, + final int duplicateSetsWithUmi, final double effectiveLengthOfInferredUmis, + final double effectiveLengthOfObservedUmis, final double estimatedBaseQualityOfUmis, + final double expectedUmiCollisions, final double umiCollisionQ) { + UMI_LENGTH = length; + OBSERVED_UNIQUE_UMIS = observedUniqueUmis; + INFERRED_UNIQUE_UMIS = inferredUniqueUmis; + OBSERVED_BASE_ERRORS = observedBaseErrors; + DUPLICATE_SETS_WITHOUT_UMI = duplicateSetsWithoutUmi; + DUPLICATE_SETS_WITH_UMI = duplicateSetsWithUmi; + EFFECTIVE_LENGTH_OF_INFERRED_UMIS = effectiveLengthOfInferredUmis; + EFFECTIVE_LENGTH_OF_OBSERVED_UMIS = effectiveLengthOfObservedUmis; + ESTIMATED_BASE_QUALITY_OF_UMIS = estimatedBaseQualityOfUmis; + EXPECTED_READS_WITH_UMI_COLLISION = expectedUmiCollisions; + UMI_COLLISION_Q = umiCollisionQ; + } +} diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java index ee1ddfa76..018dcc996 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java @@ -34,13 +34,19 @@ package picard.sam.markduplicates; +import com.google.common.math.LongMath; import htsjdk.samtools.DuplicateSet; import htsjdk.samtools.DuplicateSetIterator; +import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.Histogram; import picard.PicardException; +import picard.sam.UmiMetrics; import java.util.*; +import static htsjdk.samtools.util.StringUtil.hammingDistance; + /** * UmiAwareDuplicateSetIterator is an iterator that wraps a duplicate set iterator * in such a way that each duplicate set may be broken up into subsets according @@ -55,22 +61,33 @@ private final String inferredUmiTag; private final boolean allowMissingUmis; private boolean isOpen = false; + private UmiMetrics metrics; + private boolean haveWeSeenFirstRead = false; + private long duplicateSetsWithUmi = 0; + private long duplicateSetsWithoutUmi = 0; + private double expectedCollisions = 0; + private int observedBaseErrors = 0; + + private Histogram observedUmis = new Histogram<>(); + private Histogram inferredUmis = new Histogram<>(); /** * Creates a UMI aware duplicate set iterator * - * @param wrappedIterator UMI aware duplicate set iterator is a wrapper + * @param wrappedIterator UMI aware duplicate set iterator is a wrapper * @param maxEditDistanceToJoin The edit distance between UMIs that will be used to union UMIs into groups - * @param umiTag The tag used in the bam file that designates the UMI - * @param assignedUmiTag The tag in the bam file that designates the assigned UMI + * @param umiTag The tag used in the bam file that designates the UMI + * @param assignedUmiTag The tag in the bam file that designates the assigned UMI */ UmiAwareDuplicateSetIterator(final DuplicateSetIterator wrappedIterator, final int maxEditDistanceToJoin, - final String umiTag, final String assignedUmiTag, final boolean allowMissingUmis) { + final String umiTag, final String assignedUmiTag, final boolean allowMissingUmis, + final UmiMetrics metrics) { this.wrappedIterator = wrappedIterator; this.maxEditDistanceToJoin = maxEditDistanceToJoin; this.umiTag = umiTag; this.inferredUmiTag = assignedUmiTag; this.allowMissingUmis = allowMissingUmis; + this.metrics = metrics; isOpen = true; nextSetsIterator = Collections.emptyIterator(); } @@ -79,18 +96,21 @@ public void close() { isOpen = false; wrappedIterator.close(); + + if (metrics.UMI_LENGTH > 0) { + collectMetrics(); + } + } @Override public boolean hasNext() { - if(!isOpen) { + if (!isOpen) { return false; - } - else { - if(nextSetsIterator.hasNext() || wrappedIterator.hasNext()) { + } else { + if (nextSetsIterator.hasNext() || wrappedIterator.hasNext()) { return true; - } - else { + } else { isOpen = false; return false; } @@ -119,6 +139,84 @@ private void process(final DuplicateSet set) { } final UmiGraph umiGraph = new UmiGraph(set, umiTag, inferredUmiTag, allowMissingUmis); - nextSetsIterator = umiGraph.joinUmisIntoDuplicateSets(maxEditDistanceToJoin).iterator(); + + List duplicateSets = umiGraph.joinUmisIntoDuplicateSets(maxEditDistanceToJoin); + + // Collect statistics on numbers of observed and inferred UMIs + // and total numbers of observed and inferred UMIs + for (DuplicateSet ds : duplicateSets) { + List records = ds.getRecords(); + SAMRecord representativeRead = ds.getRepresentative(); + + String inferredUmi = representativeRead.getStringAttribute(inferredUmiTag); + + for (SAMRecord rec : records) { + String currentUmi = rec.getStringAttribute(umiTag); + + if (currentUmi != null) { + if (!haveWeSeenFirstRead) { + metrics.UMI_LENGTH = currentUmi.length(); + haveWeSeenFirstRead = true; + } + + metrics.OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi); + observedBaseErrors += metrics.UMI_LENGTH; + + observedUmis.increment(currentUmi); + inferredUmis.increment(inferredUmi); + } + } + + duplicateSetsWithUmi++; + } + duplicateSetsWithoutUmi++; + + // For each duplicate set estimate the number of expected umi collisions + double Z = 0; + for (int k = 0; k <= maxEditDistanceToJoin; k++) { + if (metrics.UMI_LENGTH > 0) { + Z = Z + LongMath.binomial(metrics.UMI_LENGTH, k) * Math.pow(3.0, k); + } + } + expectedCollisions = expectedCollisions + + (1 - Math.pow(1 - Z / Math.pow(4, metrics.UMI_LENGTH), duplicateSets.size() - 1)) * duplicateSets.size(); + + nextSetsIterator = duplicateSets.iterator(); + } + + private void collectMetrics() { + metrics.OBSERVED_UNIQUE_UMIS = observedUmis.size(); + metrics.INFERRED_UNIQUE_UMIS = inferredUmis.size(); + + metrics.EFFECTIVE_LENGTH_OF_OBSERVED_UMIS = effectiveNumberOfBases(observedUmis); + metrics.EFFECTIVE_LENGTH_OF_INFERRED_UMIS = effectiveNumberOfBases(inferredUmis); + + metrics.DUPLICATE_SETS_WITH_UMI = duplicateSetsWithUmi; + metrics.DUPLICATE_SETS_WITHOUT_UMI = duplicateSetsWithoutUmi; + metrics.EXPECTED_READS_WITH_UMI_COLLISION = expectedCollisions; + + metrics.UMI_COLLISION_Q = -10 * Math.log10(expectedCollisions / inferredUmis.size()); + + double Z = 0; + for (int k = 0; k <= maxEditDistanceToJoin; k++) { + Z = Z + LongMath.binomial(metrics.UMI_LENGTH, k) * Math.pow(3.0, k); + } + + metrics.estimateBaseQualities(observedBaseErrors); + } + + private double effectiveNumberOfBases(Histogram observations) { + double H = 0.0; + + double totalObservations = observations.getSumOfValues(); + for (Histogram.Bin observation : observations.values()) { + double p = observation.getValue() / totalObservations; + H = H - p * Math.log(p); + } + + // Convert to log base 4 so that the entropy is now a measure + // of the effective number of DNA bases. If we used log(2.0) + // our result would be in bits. + return H / Math.log(4.0); } } diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java index aba8799f9..c88834f3c 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java @@ -27,10 +27,14 @@ import htsjdk.samtools.DuplicateSet; import htsjdk.samtools.DuplicateSetIterator; import htsjdk.samtools.SAMRecordDuplicateComparator; +import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.*; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.programgroups.Alpha; +import picard.sam.UmiMetrics; + +import java.io.File; /** * This is a simple tool to mark duplicates making use of UMIs in the reads. @@ -65,6 +69,9 @@ @Option(shortName = "MAX_EDIT_DISTANCE_TO_JOIN", doc = "Largest edit distance that UMIs must have in order to be considered as coming from distinct source molecules.", optional = true) public int MAX_EDIT_DISTANCE_TO_JOIN = 1; + @Option(shortName = "UMI_METRICS", doc = "UMI Metrics", optional = true) + public File UMI_METRICS_FILE = null; + @Option(shortName = "UMI_TAG_NAME", doc = "Tag name to use for UMI", optional = true) public String UMI_TAG_NAME = "RX"; @@ -78,6 +85,21 @@ public boolean ALLOW_MISSING_UMIS = false; private final Log log = Log.getInstance(UmiAwareMarkDuplicatesWithMateCigar.class); + private UmiMetrics metrics = new UmiMetrics(); + + @Override + protected int doWork() { + // Perform Mark Duplicates work + int retval=super.doWork(); + + // Write metrics specific to UMIs + if(UMI_METRICS_FILE != null) { + MetricsFile metricsFile = getMetricsFile(); + metricsFile.addMetric(metrics); + metricsFile.write(UMI_METRICS_FILE); + } + return retval; + } @Override protected CloseableIterator getDuplicateSetIterator(final SamHeaderAndIterator headerAndIterator, final SAMRecordDuplicateComparator comparator) { @@ -85,6 +107,6 @@ new DuplicateSetIterator(headerAndIterator.iterator, headerAndIterator.header, false, - comparator), MAX_EDIT_DISTANCE_TO_JOIN, UMI_TAG_NAME, ASSIGNED_UMI_TAG, ALLOW_MISSING_UMIS); + comparator), MAX_EDIT_DISTANCE_TO_JOIN, UMI_TAG_NAME, ASSIGNED_UMI_TAG, ALLOW_MISSING_UMIS, metrics); } } diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java index b78bbab73..9157536c8 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java @@ -27,6 +27,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import picard.PicardException; +import picard.sam.UmiMetrics; import java.util.*; @@ -163,4 +164,90 @@ public void testBadUmis(List umis, List assignedUmi, final List< } tester.setExpectedAssignedUmis(assignedUmi).runTest(); } + + @DataProvider(name = "testUmiMetricsDataProvider") + private Object[][] testUmiMetricsDataProvider() { + + // Calculate values of metrics by hand to ensure they are right + // effectiveLength4_1 is the effective UMI length observing 5 UMIs where 4 are the same + double effectiveLength4_1 = -(4./5.)*Math.log(4./5.)/Math.log(4.) -(1./5.)*Math.log(1./5.)/Math.log(4.); + // effectiveLength4_1 is the effective UMI length observing 5 UMIs where 3 are the same and the other two are + // unique + double effectiveLength3_1_1 = -(3./5.)*Math.log(3./5.)/Math.log(4.) -2*(1./5.)*Math.log(1./5.)/Math.log(4.); + // estimatedBaseQualityk_n is the phred scaled base quality score where k of n bases are incorrect + double estimatedBaseQuality1_20 = -10*Math.log10(1./20.); + double estimatedBaseQuality3_20 = -10*Math.log10(3./20.); + + // expectedCollisionsi_j_k where edit distance to join is i, duplicate sets is k and UMI length is k. + double expectedCollisions1_2_4 = 13./64.; + double expectedCollisions0_16_2 = (1. - Math.pow(1. - 1./16., 15))*16.*2.; + double collisionQ1_2_4 = -10*Math.log10(expectedCollisions1_2_4/2.0); + double collisionQ0_16_2 = -10*Math.log10(expectedCollisions0_16_2/16.0); + return new Object[][]{{ + // Test basic error correction using edit distance of 1 + Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}), // Observed UMI + Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAA"}), // Expected inferred UMI + Arrays.asList(new Boolean[]{false, true, false, true, true}), // Should it be marked as duplicate? + 1, // Edit Distance to Join + new UmiMetrics(4, // UMI_LENGTH + 3, // OBSERVED_UNIQUE_UMIS + 2, // INFERRED_UNIQUE_UMIS + 2, // OBSERVED_BASE_ERRORS (Note: This is 2 rather than 1 because we are using paired end reads) + 2, // DUPLICATE_SETS_WITHOUT_UMI + 4, // DUPLICATE_SETS_WITH_UMI + effectiveLength4_1, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS + effectiveLength3_1_1, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS + estimatedBaseQuality1_20, // ESTIMATED_BASE_QUALITY_OF_UMIS + expectedCollisions1_2_4, // EXPECTED_READS_WITH_UMI_COLLISION + collisionQ1_2_4) // UMI_COLLISION_Q + }, { + // Test basic error correction using edit distance of 2 + Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}), + Arrays.asList(new String[]{"AAAA", "AAAA", "AAAA", "AAAA", "AAAA"}), + Arrays.asList(new Boolean[]{false, true, true, true, true}), + 2, + new UmiMetrics(4, // UMI_LENGTH + 3, // OBSERVED_UNIQUE_UMIS + 1, // INFERRED_UNIQUE_UMIS + 6, // OBSERVED_BASE_ERRORS + 2, // DUPLICATE_SETS_WITHOUT_UMI + 2, // DUPLICATE_SETS_WITH_UMI + 0.0, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS + effectiveLength3_1_1, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS + estimatedBaseQuality3_20, // ESTIMATED_BASE_QUALITY_OF_UMIS + 0, // EXPECTED_READS_WITH_UMI_COLLISION + Double.NaN) // UMI_COLLISION_Q + }, { + // Test maximum entropy (EFFECTIVE_LENGTH_OF_INFERRED_UMIS) + Arrays.asList(new String[]{"AA", "AT", "AC", "AG", "TA", "TT", "TC", "TG", "CA", "CT", "CC", "CG", "GA", "GT", "GC", "GG"}), + Arrays.asList(new String[]{"AA", "AT", "AC", "AG", "TA", "TT", "TC", "TG", "CA", "CT", "CC", "CG", "GA", "GT", "GC", "GG"}), + Arrays.asList(new Boolean[]{false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}), + 0, + new UmiMetrics(2, // UMI_LENGTH + 16, // OBSERVED_UNIQUE_UMIS + 16, // INFERRED_UNIQUE_UMIS + 0, // OBSERVED_BASE_ERRORS + 2, // DUPLICATE_SETS_WITHOUT_UMI + 16, // DUPLICATE_SETS_WITH_UMI + 2.0, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS + 2, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS + Double.NaN, // ESTIMATED_BASE_QUALITY_OF_UMIS + expectedCollisions0_16_2, // EXPECTED_READS_WITH_UMI_COLLISION + collisionQ0_16_2) // UMI_COLLISION_Q + }}; + } + + @Test(dataProvider = "testUmiMetricsDataProvider") + public void testUmiMetrics(List umis, List assignedUmi, final List isDuplicate, + final int editDistanceToJoin, final UmiMetrics expectedMetrics) { + UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(false); + tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin); + + for(int i = 0;i < umis.size();i++) { + tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i)); + } + tester.setExpectedAssignedUmis(assignedUmi); + tester.setExpectedMetrics(expectedMetrics); + tester.runTest(); + } } diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java index bdf166b03..dae5be479 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java @@ -27,9 +27,14 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.metrics.MetricsFile; import org.testng.Assert; import picard.cmdline.CommandLineProgram; +import picard.sam.UmiMetrics; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; import java.util.List; /** @@ -42,6 +47,8 @@ public class UmiAwareMarkDuplicatesWithMateCigarTester extends AbstractMarkDuplicatesCommandLineProgramTester { private int readNameCounter = 0; private List expectedAssignedUmis; + private UmiMetrics expectedMetrics; + private File umiMetricsFile; // This tag is only used for testing, it indicates what we expect to see in the inferred UMI tag. private final String expectedUmiTag = "RE"; @@ -54,6 +61,9 @@ } UmiAwareMarkDuplicatesWithMateCigarTester(final boolean allowMissingUmis) { + umiMetricsFile = new File(getOutputDir(), "umi_metrics.txt"); + addArg("UMI_METRICS_FILE=" + umiMetricsFile); + if (allowMissingUmis) { addArg("ALLOW_MISSING_UMIS=" + true); } @@ -146,6 +156,11 @@ UmiAwareMarkDuplicatesWithMateCigarTester setExpectedAssignedUmis(final List> metricsOutput = new MetricsFile>(); + try{ + metricsOutput.read(new FileReader(umiMetricsFile)); + } + catch (final FileNotFoundException ex) { + System.err.println("Metrics file not found: " + ex); + } + double tolerance = 1e-6; + Assert.assertEquals(metricsOutput.getMetrics().size(), 1); + final UmiMetrics observedMetrics = metricsOutput.getMetrics().get(0); + Assert.assertEquals(observedMetrics.UMI_LENGTH, expectedMetrics.UMI_LENGTH, "UMI_LENGTH does not match expected"); + Assert.assertEquals(observedMetrics.OBSERVED_UNIQUE_UMIS, expectedMetrics.OBSERVED_UNIQUE_UMIS, "OBSERVED_UNIQUE_UMIS does not match expected"); + Assert.assertEquals(observedMetrics.INFERRED_UNIQUE_UMIS, expectedMetrics.INFERRED_UNIQUE_UMIS, "INFERRED_UNIQUE_UMIS does not match expected"); + Assert.assertEquals(observedMetrics.OBSERVED_BASE_ERRORS, expectedMetrics.OBSERVED_BASE_ERRORS, "OBSERVED_BASE_ERRORS does not match expected"); + Assert.assertEquals(observedMetrics.DUPLICATE_SETS_WITHOUT_UMI, expectedMetrics.DUPLICATE_SETS_WITHOUT_UMI, "DUPLICATE_SETS_WITHOUT_UMI does not match expected"); + Assert.assertEquals(observedMetrics.EFFECTIVE_LENGTH_OF_INFERRED_UMIS, expectedMetrics.EFFECTIVE_LENGTH_OF_INFERRED_UMIS, tolerance, "EFFECTIVE_LENGTH_OF_INFERRED_UMIS does not match expected"); + Assert.assertEquals(observedMetrics.EFFECTIVE_LENGTH_OF_OBSERVED_UMIS, expectedMetrics.EFFECTIVE_LENGTH_OF_OBSERVED_UMIS, tolerance, "EFFECTIVE_LENGTH_OF_OBSERVED_UMIS does not match expected"); + Assert.assertEquals(observedMetrics.ESTIMATED_BASE_QUALITY_OF_UMIS, expectedMetrics.ESTIMATED_BASE_QUALITY_OF_UMIS, tolerance, "ESTIMATED_BASE_QUALITY_OF_UMIS does not match expected"); + Assert.assertEquals(observedMetrics.EXPECTED_READS_WITH_UMI_COLLISION, expectedMetrics.EXPECTED_READS_WITH_UMI_COLLISION, tolerance, "EXPECTED_READS_WITH_UMI_COLLISION does not match expected"); + Assert.assertEquals(observedMetrics.UMI_COLLISION_Q, expectedMetrics.UMI_COLLISION_Q, tolerance, "UMI_COLLISION_Q does not match expected"); + } + // Also do tests from AbstractMarkDuplicatesCommandLineProgramTester super.test(); } From efb5860045cbeae3fae930779311c6f8e76f93c2 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 13 Feb 2017 16:46:42 -0500 Subject: [PATCH 2/9] Addressing Yossi's comments --- src/main/java/picard/sam/UmiMetrics.java | 34 +++++++++------ .../UmiAwareDuplicateSetIterator.java | 49 ++++++++++++---------- .../UmiAwareMarkDuplicatesWithMateCigar.java | 16 +++---- .../UmiAwareMarkDuplicatesWithMateCigarTest.java | 2 +- .../UmiAwareMarkDuplicatesWithMateCigarTester.java | 14 +++---- 5 files changed, 65 insertions(+), 50 deletions(-) diff --git a/src/main/java/picard/sam/UmiMetrics.java b/src/main/java/picard/sam/UmiMetrics.java index 45d817d78..2c2194b6d 100644 --- a/src/main/java/picard/sam/UmiMetrics.java +++ b/src/main/java/picard/sam/UmiMetrics.java @@ -49,23 +49,31 @@ // Number of duplicate sets found after taking UMIs into account public long DUPLICATE_SETS_WITH_UMI = 0; - // This is a measure of entropy (in base 4) to indicate the amount of - // information (given as effective bases) provided by each observed UMI - public double EFFECTIVE_LENGTH_OF_OBSERVED_UMIS = 0; - - public double EFFECTIVE_LENGTH_OF_INFERRED_UMIS = 0; + // Entropy (in base 4) of the observed UMI sequences, indicating the + // effective number of bases in the UMIs. If this is significantly + // smaller than UMI_LENGTH, it indicates that the UMIs are not + // distributed uniformly. + public double OBSERVED_UMI_ENTROPY = 0; + + // Entropy (in base 4) of the inferred UMI sequences, indicating the + // effective number of bases in the inferred UMIs. If this is significantly + // smaller than UMI_LENGTH, it indicates that the UMIs are not + // distributed uniformly. + public double INFERRED_UMI_ENTROPY = 0; // Estimation of Phred scaled quality scores for UMIs - public double ESTIMATED_BASE_QUALITY_OF_UMIS; + public double UMI_BASE_QUALITIES; - // MLE estimation of reads that will be falsely labeled as being part of a duplicate set due to UMI collisions - public double EXPECTED_READS_WITH_UMI_COLLISION; + // MLE estimation of reads that will be falsely labeled as being part of a duplicate set due to UMI collisions. + // This estimate is computed over every duplicate set, and effectively accounts for the distribution of duplicate + // set sizes. + public double UMI_COLLISION_EST; // Phred scale of MLE estimate of collision rate public double UMI_COLLISION_Q; public void estimateBaseQualities(final int observedUmiBases) { - ESTIMATED_BASE_QUALITY_OF_UMIS = -10.0*Math.log10((double) OBSERVED_BASE_ERRORS / (double) observedUmiBases); + UMI_BASE_QUALITIES = -10.0*Math.log10((double) OBSERVED_BASE_ERRORS / (double) observedUmiBases); } public UmiMetrics() {} @@ -81,10 +89,10 @@ public UmiMetrics(final int length, final int observedUniqueUmis, final int infe OBSERVED_BASE_ERRORS = observedBaseErrors; DUPLICATE_SETS_WITHOUT_UMI = duplicateSetsWithoutUmi; DUPLICATE_SETS_WITH_UMI = duplicateSetsWithUmi; - EFFECTIVE_LENGTH_OF_INFERRED_UMIS = effectiveLengthOfInferredUmis; - EFFECTIVE_LENGTH_OF_OBSERVED_UMIS = effectiveLengthOfObservedUmis; - ESTIMATED_BASE_QUALITY_OF_UMIS = estimatedBaseQualityOfUmis; - EXPECTED_READS_WITH_UMI_COLLISION = expectedUmiCollisions; + INFERRED_UMI_ENTROPY = effectiveLengthOfInferredUmis; + OBSERVED_UMI_ENTROPY = effectiveLengthOfObservedUmis; + UMI_BASE_QUALITIES = estimatedBaseQualityOfUmis; + UMI_COLLISION_EST = expectedUmiCollisions; UMI_COLLISION_Q = umiCollisionQ; } } diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java index 018dcc996..674033acd 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java @@ -66,7 +66,7 @@ private long duplicateSetsWithUmi = 0; private long duplicateSetsWithoutUmi = 0; private double expectedCollisions = 0; - private int observedBaseErrors = 0; + private int observedUmiBases = 0; private Histogram observedUmis = new Histogram<>(); private Histogram inferredUmis = new Histogram<>(); @@ -154,32 +154,43 @@ private void process(final DuplicateSet set) { String currentUmi = rec.getStringAttribute(umiTag); if (currentUmi != null) { + // All UMIs should be the same length if (!haveWeSeenFirstRead) { metrics.UMI_LENGTH = currentUmi.length(); haveWeSeenFirstRead = true; + } else { + if (metrics.UMI_LENGTH != currentUmi.length()) { + throw new PicardException("UMIs of differing lengths were found."); + } } metrics.OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi); - observedBaseErrors += metrics.UMI_LENGTH; + observedUmiBases += metrics.UMI_LENGTH; observedUmis.increment(currentUmi); inferredUmis.increment(inferredUmi); } } - - duplicateSetsWithUmi++; } + duplicateSetsWithUmi += duplicateSets.size(); duplicateSetsWithoutUmi++; - // For each duplicate set estimate the number of expected umi collisions - double Z = 0; + // For each duplicate set estimate the number of expected UMI collisions + double nWaysUmisCanDiffer = 0; // Number of ways two UMIs may contain errors, but be considered the same for (int k = 0; k <= maxEditDistanceToJoin; k++) { if (metrics.UMI_LENGTH > 0) { - Z = Z + LongMath.binomial(metrics.UMI_LENGTH, k) * Math.pow(3.0, k); + nWaysUmisCanDiffer = nWaysUmisCanDiffer + LongMath.binomial(metrics.UMI_LENGTH, k) * Math.pow(3.0, k); } } + + // The probability of two non-duplicate UMIs are drawn from a uniform distribution being correctly labeled as + // not belonging to the same UMI family is given as, pCorrectlyLabeled = nWaysUmisCanDiffer / 4^metrics.UMI_LENGTH. + // Estimate of probability all members in the duplicate set are correctly labeled is given by + // pAllMembersCorrectlyLabeled = (1 - pCorrectlyLabeled)^duplicateSets.size(). + // The expected number of reads incorrectly labeled as belonging to a duplicate set is given as, + // (1 - pAllMembersCorrectlyLabeled) * duplicateSets.size(). expectedCollisions = expectedCollisions + - (1 - Math.pow(1 - Z / Math.pow(4, metrics.UMI_LENGTH), duplicateSets.size() - 1)) * duplicateSets.size(); + (1 - Math.pow(1 - nWaysUmisCanDiffer / Math.pow(4, metrics.UMI_LENGTH), duplicateSets.size() - 1)) * duplicateSets.size(); nextSetsIterator = duplicateSets.iterator(); } @@ -188,35 +199,29 @@ private void collectMetrics() { metrics.OBSERVED_UNIQUE_UMIS = observedUmis.size(); metrics.INFERRED_UNIQUE_UMIS = inferredUmis.size(); - metrics.EFFECTIVE_LENGTH_OF_OBSERVED_UMIS = effectiveNumberOfBases(observedUmis); - metrics.EFFECTIVE_LENGTH_OF_INFERRED_UMIS = effectiveNumberOfBases(inferredUmis); + metrics.OBSERVED_UMI_ENTROPY = effectiveNumberOfBases(observedUmis); + metrics.INFERRED_UMI_ENTROPY = effectiveNumberOfBases(inferredUmis); metrics.DUPLICATE_SETS_WITH_UMI = duplicateSetsWithUmi; metrics.DUPLICATE_SETS_WITHOUT_UMI = duplicateSetsWithoutUmi; - metrics.EXPECTED_READS_WITH_UMI_COLLISION = expectedCollisions; + metrics.UMI_COLLISION_EST = expectedCollisions; metrics.UMI_COLLISION_Q = -10 * Math.log10(expectedCollisions / inferredUmis.size()); - - double Z = 0; - for (int k = 0; k <= maxEditDistanceToJoin; k++) { - Z = Z + LongMath.binomial(metrics.UMI_LENGTH, k) * Math.pow(3.0, k); - } - - metrics.estimateBaseQualities(observedBaseErrors); + metrics.estimateBaseQualities(observedUmiBases); } private double effectiveNumberOfBases(Histogram observations) { - double H = 0.0; + double entropyBase4 = 0.0; double totalObservations = observations.getSumOfValues(); for (Histogram.Bin observation : observations.values()) { - double p = observation.getValue() / totalObservations; - H = H - p * Math.log(p); + double pObservation = observation.getValue() / totalObservations; + entropyBase4 = entropyBase4 - pObservation * Math.log(pObservation); } // Convert to log base 4 so that the entropy is now a measure // of the effective number of DNA bases. If we used log(2.0) // our result would be in bits. - return H / Math.log(4.0); + return entropyBase4 / Math.log(4.0); } } diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java index c88834f3c..698e4a1ff 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java @@ -69,8 +69,9 @@ @Option(shortName = "MAX_EDIT_DISTANCE_TO_JOIN", doc = "Largest edit distance that UMIs must have in order to be considered as coming from distinct source molecules.", optional = true) public int MAX_EDIT_DISTANCE_TO_JOIN = 1; - @Option(shortName = "UMI_METRICS", doc = "UMI Metrics", optional = true) - public File UMI_METRICS_FILE = null; + // The UMI_METRICS file provides various statistical measurements collected about the UMIs during deduplication. + @Option(shortName = "UMI_METRICS", doc = "UMI Metrics") + public File UMI_METRICS_FILE; @Option(shortName = "UMI_TAG_NAME", doc = "Tag name to use for UMI", optional = true) public String UMI_TAG_NAME = "RX"; @@ -89,15 +90,16 @@ @Override protected int doWork() { + // Before we do anything, make sure the UMI_METRICS_FILE can be written to. + IOUtil.assertFileIsWritable(UMI_METRICS_FILE); + // Perform Mark Duplicates work int retval=super.doWork(); // Write metrics specific to UMIs - if(UMI_METRICS_FILE != null) { - MetricsFile metricsFile = getMetricsFile(); - metricsFile.addMetric(metrics); - metricsFile.write(UMI_METRICS_FILE); - } + MetricsFile metricsFile = getMetricsFile(); + metricsFile.addMetric(metrics); + metricsFile.write(UMI_METRICS_FILE); return retval; } diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java index 9157536c8..8336873e3 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java @@ -243,7 +243,7 @@ public void testUmiMetrics(List umis, List assignedUmi, final Li UmiAwareMarkDuplicatesWithMateCigarTester tester = getTester(false); tester.addArg("MAX_EDIT_DISTANCE_TO_JOIN=" + editDistanceToJoin); - for(int i = 0;i < umis.size();i++) { + for( int i = 0;i < umis.size();i++ ) { tester.addMatePairWithUmi(umis.get(i), assignedUmi.get(i), isDuplicate.get(i), isDuplicate.get(i)); } tester.setExpectedAssignedUmis(assignedUmi); diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java index dae5be479..677dfef78 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java @@ -48,7 +48,7 @@ private int readNameCounter = 0; private List expectedAssignedUmis; private UmiMetrics expectedMetrics; - private File umiMetricsFile; + private File umiMetricsFile = new File(getOutputDir(), "umi_metrics.txt"); // This tag is only used for testing, it indicates what we expect to see in the inferred UMI tag. private final String expectedUmiTag = "RE"; @@ -57,11 +57,11 @@ // AbstractMarkDuplicatesCommandLineProgramTester. Since those tests use // reads that don't have UMIs we enable the ALLOW_MISSING_UMIS option. UmiAwareMarkDuplicatesWithMateCigarTester() { + addArg("UMI_METRICS_FILE=" + umiMetricsFile); addArg("ALLOW_MISSING_UMIS=" + true); } UmiAwareMarkDuplicatesWithMateCigarTester(final boolean allowMissingUmis) { - umiMetricsFile = new File(getOutputDir(), "umi_metrics.txt"); addArg("UMI_METRICS_FILE=" + umiMetricsFile); if (allowMissingUmis) { @@ -174,7 +174,7 @@ public void test() { if(expectedMetrics != null) { // Check the values written to metrics.txt against our input expectations final MetricsFile> metricsOutput = new MetricsFile>(); - try{ + try { metricsOutput.read(new FileReader(umiMetricsFile)); } catch (final FileNotFoundException ex) { @@ -188,10 +188,10 @@ public void test() { Assert.assertEquals(observedMetrics.INFERRED_UNIQUE_UMIS, expectedMetrics.INFERRED_UNIQUE_UMIS, "INFERRED_UNIQUE_UMIS does not match expected"); Assert.assertEquals(observedMetrics.OBSERVED_BASE_ERRORS, expectedMetrics.OBSERVED_BASE_ERRORS, "OBSERVED_BASE_ERRORS does not match expected"); Assert.assertEquals(observedMetrics.DUPLICATE_SETS_WITHOUT_UMI, expectedMetrics.DUPLICATE_SETS_WITHOUT_UMI, "DUPLICATE_SETS_WITHOUT_UMI does not match expected"); - Assert.assertEquals(observedMetrics.EFFECTIVE_LENGTH_OF_INFERRED_UMIS, expectedMetrics.EFFECTIVE_LENGTH_OF_INFERRED_UMIS, tolerance, "EFFECTIVE_LENGTH_OF_INFERRED_UMIS does not match expected"); - Assert.assertEquals(observedMetrics.EFFECTIVE_LENGTH_OF_OBSERVED_UMIS, expectedMetrics.EFFECTIVE_LENGTH_OF_OBSERVED_UMIS, tolerance, "EFFECTIVE_LENGTH_OF_OBSERVED_UMIS does not match expected"); - Assert.assertEquals(observedMetrics.ESTIMATED_BASE_QUALITY_OF_UMIS, expectedMetrics.ESTIMATED_BASE_QUALITY_OF_UMIS, tolerance, "ESTIMATED_BASE_QUALITY_OF_UMIS does not match expected"); - Assert.assertEquals(observedMetrics.EXPECTED_READS_WITH_UMI_COLLISION, expectedMetrics.EXPECTED_READS_WITH_UMI_COLLISION, tolerance, "EXPECTED_READS_WITH_UMI_COLLISION does not match expected"); + Assert.assertEquals(observedMetrics.INFERRED_UMI_ENTROPY, expectedMetrics.INFERRED_UMI_ENTROPY, tolerance, "INFERRED_UMI_ENTROPY does not match expected"); + Assert.assertEquals(observedMetrics.OBSERVED_UMI_ENTROPY, expectedMetrics.OBSERVED_UMI_ENTROPY, tolerance, "OBSERVED_UMI_ENTROPY does not match expected"); + Assert.assertEquals(observedMetrics.UMI_BASE_QUALITIES, expectedMetrics.UMI_BASE_QUALITIES, tolerance, "UMI_BASE_QUALITIES does not match expected"); + Assert.assertEquals(observedMetrics.UMI_COLLISION_EST, expectedMetrics.UMI_COLLISION_EST, tolerance, "UMI_COLLISION_EST does not match expected"); Assert.assertEquals(observedMetrics.UMI_COLLISION_Q, expectedMetrics.UMI_COLLISION_Q, tolerance, "UMI_COLLISION_Q does not match expected"); } From 0956d6ece7f1fa479f2361950bf0add3327ed534 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 27 Feb 2017 17:49:54 -0500 Subject: [PATCH 3/9] Responding to Yossi's comments, tests should pass, reorganized metrics --- src/main/java/picard/sam/UmiMetrics.java | 64 +++++++++++++++++++ .../UmiAwareDuplicateSetIterator.java | 73 ++-------------------- .../UmiAwareMarkDuplicatesWithMateCigar.java | 2 +- .../UmiAwareMarkDuplicatesWithMateCigarTest.java | 2 +- .../UmiAwareMarkDuplicatesWithMateCigarTester.java | 2 +- 5 files changed, 71 insertions(+), 72 deletions(-) diff --git a/src/main/java/picard/sam/UmiMetrics.java b/src/main/java/picard/sam/UmiMetrics.java index 2c2194b6d..d6c1983af 100644 --- a/src/main/java/picard/sam/UmiMetrics.java +++ b/src/main/java/picard/sam/UmiMetrics.java @@ -24,13 +24,20 @@ package picard.sam; +import com.google.common.math.LongMath; import htsjdk.samtools.metrics.MetricBase; +import htsjdk.samtools.util.Histogram; +import static htsjdk.samtools.util.StringUtil.hammingDistance; /** * Metrics that are calculated during the process of marking duplicates * within a stream of SAMRecords using the UmiAwareDuplicateSetIterator. */ public class UmiMetrics extends MetricBase { + private Histogram observedUmis = new Histogram<>(); + private Histogram inferredUmis = new Histogram<>(); + private int observedUmiBases = 0; + // Number of bases in each UMI public int UMI_LENGTH; @@ -95,4 +102,61 @@ public UmiMetrics(final int length, final int observedUniqueUmis, final int infe UMI_COLLISION_EST = expectedUmiCollisions; UMI_COLLISION_Q = umiCollisionQ; } + + public void calculateDerivedFields() { + OBSERVED_UNIQUE_UMIS = observedUmis.size(); + INFERRED_UNIQUE_UMIS = inferredUmis.size(); + + OBSERVED_UMI_ENTROPY = effectiveNumberOfBases(observedUmis); + INFERRED_UMI_ENTROPY = effectiveNumberOfBases(inferredUmis); + + UMI_COLLISION_Q = -10 * Math.log10(UMI_COLLISION_EST / inferredUmis.size()); + estimateBaseQualities(observedUmiBases); + } + + private double effectiveNumberOfBases(Histogram observations) { + double entropyBase4 = 0.0; + + double totalObservations = observations.getSumOfValues(); + for (Histogram.Bin observation : observations.values()) { + double pObservation = observation.getValue() / totalObservations; + entropyBase4 = entropyBase4 - pObservation * Math.log(pObservation); + } + + // Convert to log base 4 so that the entropy is now a measure + // of the effective number of DNA bases. If we used log(2.0) + // our result would be in bits. + return entropyBase4 / Math.log(4.0); + } + + public void updateDuplicateSetMetrics(final int maxEditDistanceToJoin, final int duplicateSetSize) { + DUPLICATE_SETS_WITH_UMI += duplicateSetSize; + DUPLICATE_SETS_WITHOUT_UMI++; + + // For each duplicate set estimate the number of expected UMI collisions + double nWaysUmisCanDiffer = 0; // Number of ways two UMIs may contain errors, but be considered the same + for (int k = 0; k <= maxEditDistanceToJoin; k++) { + if (UMI_LENGTH > 0) { + nWaysUmisCanDiffer = nWaysUmisCanDiffer + LongMath.binomial(UMI_LENGTH, k) * Math.pow(3.0, k); + } + } + + // The probability of two non-duplicate UMIs are drawn from a uniform distribution being correctly labeled as + // not belonging to the same UMI family is given as, pCorrectlyLabeled = nWaysUmisCanDiffer / 4^metrics.UMI_LENGTH. + // Estimate of probability all members in the duplicate set are correctly labeled is given by + // pAllMembersCorrectlyLabeled = (1 - pCorrectlyLabeled)^duplicateSets.size(). + // The expected number of reads incorrectly labeled as belonging to a duplicate set is given as, + // (1 - pAllMembersCorrectlyLabeled) * duplicateSets.size(). + UMI_COLLISION_EST = UMI_COLLISION_EST + + (1 - Math.pow(1 - nWaysUmisCanDiffer / Math.pow(4, UMI_LENGTH), duplicateSetSize - 1)) * duplicateSetSize; + } + + public void updateUmiRecordMetrics(final String currentUmi, final String inferredUmi) { + OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi); + + observedUmiBases += UMI_LENGTH; + observedUmis.increment(currentUmi); + inferredUmis.increment(inferredUmi); + } } + diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java index 674033acd..f6ea1378b 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2016 The Broad Institute + * Copyright (c) 2017 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -34,19 +34,15 @@ package picard.sam.markduplicates; -import com.google.common.math.LongMath; import htsjdk.samtools.DuplicateSet; import htsjdk.samtools.DuplicateSetIterator; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.Histogram; import picard.PicardException; import picard.sam.UmiMetrics; import java.util.*; -import static htsjdk.samtools.util.StringUtil.hammingDistance; - /** * UmiAwareDuplicateSetIterator is an iterator that wraps a duplicate set iterator * in such a way that each duplicate set may be broken up into subsets according @@ -63,13 +59,7 @@ private boolean isOpen = false; private UmiMetrics metrics; private boolean haveWeSeenFirstRead = false; - private long duplicateSetsWithUmi = 0; - private long duplicateSetsWithoutUmi = 0; private double expectedCollisions = 0; - private int observedUmiBases = 0; - - private Histogram observedUmis = new Histogram<>(); - private Histogram inferredUmis = new Histogram<>(); /** * Creates a UMI aware duplicate set iterator @@ -98,9 +88,8 @@ public void close() { wrappedIterator.close(); if (metrics.UMI_LENGTH > 0) { - collectMetrics(); + metrics.calculateDerivedFields(); } - } @Override @@ -147,7 +136,6 @@ private void process(final DuplicateSet set) { for (DuplicateSet ds : duplicateSets) { List records = ds.getRecords(); SAMRecord representativeRead = ds.getRepresentative(); - String inferredUmi = representativeRead.getStringAttribute(inferredUmiTag); for (SAMRecord rec : records) { @@ -163,65 +151,12 @@ private void process(final DuplicateSet set) { throw new PicardException("UMIs of differing lengths were found."); } } - - metrics.OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi); - observedUmiBases += metrics.UMI_LENGTH; - - observedUmis.increment(currentUmi); - inferredUmis.increment(inferredUmi); + metrics.updateUmiRecordMetrics(currentUmi, inferredUmi); } } } - duplicateSetsWithUmi += duplicateSets.size(); - duplicateSetsWithoutUmi++; - - // For each duplicate set estimate the number of expected UMI collisions - double nWaysUmisCanDiffer = 0; // Number of ways two UMIs may contain errors, but be considered the same - for (int k = 0; k <= maxEditDistanceToJoin; k++) { - if (metrics.UMI_LENGTH > 0) { - nWaysUmisCanDiffer = nWaysUmisCanDiffer + LongMath.binomial(metrics.UMI_LENGTH, k) * Math.pow(3.0, k); - } - } - - // The probability of two non-duplicate UMIs are drawn from a uniform distribution being correctly labeled as - // not belonging to the same UMI family is given as, pCorrectlyLabeled = nWaysUmisCanDiffer / 4^metrics.UMI_LENGTH. - // Estimate of probability all members in the duplicate set are correctly labeled is given by - // pAllMembersCorrectlyLabeled = (1 - pCorrectlyLabeled)^duplicateSets.size(). - // The expected number of reads incorrectly labeled as belonging to a duplicate set is given as, - // (1 - pAllMembersCorrectlyLabeled) * duplicateSets.size(). - expectedCollisions = expectedCollisions + - (1 - Math.pow(1 - nWaysUmisCanDiffer / Math.pow(4, metrics.UMI_LENGTH), duplicateSets.size() - 1)) * duplicateSets.size(); + metrics.updateDuplicateSetMetrics(maxEditDistanceToJoin, duplicateSets.size()); nextSetsIterator = duplicateSets.iterator(); } - - private void collectMetrics() { - metrics.OBSERVED_UNIQUE_UMIS = observedUmis.size(); - metrics.INFERRED_UNIQUE_UMIS = inferredUmis.size(); - - metrics.OBSERVED_UMI_ENTROPY = effectiveNumberOfBases(observedUmis); - metrics.INFERRED_UMI_ENTROPY = effectiveNumberOfBases(inferredUmis); - - metrics.DUPLICATE_SETS_WITH_UMI = duplicateSetsWithUmi; - metrics.DUPLICATE_SETS_WITHOUT_UMI = duplicateSetsWithoutUmi; - metrics.UMI_COLLISION_EST = expectedCollisions; - - metrics.UMI_COLLISION_Q = -10 * Math.log10(expectedCollisions / inferredUmis.size()); - metrics.estimateBaseQualities(observedUmiBases); - } - - private double effectiveNumberOfBases(Histogram observations) { - double entropyBase4 = 0.0; - - double totalObservations = observations.getSumOfValues(); - for (Histogram.Bin observation : observations.values()) { - double pObservation = observation.getValue() / totalObservations; - entropyBase4 = entropyBase4 - pObservation * Math.log(pObservation); - } - - // Convert to log base 4 so that the entropy is now a measure - // of the effective number of DNA bases. If we used log(2.0) - // our result would be in bits. - return entropyBase4 / Math.log(4.0); - } } diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java index 698e4a1ff..a136bf3c6 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2016 The Broad Institute + * Copyright (c) 2017 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java index 8336873e3..86909a12f 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2016 The Broad Institute + * Copyright (c) 2017 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java index 677dfef78..0d4e8fcee 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java @@ -171,7 +171,7 @@ public void test() { } } - if(expectedMetrics != null) { + if (expectedMetrics != null) { // Check the values written to metrics.txt against our input expectations final MetricsFile> metricsOutput = new MetricsFile>(); try { From 83e4c38da477aa93812091ab9de1f7c7e7b5404a Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 13 Mar 2017 14:54:00 -0400 Subject: [PATCH 4/9] Fixed up some Javadoc --- src/main/java/picard/sam/UmiMetrics.java | 42 +++++++++++++++++--------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/main/java/picard/sam/UmiMetrics.java b/src/main/java/picard/sam/UmiMetrics.java index d6c1983af..42c49e1c5 100644 --- a/src/main/java/picard/sam/UmiMetrics.java +++ b/src/main/java/picard/sam/UmiMetrics.java @@ -38,45 +38,49 @@ private Histogram inferredUmis = new Histogram<>(); private int observedUmiBases = 0; - // Number of bases in each UMI + /** Number of bases in each UMI */ public int UMI_LENGTH; - // Number of different UMI sequences observed + /** Number of different UMI sequences observed */ public long OBSERVED_UNIQUE_UMIS = 0; - // Number of different inferred UMI sequences derived + /** Number of different inferred UMI sequences derived */ public long INFERRED_UNIQUE_UMIS = 0; - // Number of errors inferred by comparing the observed and inferred UMIs + /** Number of errors inferred by comparing the observed and inferred UMIs */ public long OBSERVED_BASE_ERRORS = 0; - // Number of duplicate sets found before taking UMIs into account + /** Number of duplicate sets found before taking UMIs into account */ public long DUPLICATE_SETS_WITHOUT_UMI = 0; - // Number of duplicate sets found after taking UMIs into account + /** Number of duplicate sets found after taking UMIs into account */ public long DUPLICATE_SETS_WITH_UMI = 0; - // Entropy (in base 4) of the observed UMI sequences, indicating the - // effective number of bases in the UMIs. If this is significantly - // smaller than UMI_LENGTH, it indicates that the UMIs are not - // distributed uniformly. + /** + * Entropy (in base 4) of the observed UMI sequences, indicating the + * effective number of bases in the UMIs. If this is significantly + * smaller than UMI_LENGTH, it indicates that the UMIs are not + * distributed uniformly. + */ public double OBSERVED_UMI_ENTROPY = 0; - // Entropy (in base 4) of the inferred UMI sequences, indicating the - // effective number of bases in the inferred UMIs. If this is significantly - // smaller than UMI_LENGTH, it indicates that the UMIs are not - // distributed uniformly. + /** Entropy (in base 4) of the inferred UMI sequences, indicating the + * effective number of bases in the inferred UMIs. If this is significantly + * smaller than UMI_LENGTH, it indicates that the UMIs are not + * distributed uniformly. + */ public double INFERRED_UMI_ENTROPY = 0; - // Estimation of Phred scaled quality scores for UMIs + /** Estimation of Phred scaled quality scores for UMIs */ public double UMI_BASE_QUALITIES; - // MLE estimation of reads that will be falsely labeled as being part of a duplicate set due to UMI collisions. - // This estimate is computed over every duplicate set, and effectively accounts for the distribution of duplicate - // set sizes. + /** MLE estimation of reads that will be falsely labeled as being part of a duplicate set due to UMI collisions. + * This estimate is computed over every duplicate set, and effectively accounts for the distribution of duplicate + * set sizes. This is an experimental metric, and should be used with caution. + */ public double UMI_COLLISION_EST; - // Phred scale of MLE estimate of collision rate + /** Phred scale of MLE estimate of collision rate. This is an experimental metric. */ public double UMI_COLLISION_Q; public void estimateBaseQualities(final int observedUmiBases) { From 403b9034906d0b3c8909220fb4f71b784495165a Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Thu, 23 Mar 2017 16:16:57 -0400 Subject: [PATCH 5/9] Addressing Yossi's comments --- src/main/java/picard/sam/UmiMetrics.java | 69 +++------------------- .../UmiAwareDuplicateSetIterator.java | 19 ++++-- .../UmiAwareMarkDuplicatesWithMateCigarTest.java | 25 +++----- .../UmiAwareMarkDuplicatesWithMateCigarTester.java | 2 - 4 files changed, 31 insertions(+), 84 deletions(-) diff --git a/src/main/java/picard/sam/UmiMetrics.java b/src/main/java/picard/sam/UmiMetrics.java index 42c49e1c5..7a7e9f89b 100644 --- a/src/main/java/picard/sam/UmiMetrics.java +++ b/src/main/java/picard/sam/UmiMetrics.java @@ -24,9 +24,11 @@ package picard.sam; +import java.util.stream.Collectors; import com.google.common.math.LongMath; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.util.Histogram; +import htsjdk.samtools.util.QualityUtil; import static htsjdk.samtools.util.StringUtil.hammingDistance; /** @@ -34,10 +36,6 @@ * within a stream of SAMRecords using the UmiAwareDuplicateSetIterator. */ public class UmiMetrics extends MetricBase { - private Histogram observedUmis = new Histogram<>(); - private Histogram inferredUmis = new Histogram<>(); - private int observedUmiBases = 0; - /** Number of bases in each UMI */ public int UMI_LENGTH; @@ -74,26 +72,12 @@ /** Estimation of Phred scaled quality scores for UMIs */ public double UMI_BASE_QUALITIES; - /** MLE estimation of reads that will be falsely labeled as being part of a duplicate set due to UMI collisions. - * This estimate is computed over every duplicate set, and effectively accounts for the distribution of duplicate - * set sizes. This is an experimental metric, and should be used with caution. - */ - public double UMI_COLLISION_EST; - - /** Phred scale of MLE estimate of collision rate. This is an experimental metric. */ - public double UMI_COLLISION_Q; - - public void estimateBaseQualities(final int observedUmiBases) { - UMI_BASE_QUALITIES = -10.0*Math.log10((double) OBSERVED_BASE_ERRORS / (double) observedUmiBases); - } - public UmiMetrics() {} public UmiMetrics(final int length, final int observedUniqueUmis, final int inferredUniqueUmis, final int observedBaseErrors, final int duplicateSetsWithoutUmi, final int duplicateSetsWithUmi, final double effectiveLengthOfInferredUmis, - final double effectiveLengthOfObservedUmis, final double estimatedBaseQualityOfUmis, - final double expectedUmiCollisions, final double umiCollisionQ) { + final double effectiveLengthOfObservedUmis, final double estimatedBaseQualityOfUmis) { UMI_LENGTH = length; OBSERVED_UNIQUE_UMIS = observedUniqueUmis; INFERRED_UNIQUE_UMIS = inferredUniqueUmis; @@ -103,64 +87,27 @@ public UmiMetrics(final int length, final int observedUniqueUmis, final int infe INFERRED_UMI_ENTROPY = effectiveLengthOfInferredUmis; OBSERVED_UMI_ENTROPY = effectiveLengthOfObservedUmis; UMI_BASE_QUALITIES = estimatedBaseQualityOfUmis; - UMI_COLLISION_EST = expectedUmiCollisions; - UMI_COLLISION_Q = umiCollisionQ; } - public void calculateDerivedFields() { + public void calculateDerivedFields(final Histogram observedUmis, Histogram inferredUmis, long observedUmiBases) { OBSERVED_UNIQUE_UMIS = observedUmis.size(); INFERRED_UNIQUE_UMIS = inferredUmis.size(); OBSERVED_UMI_ENTROPY = effectiveNumberOfBases(observedUmis); INFERRED_UMI_ENTROPY = effectiveNumberOfBases(inferredUmis); - UMI_COLLISION_Q = -10 * Math.log10(UMI_COLLISION_EST / inferredUmis.size()); - estimateBaseQualities(observedUmiBases); + UMI_BASE_QUALITIES = QualityUtil.getPhredScoreFromErrorProbability((double) OBSERVED_BASE_ERRORS / (double) observedUmiBases); } private double effectiveNumberOfBases(Histogram observations) { - double entropyBase4 = 0.0; - double totalObservations = observations.getSumOfValues(); - for (Histogram.Bin observation : observations.values()) { - double pObservation = observation.getValue() / totalObservations; - entropyBase4 = entropyBase4 - pObservation * Math.log(pObservation); - } // Convert to log base 4 so that the entropy is now a measure // of the effective number of DNA bases. If we used log(2.0) // our result would be in bits. - return entropyBase4 / Math.log(4.0); - } - - public void updateDuplicateSetMetrics(final int maxEditDistanceToJoin, final int duplicateSetSize) { - DUPLICATE_SETS_WITH_UMI += duplicateSetSize; - DUPLICATE_SETS_WITHOUT_UMI++; - - // For each duplicate set estimate the number of expected UMI collisions - double nWaysUmisCanDiffer = 0; // Number of ways two UMIs may contain errors, but be considered the same - for (int k = 0; k <= maxEditDistanceToJoin; k++) { - if (UMI_LENGTH > 0) { - nWaysUmisCanDiffer = nWaysUmisCanDiffer + LongMath.binomial(UMI_LENGTH, k) * Math.pow(3.0, k); - } - } - - // The probability of two non-duplicate UMIs are drawn from a uniform distribution being correctly labeled as - // not belonging to the same UMI family is given as, pCorrectlyLabeled = nWaysUmisCanDiffer / 4^metrics.UMI_LENGTH. - // Estimate of probability all members in the duplicate set are correctly labeled is given by - // pAllMembersCorrectlyLabeled = (1 - pCorrectlyLabeled)^duplicateSets.size(). - // The expected number of reads incorrectly labeled as belonging to a duplicate set is given as, - // (1 - pAllMembersCorrectlyLabeled) * duplicateSets.size(). - UMI_COLLISION_EST = UMI_COLLISION_EST + - (1 - Math.pow(1 - nWaysUmisCanDiffer / Math.pow(4, UMI_LENGTH), duplicateSetSize - 1)) * duplicateSetSize; - } - - public void updateUmiRecordMetrics(final String currentUmi, final String inferredUmi) { - OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi); - - observedUmiBases += UMI_LENGTH; - observedUmis.increment(currentUmi); - inferredUmis.increment(inferredUmi); + double entropyBase4 = observations.values().stream().collect(Collectors.summingDouble( + v -> -v.getValue() / totalObservations * Math.log(v.getValue() / totalObservations))) / Math.log(4.0); + return entropyBase4; } } diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java index f6ea1378b..6354dbadd 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java @@ -38,11 +38,14 @@ import htsjdk.samtools.DuplicateSetIterator; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.Histogram; import picard.PicardException; import picard.sam.UmiMetrics; import java.util.*; +import static htsjdk.samtools.util.StringUtil.hammingDistance; + /** * UmiAwareDuplicateSetIterator is an iterator that wraps a duplicate set iterator * in such a way that each duplicate set may be broken up into subsets according @@ -59,7 +62,9 @@ private boolean isOpen = false; private UmiMetrics metrics; private boolean haveWeSeenFirstRead = false; - private double expectedCollisions = 0; + private Histogram observedUmis = new Histogram<>(); + private Histogram inferredUmis = new Histogram<>(); + private long observedUmiBases = 0; /** * Creates a UMI aware duplicate set iterator @@ -88,7 +93,7 @@ public void close() { wrappedIterator.close(); if (metrics.UMI_LENGTH > 0) { - metrics.calculateDerivedFields(); + metrics.calculateDerivedFields(observedUmis, inferredUmis, observedUmiBases); } } @@ -151,11 +156,17 @@ private void process(final DuplicateSet set) { throw new PicardException("UMIs of differing lengths were found."); } } - metrics.updateUmiRecordMetrics(currentUmi, inferredUmi); +// metrics.updateUmiRecordMetrics(currentUmi, inferredUmi); + metrics.OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi); + observedUmiBases += currentUmi.length(); + observedUmis.increment(currentUmi); + inferredUmis.increment(inferredUmi); } } } - metrics.updateDuplicateSetMetrics(maxEditDistanceToJoin, duplicateSets.size()); + + metrics.DUPLICATE_SETS_WITH_UMI += duplicateSets.size(); + metrics.DUPLICATE_SETS_WITHOUT_UMI++; nextSetsIterator = duplicateSets.iterator(); } diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java index 86909a12f..393acaf14 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java @@ -26,6 +26,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import htsjdk.samtools.util.QualityUtil; import picard.PicardException; import picard.sam.UmiMetrics; @@ -174,15 +175,11 @@ public void testBadUmis(List umis, List assignedUmi, final List< // effectiveLength4_1 is the effective UMI length observing 5 UMIs where 3 are the same and the other two are // unique double effectiveLength3_1_1 = -(3./5.)*Math.log(3./5.)/Math.log(4.) -2*(1./5.)*Math.log(1./5.)/Math.log(4.); + // estimatedBaseQualityk_n is the phred scaled base quality score where k of n bases are incorrect - double estimatedBaseQuality1_20 = -10*Math.log10(1./20.); - double estimatedBaseQuality3_20 = -10*Math.log10(3./20.); - - // expectedCollisionsi_j_k where edit distance to join is i, duplicate sets is k and UMI length is k. - double expectedCollisions1_2_4 = 13./64.; - double expectedCollisions0_16_2 = (1. - Math.pow(1. - 1./16., 15))*16.*2.; - double collisionQ1_2_4 = -10*Math.log10(expectedCollisions1_2_4/2.0); - double collisionQ0_16_2 = -10*Math.log10(expectedCollisions0_16_2/16.0); + double estimatedBaseQuality1_20 = QualityUtil.getPhredScoreFromErrorProbability(1./20.); + double estimatedBaseQuality3_20 = QualityUtil.getPhredScoreFromErrorProbability(3./20.); + return new Object[][]{{ // Test basic error correction using edit distance of 1 Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}), // Observed UMI @@ -197,9 +194,7 @@ public void testBadUmis(List umis, List assignedUmi, final List< 4, // DUPLICATE_SETS_WITH_UMI effectiveLength4_1, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS effectiveLength3_1_1, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS - estimatedBaseQuality1_20, // ESTIMATED_BASE_QUALITY_OF_UMIS - expectedCollisions1_2_4, // EXPECTED_READS_WITH_UMI_COLLISION - collisionQ1_2_4) // UMI_COLLISION_Q + estimatedBaseQuality1_20) // ESTIMATED_BASE_QUALITY_OF_UMIS }, { // Test basic error correction using edit distance of 2 Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAT"}), @@ -214,9 +209,7 @@ public void testBadUmis(List umis, List assignedUmi, final List< 2, // DUPLICATE_SETS_WITH_UMI 0.0, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS effectiveLength3_1_1, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS - estimatedBaseQuality3_20, // ESTIMATED_BASE_QUALITY_OF_UMIS - 0, // EXPECTED_READS_WITH_UMI_COLLISION - Double.NaN) // UMI_COLLISION_Q + estimatedBaseQuality3_20) // ESTIMATED_BASE_QUALITY_OF_UMIS }, { // Test maximum entropy (EFFECTIVE_LENGTH_OF_INFERRED_UMIS) Arrays.asList(new String[]{"AA", "AT", "AC", "AG", "TA", "TT", "TC", "TG", "CA", "CT", "CC", "CG", "GA", "GT", "GC", "GG"}), @@ -231,9 +224,7 @@ public void testBadUmis(List umis, List assignedUmi, final List< 16, // DUPLICATE_SETS_WITH_UMI 2.0, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS 2, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS - Double.NaN, // ESTIMATED_BASE_QUALITY_OF_UMIS - expectedCollisions0_16_2, // EXPECTED_READS_WITH_UMI_COLLISION - collisionQ0_16_2) // UMI_COLLISION_Q + -1) // ESTIMATED_BASE_QUALITY_OF_UMIS }}; } diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java index 0d4e8fcee..d4930b79a 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java @@ -191,8 +191,6 @@ public void test() { Assert.assertEquals(observedMetrics.INFERRED_UMI_ENTROPY, expectedMetrics.INFERRED_UMI_ENTROPY, tolerance, "INFERRED_UMI_ENTROPY does not match expected"); Assert.assertEquals(observedMetrics.OBSERVED_UMI_ENTROPY, expectedMetrics.OBSERVED_UMI_ENTROPY, tolerance, "OBSERVED_UMI_ENTROPY does not match expected"); Assert.assertEquals(observedMetrics.UMI_BASE_QUALITIES, expectedMetrics.UMI_BASE_QUALITIES, tolerance, "UMI_BASE_QUALITIES does not match expected"); - Assert.assertEquals(observedMetrics.UMI_COLLISION_EST, expectedMetrics.UMI_COLLISION_EST, tolerance, "UMI_COLLISION_EST does not match expected"); - Assert.assertEquals(observedMetrics.UMI_COLLISION_Q, expectedMetrics.UMI_COLLISION_Q, tolerance, "UMI_COLLISION_Q does not match expected"); } // Also do tests from AbstractMarkDuplicatesCommandLineProgramTester From fd0364928abadac1c1ab6c471f363017fbf7dfdc Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Wed, 29 Mar 2017 11:50:30 -0400 Subject: [PATCH 6/9] Addressing Yossi's comments again. --- .../markduplicates/UmiAwareDuplicateSetIterator.java | 17 ++++++++--------- .../UmiAwareMarkDuplicatesWithMateCigar.java | 1 - .../picard/sam/{ => markduplicates}/UmiMetrics.java | 16 +++++++++------- src/main/java/picard/util/MathUtil.java | 4 ++++ .../UmiAwareMarkDuplicatesWithMateCigarTest.java | 1 - .../UmiAwareMarkDuplicatesWithMateCigarTester.java | 3 +-- 6 files changed, 22 insertions(+), 20 deletions(-) rename src/main/java/picard/sam/{ => markduplicates}/UmiMetrics.java (91%) diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java index 6354dbadd..985365fd7 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java @@ -38,9 +38,7 @@ import htsjdk.samtools.DuplicateSetIterator; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.Histogram; import picard.PicardException; -import picard.sam.UmiMetrics; import java.util.*; @@ -62,8 +60,7 @@ private boolean isOpen = false; private UmiMetrics metrics; private boolean haveWeSeenFirstRead = false; - private Histogram observedUmis = new Histogram<>(); - private Histogram inferredUmis = new Histogram<>(); + private long observedUmiBases = 0; /** @@ -93,7 +90,7 @@ public void close() { wrappedIterator.close(); if (metrics.UMI_LENGTH > 0) { - metrics.calculateDerivedFields(observedUmis, inferredUmis, observedUmiBases); + metrics.calculateDerivedFields(observedUmiBases); } } @@ -156,17 +153,19 @@ private void process(final DuplicateSet set) { throw new PicardException("UMIs of differing lengths were found."); } } -// metrics.updateUmiRecordMetrics(currentUmi, inferredUmi); + + // Update UMI metrics associated with each record metrics.OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi); observedUmiBases += currentUmi.length(); - observedUmis.increment(currentUmi); - inferredUmis.increment(inferredUmi); + metrics.observedUmis.increment(currentUmi); + metrics.inferredUmis.increment(inferredUmi); } } } + // Update UMI metrics associated with each duplicate set metrics.DUPLICATE_SETS_WITH_UMI += duplicateSets.size(); - metrics.DUPLICATE_SETS_WITHOUT_UMI++; + metrics.DUPLICATE_SETS_IGNORING_UMI++; nextSetsIterator = duplicateSets.iterator(); } diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java index a136bf3c6..384fdfcbf 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java @@ -32,7 +32,6 @@ import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; import picard.cmdline.programgroups.Alpha; -import picard.sam.UmiMetrics; import java.io.File; diff --git a/src/main/java/picard/sam/UmiMetrics.java b/src/main/java/picard/sam/markduplicates/UmiMetrics.java similarity index 91% rename from src/main/java/picard/sam/UmiMetrics.java rename to src/main/java/picard/sam/markduplicates/UmiMetrics.java index 7a7e9f89b..7110a0ce7 100644 --- a/src/main/java/picard/sam/UmiMetrics.java +++ b/src/main/java/picard/sam/markduplicates/UmiMetrics.java @@ -22,20 +22,22 @@ * THE SOFTWARE. */ -package picard.sam; +package picard.sam.markduplicates; import java.util.stream.Collectors; -import com.google.common.math.LongMath; import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.QualityUtil; -import static htsjdk.samtools.util.StringUtil.hammingDistance; +import picard.util.MathUtil; /** * Metrics that are calculated during the process of marking duplicates * within a stream of SAMRecords using the UmiAwareDuplicateSetIterator. */ public class UmiMetrics extends MetricBase { + Histogram observedUmis = new Histogram<>(); + Histogram inferredUmis = new Histogram<>(); + /** Number of bases in each UMI */ public int UMI_LENGTH; @@ -49,7 +51,7 @@ public long OBSERVED_BASE_ERRORS = 0; /** Number of duplicate sets found before taking UMIs into account */ - public long DUPLICATE_SETS_WITHOUT_UMI = 0; + public long DUPLICATE_SETS_IGNORING_UMI = 0; /** Number of duplicate sets found after taking UMIs into account */ public long DUPLICATE_SETS_WITH_UMI = 0; @@ -82,14 +84,14 @@ public UmiMetrics(final int length, final int observedUniqueUmis, final int infe OBSERVED_UNIQUE_UMIS = observedUniqueUmis; INFERRED_UNIQUE_UMIS = inferredUniqueUmis; OBSERVED_BASE_ERRORS = observedBaseErrors; - DUPLICATE_SETS_WITHOUT_UMI = duplicateSetsWithoutUmi; + DUPLICATE_SETS_IGNORING_UMI = duplicateSetsWithoutUmi; DUPLICATE_SETS_WITH_UMI = duplicateSetsWithUmi; INFERRED_UMI_ENTROPY = effectiveLengthOfInferredUmis; OBSERVED_UMI_ENTROPY = effectiveLengthOfObservedUmis; UMI_BASE_QUALITIES = estimatedBaseQualityOfUmis; } - public void calculateDerivedFields(final Histogram observedUmis, Histogram inferredUmis, long observedUmiBases) { + public void calculateDerivedFields(long observedUmiBases) { OBSERVED_UNIQUE_UMIS = observedUmis.size(); INFERRED_UNIQUE_UMIS = inferredUmis.size(); @@ -106,7 +108,7 @@ private double effectiveNumberOfBases(Histogram observations) { // of the effective number of DNA bases. If we used log(2.0) // our result would be in bits. double entropyBase4 = observations.values().stream().collect(Collectors.summingDouble( - v -> -v.getValue() / totalObservations * Math.log(v.getValue() / totalObservations))) / Math.log(4.0); + v -> -v.getValue() / totalObservations * Math.log(v.getValue() / totalObservations))) / MathUtil.LOG_4_BASE_E; return entropyBase4; } } diff --git a/src/main/java/picard/util/MathUtil.java b/src/main/java/picard/util/MathUtil.java index acaf80c75..b890c733f 100644 --- a/src/main/java/picard/util/MathUtil.java +++ b/src/main/java/picard/util/MathUtil.java @@ -41,6 +41,10 @@ /** The double value closest to 1 while still being less than 1. */ public static final double MAX_PROB_BELOW_ONE = 0.9999999999999999d; + /** Constant to convert between the natural base e and 4.0. Useful for + * entropy calculations on DNA. */ + public static final double LOG_4_BASE_E = Math.log(4.0); + /** * this function mimics the behavior of log_1p but resulting in log _base 10_ of (1+x) instead of natural log of 1+x */ diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java index 393acaf14..506e09b63 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java @@ -28,7 +28,6 @@ import org.testng.annotations.Test; import htsjdk.samtools.util.QualityUtil; import picard.PicardException; -import picard.sam.UmiMetrics; import java.util.*; diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java index d4930b79a..a15b6b032 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java @@ -30,7 +30,6 @@ import htsjdk.samtools.metrics.MetricsFile; import org.testng.Assert; import picard.cmdline.CommandLineProgram; -import picard.sam.UmiMetrics; import java.io.File; import java.io.FileNotFoundException; @@ -187,7 +186,7 @@ public void test() { Assert.assertEquals(observedMetrics.OBSERVED_UNIQUE_UMIS, expectedMetrics.OBSERVED_UNIQUE_UMIS, "OBSERVED_UNIQUE_UMIS does not match expected"); Assert.assertEquals(observedMetrics.INFERRED_UNIQUE_UMIS, expectedMetrics.INFERRED_UNIQUE_UMIS, "INFERRED_UNIQUE_UMIS does not match expected"); Assert.assertEquals(observedMetrics.OBSERVED_BASE_ERRORS, expectedMetrics.OBSERVED_BASE_ERRORS, "OBSERVED_BASE_ERRORS does not match expected"); - Assert.assertEquals(observedMetrics.DUPLICATE_SETS_WITHOUT_UMI, expectedMetrics.DUPLICATE_SETS_WITHOUT_UMI, "DUPLICATE_SETS_WITHOUT_UMI does not match expected"); + Assert.assertEquals(observedMetrics.DUPLICATE_SETS_IGNORING_UMI, expectedMetrics.DUPLICATE_SETS_IGNORING_UMI, "DUPLICATE_SETS_WITHOUT_UMI does not match expected"); Assert.assertEquals(observedMetrics.INFERRED_UMI_ENTROPY, expectedMetrics.INFERRED_UMI_ENTROPY, tolerance, "INFERRED_UMI_ENTROPY does not match expected"); Assert.assertEquals(observedMetrics.OBSERVED_UMI_ENTROPY, expectedMetrics.OBSERVED_UMI_ENTROPY, tolerance, "OBSERVED_UMI_ENTROPY does not match expected"); Assert.assertEquals(observedMetrics.UMI_BASE_QUALITIES, expectedMetrics.UMI_BASE_QUALITIES, tolerance, "UMI_BASE_QUALITIES does not match expected"); From 3a130b4e15309aef73d09161e4cf5b2adacff8d0 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 3 Apr 2017 14:09:37 -0400 Subject: [PATCH 7/9] Fixing lambda expression --- src/main/java/picard/sam/markduplicates/UmiMetrics.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/java/picard/sam/markduplicates/UmiMetrics.java b/src/main/java/picard/sam/markduplicates/UmiMetrics.java index 7110a0ce7..f1f279a37 100644 --- a/src/main/java/picard/sam/markduplicates/UmiMetrics.java +++ b/src/main/java/picard/sam/markduplicates/UmiMetrics.java @@ -107,9 +107,11 @@ private double effectiveNumberOfBases(Histogram observations) { // Convert to log base 4 so that the entropy is now a measure // of the effective number of DNA bases. If we used log(2.0) // our result would be in bits. - double entropyBase4 = observations.values().stream().collect(Collectors.summingDouble( - v -> -v.getValue() / totalObservations * Math.log(v.getValue() / totalObservations))) / MathUtil.LOG_4_BASE_E; - return entropyBase4; + double entropyBaseE = observations.values().stream().collect(Collectors.summingDouble( + v -> {double p = v.getValue() / totalObservations; + return -p * Math.log(p);})); + + return entropyBaseE / MathUtil.LOG_4_BASE_E; } } From bc0b6efe6c1f9a1648acdbc321b8f762a23c2d0c Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Tue, 4 Apr 2017 12:54:14 -0400 Subject: [PATCH 8/9] Making a few variables private, and removing argument from calculateDerivedFields --- .../markduplicates/UmiAwareDuplicateSetIterator.java | 5 ++--- .../java/picard/sam/markduplicates/UmiMetrics.java | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java index 985365fd7..2d0290a3f 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java @@ -90,7 +90,7 @@ public void close() { wrappedIterator.close(); if (metrics.UMI_LENGTH > 0) { - metrics.calculateDerivedFields(observedUmiBases); + metrics.calculateDerivedFields(); } } @@ -157,8 +157,7 @@ private void process(final DuplicateSet set) { // Update UMI metrics associated with each record metrics.OBSERVED_BASE_ERRORS += hammingDistance(currentUmi, inferredUmi); observedUmiBases += currentUmi.length(); - metrics.observedUmis.increment(currentUmi); - metrics.inferredUmis.increment(inferredUmi); + metrics.addUmiObservation(currentUmi, inferredUmi); } } } diff --git a/src/main/java/picard/sam/markduplicates/UmiMetrics.java b/src/main/java/picard/sam/markduplicates/UmiMetrics.java index f1f279a37..dbc5ec695 100644 --- a/src/main/java/picard/sam/markduplicates/UmiMetrics.java +++ b/src/main/java/picard/sam/markduplicates/UmiMetrics.java @@ -35,8 +35,9 @@ * within a stream of SAMRecords using the UmiAwareDuplicateSetIterator. */ public class UmiMetrics extends MetricBase { - Histogram observedUmis = new Histogram<>(); - Histogram inferredUmis = new Histogram<>(); + private final Histogram observedUmis = new Histogram<>(); + private final Histogram inferredUmis = new Histogram<>(); + private long observedUmiBases = 0; /** Number of bases in each UMI */ public int UMI_LENGTH; @@ -91,7 +92,7 @@ public UmiMetrics(final int length, final int observedUniqueUmis, final int infe UMI_BASE_QUALITIES = estimatedBaseQualityOfUmis; } - public void calculateDerivedFields(long observedUmiBases) { + public void calculateDerivedFields() { OBSERVED_UNIQUE_UMIS = observedUmis.size(); INFERRED_UNIQUE_UMIS = inferredUmis.size(); @@ -101,6 +102,17 @@ public void calculateDerivedFields(long observedUmiBases) { UMI_BASE_QUALITIES = QualityUtil.getPhredScoreFromErrorProbability((double) OBSERVED_BASE_ERRORS / (double) observedUmiBases); } + /** + * Add an observation of a UMI to the metrics + * @param observedUmi String containing the observed UMI + * @param inferredUmi String containing the UMI inferred after error correcting the observed UMI + */ + public void addUmiObservation(String observedUmi, String inferredUmi) { + observedUmis.increment(observedUmi); + inferredUmis.increment(inferredUmi); + observedUmiBases += observedUmi.length(); + } + private double effectiveNumberOfBases(Histogram observations) { double totalObservations = observations.getSumOfValues(); From 03a002d0f0c65cbbf6cf6a5b1928002f484c2cbb Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Sun, 16 Apr 2017 15:40:22 -0400 Subject: [PATCH 9/9] Responding to Yossi's comments... --- .../sam/markduplicates/UmiAwareDuplicateSetIterator.java | 14 ++++++-------- .../UmiAwareMarkDuplicatesWithMateCigar.java | 2 +- src/main/java/picard/sam/markduplicates/UmiMetrics.java | 8 ++++---- .../UmiAwareMarkDuplicatesWithMateCigarTest.java | 8 ++++---- .../UmiAwareMarkDuplicatesWithMateCigarTester.java | 5 +++-- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java index 2d0290a3f..40def6f4b 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareDuplicateSetIterator.java @@ -66,7 +66,7 @@ /** * Creates a UMI aware duplicate set iterator * - * @param wrappedIterator UMI aware duplicate set iterator is a wrapper + * @param wrappedIterator Iterator of DuplicatesSets to use and break-up by UMI. * @param maxEditDistanceToJoin The edit distance between UMIs that will be used to union UMIs into groups * @param umiTag The tag used in the bam file that designates the UMI * @param assignedUmiTag The tag in the bam file that designates the assigned UMI @@ -88,10 +88,7 @@ public void close() { isOpen = false; wrappedIterator.close(); - - if (metrics.UMI_LENGTH > 0) { - metrics.calculateDerivedFields(); - } + metrics.calculateDerivedFields(); } @Override @@ -144,12 +141,13 @@ private void process(final DuplicateSet set) { String currentUmi = rec.getStringAttribute(umiTag); if (currentUmi != null) { - // All UMIs should be the same length + // All UMIs should be the same length, the code presently does not support variable length UMIs + // TODO: Add support for variable length UMIs if (!haveWeSeenFirstRead) { - metrics.UMI_LENGTH = currentUmi.length(); + metrics.MEAN_UMI_LENGTH = currentUmi.length(); haveWeSeenFirstRead = true; } else { - if (metrics.UMI_LENGTH != currentUmi.length()) { + if (metrics.MEAN_UMI_LENGTH != currentUmi.length()) { throw new PicardException("UMIs of differing lengths were found."); } } diff --git a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java index 384fdfcbf..bdaeec724 100644 --- a/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java +++ b/src/main/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigar.java @@ -93,7 +93,7 @@ protected int doWork() { IOUtil.assertFileIsWritable(UMI_METRICS_FILE); // Perform Mark Duplicates work - int retval=super.doWork(); + int retval = super.doWork(); // Write metrics specific to UMIs MetricsFile metricsFile = getMetricsFile(); diff --git a/src/main/java/picard/sam/markduplicates/UmiMetrics.java b/src/main/java/picard/sam/markduplicates/UmiMetrics.java index dbc5ec695..8c43fb8c1 100644 --- a/src/main/java/picard/sam/markduplicates/UmiMetrics.java +++ b/src/main/java/picard/sam/markduplicates/UmiMetrics.java @@ -40,7 +40,7 @@ private long observedUmiBases = 0; /** Number of bases in each UMI */ - public int UMI_LENGTH; + public double MEAN_UMI_LENGTH = 0.0; /** Number of different UMI sequences observed */ public long OBSERVED_UNIQUE_UMIS = 0; @@ -73,15 +73,15 @@ public double INFERRED_UMI_ENTROPY = 0; /** Estimation of Phred scaled quality scores for UMIs */ - public double UMI_BASE_QUALITIES; + public double UMI_BASE_QUALITIES = 0.0; public UmiMetrics() {} - public UmiMetrics(final int length, final int observedUniqueUmis, final int inferredUniqueUmis, + public UmiMetrics(final double length, final int observedUniqueUmis, final int inferredUniqueUmis, final int observedBaseErrors, final int duplicateSetsWithoutUmi, final int duplicateSetsWithUmi, final double effectiveLengthOfInferredUmis, final double effectiveLengthOfObservedUmis, final double estimatedBaseQualityOfUmis) { - UMI_LENGTH = length; + MEAN_UMI_LENGTH = length; OBSERVED_UNIQUE_UMIS = observedUniqueUmis; INFERRED_UNIQUE_UMIS = inferredUniqueUmis; OBSERVED_BASE_ERRORS = observedBaseErrors; diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java index 506e09b63..eda1476d5 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTest.java @@ -185,7 +185,7 @@ public void testBadUmis(List umis, List assignedUmi, final List< Arrays.asList(new String[]{"AAAA", "AAAA", "ATTA", "AAAA", "AAAA"}), // Expected inferred UMI Arrays.asList(new Boolean[]{false, true, false, true, true}), // Should it be marked as duplicate? 1, // Edit Distance to Join - new UmiMetrics(4, // UMI_LENGTH + new UmiMetrics(4.0, // MEAN_UMI_LENGTH 3, // OBSERVED_UNIQUE_UMIS 2, // INFERRED_UNIQUE_UMIS 2, // OBSERVED_BASE_ERRORS (Note: This is 2 rather than 1 because we are using paired end reads) @@ -200,7 +200,7 @@ public void testBadUmis(List umis, List assignedUmi, final List< Arrays.asList(new String[]{"AAAA", "AAAA", "AAAA", "AAAA", "AAAA"}), Arrays.asList(new Boolean[]{false, true, true, true, true}), 2, - new UmiMetrics(4, // UMI_LENGTH + new UmiMetrics(4.0, // MEAN_UMI_LENGTH 3, // OBSERVED_UNIQUE_UMIS 1, // INFERRED_UNIQUE_UMIS 6, // OBSERVED_BASE_ERRORS @@ -215,12 +215,12 @@ public void testBadUmis(List umis, List assignedUmi, final List< Arrays.asList(new String[]{"AA", "AT", "AC", "AG", "TA", "TT", "TC", "TG", "CA", "CT", "CC", "CG", "GA", "GT", "GC", "GG"}), Arrays.asList(new Boolean[]{false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}), 0, - new UmiMetrics(2, // UMI_LENGTH + new UmiMetrics(2.0, // MEAN_UMI_LENGTH 16, // OBSERVED_UNIQUE_UMIS 16, // INFERRED_UNIQUE_UMIS 0, // OBSERVED_BASE_ERRORS 2, // DUPLICATE_SETS_WITHOUT_UMI - 16, // DUPLICATE_SETS_WITH_UMI + 32, // DUPLICATE_SETS_WITH_UMI 2.0, // EFFECTIVE_LENGTH_OF_INFERRED_UMIS 2, // EFECTIVE_LENGTH_OF_OBSERVED_UMIS -1) // ESTIMATED_BASE_QUALITY_OF_UMIS diff --git a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java index a15b6b032..a34e5bd0a 100644 --- a/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java +++ b/src/test/java/picard/sam/markduplicates/UmiAwareMarkDuplicatesWithMateCigarTester.java @@ -182,11 +182,12 @@ public void test() { double tolerance = 1e-6; Assert.assertEquals(metricsOutput.getMetrics().size(), 1); final UmiMetrics observedMetrics = metricsOutput.getMetrics().get(0); - Assert.assertEquals(observedMetrics.UMI_LENGTH, expectedMetrics.UMI_LENGTH, "UMI_LENGTH does not match expected"); + Assert.assertEquals(observedMetrics.MEAN_UMI_LENGTH, expectedMetrics.MEAN_UMI_LENGTH, "UMI_LENGTH does not match expected"); Assert.assertEquals(observedMetrics.OBSERVED_UNIQUE_UMIS, expectedMetrics.OBSERVED_UNIQUE_UMIS, "OBSERVED_UNIQUE_UMIS does not match expected"); Assert.assertEquals(observedMetrics.INFERRED_UNIQUE_UMIS, expectedMetrics.INFERRED_UNIQUE_UMIS, "INFERRED_UNIQUE_UMIS does not match expected"); Assert.assertEquals(observedMetrics.OBSERVED_BASE_ERRORS, expectedMetrics.OBSERVED_BASE_ERRORS, "OBSERVED_BASE_ERRORS does not match expected"); - Assert.assertEquals(observedMetrics.DUPLICATE_SETS_IGNORING_UMI, expectedMetrics.DUPLICATE_SETS_IGNORING_UMI, "DUPLICATE_SETS_WITHOUT_UMI does not match expected"); + Assert.assertEquals(observedMetrics.DUPLICATE_SETS_IGNORING_UMI, expectedMetrics.DUPLICATE_SETS_IGNORING_UMI, "DUPLICATE_SETS_IGNORING_UMI does not match expected"); + Assert.assertEquals(observedMetrics.DUPLICATE_SETS_WITH_UMI, expectedMetrics.DUPLICATE_SETS_WITH_UMI, "DUPLICATE_SETS_WITH_UMI does not match expected"); Assert.assertEquals(observedMetrics.INFERRED_UMI_ENTROPY, expectedMetrics.INFERRED_UMI_ENTROPY, tolerance, "INFERRED_UMI_ENTROPY does not match expected"); Assert.assertEquals(observedMetrics.OBSERVED_UMI_ENTROPY, expectedMetrics.OBSERVED_UMI_ENTROPY, tolerance, "OBSERVED_UMI_ENTROPY does not match expected"); Assert.assertEquals(observedMetrics.UMI_BASE_QUALITIES, expectedMetrics.UMI_BASE_QUALITIES, tolerance, "UMI_BASE_QUALITIES does not match expected");