diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java index a04b17c7c..0e0eed58c 100644 --- a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java @@ -686,6 +686,8 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i if (nextChunk.size() > 1) { markDuplicatePairs(nextChunk); if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk); + } else if (nextChunk.size() ==1) { + AbstractMarkDuplicatesCommandLineProgram.addSingletonToCount(libraryIdGenerator); } nextChunk.clear(); nextChunk.add(next); @@ -695,6 +697,8 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i if (nextChunk.size() > 1) { markDuplicatePairs(nextChunk); if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk); + } else if (nextChunk.size() ==1) { + AbstractMarkDuplicatesCommandLineProgram.addSingletonToCount(libraryIdGenerator); } this.pairSort.cleanup(); this.pairSort = null; diff --git a/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java b/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java index 775a771d8..d1bf3265f 100644 --- a/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java +++ b/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java @@ -156,6 +156,10 @@ protected void finalizeAndWriteMetrics(final LibraryIdGenerator libraryIdGenerator) { final Map metricsByLibrary = libraryIdGenerator.getMetricsByLibraryMap(); final Histogram opticalDuplicatesByLibraryId = libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap(); + final Histogram duplicateCountHist = libraryIdGenerator.getDuplicateCountHist(); + final Histogram opticalDuplicateCountHist = libraryIdGenerator.getOpticalDuplicateCountHist(); + final Histogram nonOpticalDuplicateCountHist = libraryIdGenerator.getNonOpticalDuplicateCountHist(); + final Map libraryIds = libraryIdGenerator.getLibraryIdsMap(); // Write out the metrics @@ -183,6 +187,11 @@ protected void finalizeAndWriteMetrics(final LibraryIdGenerator libraryIdGenerat file.setHistogram(metricsByLibrary.values().iterator().next().calculateRoiHistogram()); } + // Add set size histograms - the set size counts are printed on adjacent columns to the ROI metric. + file.addHistogram(duplicateCountHist); + file.addHistogram(nonOpticalDuplicateCountHist); + file.addHistogram(opticalDuplicateCountHist); + file.write(METRICS_FILE); } @@ -259,6 +268,8 @@ public static void trackOpticalDuplicates(List ends, // Variables used for optical duplicate detection and tracking final List trackOpticalDuplicatesF = new ArrayList<>(); final List trackOpticalDuplicatesR = new ArrayList<>(); + final int nOpoticalDupF; + final int nOpoticalDupR; // Split into two lists: first of pairs and second of pairs, since they must have orientation and same starting end for (final ReadEnds end : ends) { @@ -272,13 +283,34 @@ public static void trackOpticalDuplicates(List ends, } // track the duplicates - trackOpticalDuplicates(trackOpticalDuplicatesF, keeper, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap()); - trackOpticalDuplicates(trackOpticalDuplicatesR, keeper, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap()); + nOpoticalDupF = trackOpticalDuplicates(trackOpticalDuplicatesF, + keeper, + opticalDuplicateFinder, + libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap()); + nOpoticalDupR = trackOpticalDuplicates(trackOpticalDuplicatesR, + keeper, + opticalDuplicateFinder, + libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap()); + trackDuplicateCounts(ends.size(), + nOpoticalDupF+nOpoticalDupR, + libraryIdGenerator.getDuplicateCountHist(), + libraryIdGenerator.getNonOpticalDuplicateCountHist(), + libraryIdGenerator.getOpticalDuplicateCountHist()); } else { // No need to partition - AbstractMarkDuplicatesCommandLineProgram.trackOpticalDuplicates(ends, keeper, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap()); + final int nOpticalDup = trackOpticalDuplicates(ends, keeper, opticalDuplicateFinder, libraryIdGenerator.getOpticalDuplicatesByLibraryIdMap()); + trackDuplicateCounts(ends.size(), + nOpticalDup, + libraryIdGenerator.getDuplicateCountHist(), + libraryIdGenerator.getNonOpticalDuplicateCountHist(), + libraryIdGenerator.getOpticalDuplicateCountHist()); } } + public static void addSingletonToCount(final LibraryIdGenerator libraryIdGenerator){ + addSingletonToCount(libraryIdGenerator.getDuplicateCountHist(), + libraryIdGenerator.getNonOpticalDuplicateCountHist()); + } + /** * Looks through the set of reads and identifies how many of the duplicates are * in fact optical duplicates, and stores the data in the instance level histogram. @@ -289,7 +321,7 @@ public static void trackOpticalDuplicates(List ends, * optical duplicate detection, we do not consider them duplicates if one read as FR and the other RF when we order orientation by the * first mate sequenced (read #1 of the pair). */ - private static void trackOpticalDuplicates(final List list, + private static int trackOpticalDuplicates(final List list, final ReadEnds keeper, final OpticalDuplicateFinder opticalDuplicateFinder, final Histogram opticalDuplicatesByLibraryId) { @@ -306,5 +338,27 @@ private static void trackOpticalDuplicates(final List list, if (opticalDuplicates > 0) { opticalDuplicatesByLibraryId.increment(list.get(0).getLibraryId(), opticalDuplicates); } + return opticalDuplicates; } + + private static void trackDuplicateCounts(final int listSize, + final int optDupCnt, + final Histogram duplicatesCountHist, + final Histogram nonOpticalDuplicatesCountHist, + final Histogram opticalDuplicatesCountHist) { + duplicatesCountHist.increment(new Double(listSize)); + if ( (listSize - optDupCnt) > 0) { + nonOpticalDuplicatesCountHist.increment(new Double(listSize - optDupCnt)); + } + if (optDupCnt > 0) { + opticalDuplicatesCountHist.increment(new Double(optDupCnt+1)); + } + } + + private static void addSingletonToCount(final Histogram duplicatesCountHist, + final Histogram nonOpticalDuplicatesCountHist) { + duplicatesCountHist.increment((double) 1); + nonOpticalDuplicatesCountHist.increment((double) 1); + } + } diff --git a/src/main/java/picard/sam/markduplicates/util/LibraryIdGenerator.java b/src/main/java/picard/sam/markduplicates/util/LibraryIdGenerator.java index d9f2c592f..852c8fe89 100644 --- a/src/main/java/picard/sam/markduplicates/util/LibraryIdGenerator.java +++ b/src/main/java/picard/sam/markduplicates/util/LibraryIdGenerator.java @@ -48,7 +48,9 @@ private short nextLibraryId = 1; private final Map metricsByLibrary = new HashMap(); private final Histogram opticalDuplicatesByLibraryId = new Histogram(); - + public final Histogram duplicateCountHist = new Histogram("set_size", "all_sets"); + public final Histogram nonOpticalDuplicateCountHist = new Histogram("set_size", "non_optical_sets"); + public final Histogram opticalDuplicateCountHist = new Histogram("set_size", "optical_sets"); public LibraryIdGenerator(final SAMFileHeader header) { this.header = header; @@ -70,7 +72,13 @@ public LibraryIdGenerator(final SAMFileHeader header) { public Histogram getOpticalDuplicatesByLibraryIdMap() { return this.opticalDuplicatesByLibraryId; } - public static String getReadGroupLibraryName(SAMReadGroupRecord readGroup) { + public Histogram getDuplicateCountHist() { return this.duplicateCountHist; } + + public Histogram getNonOpticalDuplicateCountHist() { return this.nonOpticalDuplicateCountHist; } + + public Histogram getOpticalDuplicateCountHist() { return this.opticalDuplicateCountHist; } + + public static String getReadGroupLibraryName(SAMReadGroupRecord readGroup) { return Optional.ofNullable(readGroup.getLibrary()) .orElse(UNKNOWN_LIBRARY); } diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTest.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTest.java new file mode 100644 index 000000000..c9d2b2aeb --- /dev/null +++ b/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTest.java @@ -0,0 +1,105 @@ +/* + * The MIT License + * + * Copyright (c) 2014 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.sam.markduplicates; + +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * This class defines the individual test cases to run. The actual running of the test is done + * by MarkDuplicatesWithMateCigarTester (see getTester). + * @author nhomer@broadinstitute.org + */ +public class MarkDuplicatesSetSizeHistogramTest extends AbstractMarkDuplicatesCommandLineProgramTest { + protected MarkDuplicatesSetSizeHistogramTester getTester() { + return new MarkDuplicatesSetSizeHistogramTester(); + } + + @Test + public void TestSingleSet() { + final MarkDuplicatesSetSizeHistogramTester tester = getTester(); + tester.setExpectedOpticalDuplicate(1); + String representativeReadName = "RUNID:1:1:16020:13352"; + tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.addMatePair("RUNID:1:1:15993:13361", 1, 485253, 485253, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + // set expected counts in hashmap that takes the form: key=(Histogram Label, histogram bin), value=histogram entry + tester.expectedSetSizeMap.put(Arrays.asList("all_sets","2.0"),1.0); + tester.expectedSetSizeMap.put(Arrays.asList("optical_sets","2.0"),1.0); + tester.runTest(); + } + + @Test + public void testOpticalAndNonOpticalSet() { + final MarkDuplicatesSetSizeHistogramTester tester = getTester(); + tester.setExpectedOpticalDuplicate(2); + String representativeReadName = "RUNID:1:1:16020:13352"; + tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.addMatePair("RUNID:1:1:15993:13361", 1, 485253, 485253, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.addMatePair("RUNID:1:1:15994:13364", 1, 485253, 485253, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + // add one non-optical duplicate + tester.addMatePair("RUNID:1:1:25993:23361", 1, 485253, 485253, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedSetSizeMap.put(Arrays.asList("all_sets","4.0"),1.0); + tester.expectedSetSizeMap.put(Arrays.asList("optical_sets","3.0"),1.0); + tester.runTest(); + } + + @Test + public void testSingleton() { + final MarkDuplicatesSetSizeHistogramTester tester = getTester(); + tester.setExpectedOpticalDuplicate(0); + // Add non-duplicate read + tester.addMatePair("RUNID:1:1:15993:13375", 1, 485255, 485255, false, false, false, false, "43M2S", "43M2S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedSetSizeMap.put(Arrays.asList("all_sets","1.0"),1.0); + tester.runTest(); + } + + @Test + public void testMultiRepresentativeReadTags() { + final MarkDuplicatesSetSizeHistogramTester tester = getTester(); + tester.setExpectedOpticalDuplicate(3); + // Duplicate set: size 2 - all optical + String representativeReadName1 = "RUNID:1:1:16020:13352"; + tester.addMatePair(representativeReadName1, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.addMatePair("RUNID:1:1:15993:13361", 1, 485253, 485253, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedSetSizeMap.put(Arrays.asList("all_sets","3.0"),1.0); + tester.expectedSetSizeMap.put(Arrays.asList("optical_sets","3.0"),1.0); + + // Duplicate set: size 3 - all optical + String representativeReadName2 = "RUNID:1:1:15993:13360"; + tester.addMatePair(representativeReadName2, 1, 485299, 485299, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.addMatePair("RUNID:1:1:15993:13365", 1, 485299, 485299, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.addMatePair("RUNID:1:1:15993:13370", 1, 485299, 485299, false, false, true, true, "43M2S", "43M2S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedSetSizeMap.put(Arrays.asList("all_sets","4.0"),1.0); + tester.expectedSetSizeMap.put(Arrays.asList("optical_sets","4.0"),1.0); + + // Add non-duplicate read + tester.addMatePair("RUNID:1:1:15993:13375", 1, 485255, 485255, false, false, false, false, "43M2S", "43M2S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedSetSizeMap.put(Arrays.asList("all_sets","1.0"),1.0); + + tester.runTest(); + } + +} diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTester.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTester.java new file mode 100644 index 000000000..8fa9428de --- /dev/null +++ b/src/test/java/picard/sam/markduplicates/MarkDuplicatesSetSizeHistogramTester.java @@ -0,0 +1,126 @@ +/* + * The MIT License + * + * Copyright (c) 2014 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.sam.markduplicates; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.CloserUtil; +import htsjdk.samtools.util.Histogram; +import htsjdk.samtools.util.TestUtil; +import org.testng.Assert; +import picard.cmdline.CommandLineProgram; +import picard.sam.DuplicationMetrics; + +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.util.*; + +/** + * This class is an extension of AbstractMarkDuplicatesCommandLineProgramTester used to test MarkDuplicatesWithMateCigar with SAM files generated on the fly. + * This performs the underlying tests defined by classes such as see AbstractMarkDuplicatesCommandLineProgramTest and MarkDuplicatesWithMateCigarTest. + */ +public class MarkDuplicatesSetSizeHistogramTester extends AbstractMarkDuplicatesCommandLineProgramTester { + + final public Map, Double> expectedSetSizeMap = new HashMap<>(); // key=(Histogram Label, histogram bin), value=histogram entry + + public MarkDuplicatesSetSizeHistogramTester() {} + + @Override + protected CommandLineProgram getProgram() { return new MarkDuplicates(); } + + @Override + public void test() { + try { + updateExpectedDuplicationMetrics(); + + // Read the output and check the duplicate flag + int outputRecords = 0; + final SamReader reader = SamReaderFactory.makeDefault().open(getOutput()); + System.out.println(getOutput().getAbsolutePath()); + for (final SAMRecord record : reader) { + outputRecords++; + final String key = samRecordToDuplicatesFlagsKey(record); + if (!this.duplicateFlags.containsKey(key)) { + System.err.println("DOES NOT CONTAIN KEY: " + key); + } + Assert.assertTrue(this.duplicateFlags.containsKey(key)); + final boolean value = this.duplicateFlags.get(key); + this.duplicateFlags.remove(key); + if (value != record.getDuplicateReadFlag()) { + System.err.println("Mismatching read:"); + System.err.print(record.getSAMString()); + } + Assert.assertEquals(record.getDuplicateReadFlag(), value); + } + CloserUtil.close(reader); + + // Ensure the program output the same number of records as were read in + Assert.assertEquals(outputRecords, this.getNumberOfRecords(), ("saw " + outputRecords + " output records, vs. " + this.getNumberOfRecords() + " input records")); + + // Check the values written to metrics.txt against our input expectations + final MetricsFile metricsOutput = new MetricsFile(); + try{ + metricsOutput.read(new FileReader(metricsFile)); + } + catch (final FileNotFoundException ex) { + System.err.println("Metrics file not found: " + ex); + } + Assert.assertEquals(metricsOutput.getMetrics().size(), 1); + final DuplicationMetrics observedMetrics = metricsOutput.getMetrics().get(0); + Assert.assertEquals(observedMetrics.UNPAIRED_READS_EXAMINED, expectedMetrics.UNPAIRED_READS_EXAMINED, "UNPAIRED_READS_EXAMINED does not match expected"); + Assert.assertEquals(observedMetrics.READ_PAIRS_EXAMINED, expectedMetrics.READ_PAIRS_EXAMINED, "READ_PAIRS_EXAMINED does not match expected"); + Assert.assertEquals(observedMetrics.UNMAPPED_READS, expectedMetrics.UNMAPPED_READS, "UNMAPPED_READS does not match expected"); + Assert.assertEquals(observedMetrics.UNPAIRED_READ_DUPLICATES, expectedMetrics.UNPAIRED_READ_DUPLICATES, "UNPAIRED_READ_DUPLICATES does not match expected"); + Assert.assertEquals(observedMetrics.READ_PAIR_DUPLICATES, expectedMetrics.READ_PAIR_DUPLICATES, "READ_PAIR_DUPLICATES does not match expected"); + Assert.assertEquals(observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match expected"); + Assert.assertEquals(observedMetrics.PERCENT_DUPLICATION, expectedMetrics.PERCENT_DUPLICATION, "PERCENT_DUPLICATION does not match expected"); + Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedMetrics.ESTIMATED_LIBRARY_SIZE, "ESTIMATED_LIBRARY_SIZE does not match expected"); + Assert.assertEquals(observedMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS, expectedMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS, "SECONDARY_OR_SUPPLEMENTARY_RDS does not match expected"); + + + // Check contents of set size bin against expected values // + for (final Histogram histo : metricsOutput.getAllHistograms()) { + String label = histo.getValueLabel(); + for (Object bin :histo.keySet()) { + String binStr = bin.toString(); + if (expectedSetSizeMap.containsKey(Arrays.asList(label,binStr))) { + Histogram.Bin binValue = histo.get(bin); + double actual = binValue.getValue(); + double expected = expectedSetSizeMap.get(Arrays.asList(label, binStr)); + Assert.assertEquals(actual, expected); + } + + } + + } + + } finally { + TestUtil.recursiveDelete(getOutputDir()); + } + } + +}