diff --git a/settings.gradle b/settings.gradle index adda0d745..bec5c6e7a 100644 --- a/settings.gradle +++ b/settings.gradle @@ -1 +1,3 @@ rootProject.name = "picard" +include 'picard_gradle_v2' + diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java index 0141c4bdc..a04b17c7c 100644 --- a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java @@ -37,16 +37,9 @@ import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.SortingCollection; import htsjdk.samtools.util.SortingLongCollection; -import picard.sam.markduplicates.util.AbstractMarkDuplicatesCommandLineProgram; -import picard.sam.markduplicates.util.DiskBasedReadEndsForMarkDuplicatesMap; -import picard.sam.markduplicates.util.LibraryIdGenerator; -import picard.sam.markduplicates.util.ReadEnds; -import picard.sam.markduplicates.util.ReadEndsForMarkDuplicates; -import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesCodec; -import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesMap; import htsjdk.samtools.DuplicateScoringStrategy.ScoringStrategy; -import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodes; -import picard.sam.markduplicates.util.ReadEndsForMarkDuplicatesWithBarcodesCodec; +import picard.sam.markduplicates.util.*; +import picard.sam.util.RepresentativeReadIndexer; import java.io.*; import java.util.*; @@ -125,6 +118,10 @@ public static final String DUPLICATE_TYPE_LIBRARY = "LB"; /** The duplicate type tag value for duplicate type: sequencing (optical & pad-hopping, or "co-localized"). */ public static final String DUPLICATE_TYPE_SEQUENCING = "SQ"; + /** The attribute in the SAM/BAM file used to store which read was selected as representative out of a duplicate set */ + public static final String DUPLICATE_SET_INDEX_TAG = "DI"; + /** The attribute in the SAM/BAM file used to store the size of a duplicate set */ + public static final String DUPLICATE_SET_SIZE_TAG = "DS"; /** Enum for the possible values that a duplicate read can be tagged with in the DT attribute. */ public enum DuplicateType { @@ -165,6 +162,14 @@ @Option(doc = "Read two barcode SAM tag (ex. BX for 10X Genomics)", optional = true) public String READ_TWO_BARCODE_TAG = null; + @Option(doc = "If a read appears in a duplicate set, add two tags. The first tag, DUPLICATE_SET_SIZE_TAG (DS), " + + "indicates the size of the duplicate set. The smallest possible DS value is 2 which occurs when two " + + "reads map to the same portion of the reference only one of which is marked as duplicate. The second " + + "tag, DUPLICATE_SET_INDEX_TAG (DI), represents a unique identifier for the duplicate set to which the " + + "record belongs. This identifier is the index-in-file of the representative read that was selected out " + + "of the duplicate set.", optional = true) + public boolean TAG_DUPLICATE_SET_MEMBERS = false; + @Option(doc = "If true remove 'optical' duplicates and other duplicates that appear to have arisen from the " + "sequencing process instead of the library preparation process, even if REMOVE_DUPLICATES is false. " + "If REMOVE_DUPLICATES is true, all duplicates are removed and this option is ignored.") @@ -177,6 +182,7 @@ private SortingCollection fragSort; private SortingLongCollection duplicateIndexes; private SortingLongCollection opticalDuplicateIndexes; + private SortingCollection representativeReadIndicesForDuplicates; private int numDuplicateIndices = 0; static private final long NO_SUCH_INDEX = Long.MAX_VALUE; // needs to be large so that that >= test fails for query-sorted traversal @@ -259,6 +265,22 @@ protected int doWork() { long nextOpticalDuplicateIndex = this.opticalDuplicateIndexes != null && this.opticalDuplicateIndexes.hasNext() ? this.opticalDuplicateIndexes.next() : NO_SUCH_INDEX; long nextDuplicateIndex = (this.duplicateIndexes.hasNext() ? this.duplicateIndexes.next() : NO_SUCH_INDEX); + // initialize variables for optional representative read tagging + CloseableIterator representativeReadIterator = null; + RepresentativeReadIndexer rri = null; + int representativeReadIndexInFile = -1; + int duplicateSetSize = -1; + int nextRepresentativeIndex = -1; + if (TAG_DUPLICATE_SET_MEMBERS) { + representativeReadIterator = this.representativeReadIndicesForDuplicates.iterator(); + if (representativeReadIterator.hasNext()) { + rri = representativeReadIterator.next(); + nextRepresentativeIndex = rri.readIndexInFile; + representativeReadIndexInFile = rri.representativeReadIndexInFile; + duplicateSetSize = rri.setSize; + } + } + final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Written"); final CloseableIterator iterator = headerAndIterator.iterator; String duplicateQueryName = null; @@ -343,6 +365,28 @@ protected int doWork() { } } + // Tag any read pair that was in a duplicate set with the duplicate set size and a representative read name + if (TAG_DUPLICATE_SET_MEMBERS) { + final boolean needNextRepresentativeIndex = recordInFileIndex > nextRepresentativeIndex; + if (needNextRepresentativeIndex && representativeReadIterator.hasNext()) { + rri = representativeReadIterator.next(); + nextRepresentativeIndex = rri.readIndexInFile; + representativeReadIndexInFile = rri.representativeReadIndexInFile; + duplicateSetSize = rri.setSize; + } + final boolean isInDuplicateSet = recordInFileIndex == nextRepresentativeIndex || + (sortOrder == SAMFileHeader.SortOrder.queryname && + recordInFileIndex > nextDuplicateIndex); + if (isInDuplicateSet) { + if (!rec.isSecondaryOrSupplementary() && !rec.getReadUnmappedFlag()) { + if (TAG_DUPLICATE_SET_MEMBERS) { + rec.setAttribute(DUPLICATE_SET_INDEX_TAG, representativeReadIndexInFile); + rec.setAttribute(DUPLICATE_SET_SIZE_TAG, duplicateSetSize); + } + } + } + } + // Output the record if desired and bump the record index recordInFileIndex++; if (this.REMOVE_DUPLICATES && rec.getDuplicateReadFlag()) continue; @@ -357,6 +401,9 @@ protected int doWork() { iterator.close(); this.duplicateIndexes.cleanup(); + if (TAG_DUPLICATE_SET_MEMBERS) { + this.representativeReadIndicesForDuplicates.cleanup(); + } reportMemoryStats("Before output close"); out.close(); @@ -508,8 +555,8 @@ private void buildSortedReadEndLists(final boolean useBarcodes) { if (pairedEnds.read2ReferenceIndex == pairedEnds.read1ReferenceIndex && pairedEnds.read2Coordinate == pairedEnds.read1Coordinate && pairedEnds.orientation == ReadEnds.RF) { - pairedEnds.orientation = ReadEnds.FR; - } + pairedEnds.orientation = ReadEnds.FR; + } } else { pairedEnds.read2ReferenceIndex = pairedEnds.read1ReferenceIndex; pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; @@ -600,18 +647,35 @@ private ReadEndsForMarkDuplicates buildReadEnds(final SAMFileHeader header, fina * @return an array with an ordered list of indexes into the source file */ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean indexOpticalDuplicates) { + int entryOverhead; + if (TAG_DUPLICATE_SET_MEMBERS) { + // Memory requirements for RepresentativeReadIndexer: + // three int entries + overhead: (3 * 4) + 4 = 16 bytes + entryOverhead = 16; + } + else { + entryOverhead = SortingLongCollection.SIZEOF; + } // Keep this number from getting too large even if there is a huge heap. - int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / SortingLongCollection.SIZEOF, (double) (Integer.MAX_VALUE - 5)); - // If we're also tracking optical duplicates, cut maxInMemory in half, since we'll need two sorting collections + int maxInMemory = (int) Math.min((Runtime.getRuntime().maxMemory() * 0.25) / entryOverhead, (double) (Integer.MAX_VALUE - 5)); + // If we're also tracking optical duplicates, reduce maxInMemory, since we'll need two sorting collections if (indexOpticalDuplicates) { - maxInMemory /= 2; + maxInMemory /= ((entryOverhead + SortingLongCollection.SIZEOF) / entryOverhead); this.opticalDuplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()])); } log.info("Will retain up to " + maxInMemory + " duplicate indices before spilling to disk."); this.duplicateIndexes = new SortingLongCollection(maxInMemory, TMP_DIR.toArray(new File[TMP_DIR.size()])); + if (TAG_DUPLICATE_SET_MEMBERS) { + final RepresentativeReadIndexerCodec representativeIndexCodec = new RepresentativeReadIndexerCodec(); + this.representativeReadIndicesForDuplicates = SortingCollection.newInstance(RepresentativeReadIndexer.class, + representativeIndexCodec, + new RepresentativeReadComparator(), + maxInMemory, + TMP_DIR); + } ReadEndsForMarkDuplicates firstOfNextChunk = null; - final List nextChunk = new ArrayList(200); + final List nextChunk = new ArrayList(200); // First just do the pairs log.info("Traversing read pair information and detecting duplicates."); @@ -621,13 +685,17 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i } else { if (nextChunk.size() > 1) { markDuplicatePairs(nextChunk); + if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk); } nextChunk.clear(); nextChunk.add(next); firstOfNextChunk = next; } } - if (nextChunk.size() > 1) markDuplicatePairs(nextChunk); + if (nextChunk.size() > 1) { + markDuplicatePairs(nextChunk); + if (TAG_DUPLICATE_SET_MEMBERS) addRepresentativeReadIndex(nextChunk); + } this.pairSort.cleanup(); this.pairSort = null; @@ -661,6 +729,7 @@ private void generateDuplicateIndexes(final boolean useBarcodes, final boolean i log.info("Sorting list of duplicate records."); this.duplicateIndexes.doneAddingStartIteration(); if (this.opticalDuplicateIndexes != null) this.opticalDuplicateIndexes.doneAddingStartIteration(); + if (TAG_DUPLICATE_SET_MEMBERS) this.representativeReadIndicesForDuplicates.doneAdding(); } private boolean areComparableForDuplicates(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDuplicates rhs, final boolean compareRead2, final boolean useBarcodes) { @@ -693,6 +762,42 @@ private void addIndexAsDuplicate(final long bamIndex) { ++this.numDuplicateIndices; } + private void addRepresentativeReadOfDuplicateSet(final long representativeReadIndexInFile, final int setSize, final long read1IndexInFile) { + final RepresentativeReadIndexer rri = new RepresentativeReadIndexer(); + rri.representativeReadIndexInFile = (int) representativeReadIndexInFile; + rri.setSize = setSize; + rri.readIndexInFile = (int) read1IndexInFile; + this.representativeReadIndicesForDuplicates.add(rri); + } + + /** + * Takes a list of ReadEndsForMarkDuplicates objects and identify the representative read based on + * quality score. For all members of the duplicate set, add the read1 index-in-file of the representative + * read to the records of the first and second in a pair. This value becomes is used for + * the 'DI' tag. + * + * @param list + */ + private void addRepresentativeReadIndex(final List list) { + short maxScore = 0; + ReadEndsForMarkDuplicates best = null; + + /** All read ends should have orientation FF, FR, RF, or RR **/ + for (final ReadEndsForMarkDuplicates end : list) { + if (end.score > maxScore || best == null) { + maxScore = end.score; + best = end; + } + } + + // for read name (for representative read name), add the last of the pair that was examined + for (final ReadEndsForMarkDuplicates end : list) { + addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read1IndexInFile); + addRepresentativeReadOfDuplicateSet(best.read1IndexInFile, list.size(), end.read2IndexInFile); + } + } + + /** * Takes a list of ReadEndsForMarkDuplicates objects and removes from it all objects that should * not be marked as duplicates. This assumes that the list contains objects representing pairs. @@ -795,4 +900,17 @@ public int compare(final ReadEndsForMarkDuplicates lhs, final ReadEndsForMarkDup return compareDifference; } } + + // order representative read entries based on the record index + static class RepresentativeReadComparator implements Comparator { + + public RepresentativeReadComparator() {} + + public int compare(final RepresentativeReadIndexer lhs, final RepresentativeReadIndexer rhs) { + int compareDifference = lhs.readIndexInFile - rhs.readIndexInFile; + return compareDifference; + } + } + + } diff --git a/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicates.java b/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicates.java index da503855e..b677d377f 100644 --- a/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicates.java +++ b/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicates.java @@ -34,10 +34,10 @@ What do we need to store you ask? Well, we need to store: - byte: orientation - short: libraryId, readGroup, tile, x, y, score - - int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate + - int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate, duplicateSetSize - long: read1IndexInFile, read2IndexInFile */ - protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (4 * 4) + (8 * 2) + 1 + protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (5 * 4) + (8 * 2) + 1 + 8 + // last 8 == reference overhead 13; // This is determined experimentally with JProfiler @@ -48,6 +48,7 @@ public static int getSizeOf() { public short score = 0; public long read1IndexInFile = -1; public long read2IndexInFile = -1; + public int duplicateSetSize = -1; public ReadEndsForMarkDuplicates() {} @@ -76,4 +77,5 @@ public ReadEndsForMarkDuplicates(final ReadEndsForMarkDuplicates read) { public ReadEndsForMarkDuplicates clone() { return new ReadEndsForMarkDuplicates(this); } + } \ No newline at end of file diff --git a/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java b/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java index 790f042fd..26289f29f 100644 --- a/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java +++ b/src/main/java/picard/sam/markduplicates/util/ReadEndsForMarkDuplicatesCodec.java @@ -69,6 +69,7 @@ public void encode(final ReadEndsForMarkDuplicates read) { this.out.writeShort((short)read.x); this.out.writeShort((short)read.y); this.out.writeByte(read.orientationForOpticalDuplicates); + this.out.writeInt(read.duplicateSetSize); } catch (final IOException ioe) { throw new PicardException("Exception writing ReadEnds to file.", ioe); } @@ -102,6 +103,7 @@ public ReadEndsForMarkDuplicates decode() { read.y = this.in.readShort(); read.orientationForOpticalDuplicates = this.in.readByte(); + read.duplicateSetSize = this.in.readInt(); return read; } catch (final IOException ioe) { diff --git a/src/main/java/picard/sam/markduplicates/util/RepresentativeReadIndexerCodec.java b/src/main/java/picard/sam/markduplicates/util/RepresentativeReadIndexerCodec.java new file mode 100644 index 000000000..224a5c870 --- /dev/null +++ b/src/main/java/picard/sam/markduplicates/util/RepresentativeReadIndexerCodec.java @@ -0,0 +1,79 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package picard.sam.markduplicates.util; + +import htsjdk.samtools.util.SortingCollection; +import picard.PicardException; +import picard.sam.util.RepresentativeReadIndexer; + +import java.io.*; + +/** Codec for read names and integers that outputs the primitive fields and reads them back. */ +public class RepresentativeReadIndexerCodec implements SortingCollection.Codec { + protected DataInputStream in; + protected DataOutputStream out; + + public SortingCollection.Codec clone() { + return new RepresentativeReadIndexerCodec(); + } + + public void setOutputStream(final OutputStream os) { this.out = new DataOutputStream(os); } + + public void setInputStream(final InputStream is) { this.in = new DataInputStream(is); } + + public DataInputStream getInputStream() { + return in; + } + + public DataOutputStream getOutputStream() { + return out; + } + + public void encode(final RepresentativeReadIndexer rni) { + try { + this.out.writeInt(rni.readIndexInFile); + this.out.writeInt(rni.setSize); + this.out.writeInt(rni.representativeReadIndexInFile); + } catch (final IOException ioe) { + throw new PicardException("Exception writing ReadEnds to file.", ioe); + } + } + + public RepresentativeReadIndexer decode() { + final RepresentativeReadIndexer rni = new RepresentativeReadIndexer(); + try { + // If the first read results in an EOF we've exhausted the stream + try { + rni.readIndexInFile = this.in.readInt(); + } catch (final EOFException eof) { + return null; + } + rni.setSize = this.in.readInt(); + rni.representativeReadIndexInFile = this.in.readInt(); + return rni; + } catch (final IOException ioe) { + throw new PicardException("Exception writing ReadEnds to file.", ioe); + } + } +} diff --git a/src/main/java/picard/sam/util/RepresentativeReadIndexer.java b/src/main/java/picard/sam/util/RepresentativeReadIndexer.java new file mode 100644 index 000000000..440a7c0f4 --- /dev/null +++ b/src/main/java/picard/sam/util/RepresentativeReadIndexer.java @@ -0,0 +1,11 @@ +package picard.sam.util; + +/** + * Little struct-like class to hold a record index, the index of the corresponding representative read, and duplicate set size information. + */ + +public class RepresentativeReadIndexer { + public int readIndexInFile = -1; + public int setSize = -1; + public int representativeReadIndexInFile = -1; +} diff --git a/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java b/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java index 192ab001d..8da6e1995 100644 --- a/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java +++ b/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java @@ -598,5 +598,4 @@ public void testTwoMappedPairsWithSupplementaryReadsAfterCanonical(final Boolean tester.addMappedFragment(fragLikeFirst ? "RUNID:1:1:15993:13361" : "RUNID:1:1:16020:13352", 1, 400, markSecondaryAndSupplementaryRecordsLikeTheCanonical() && !fragLikeFirst, null, null, additionalFragSecondary, additionalFragSupplementary, DEFAULT_BASE_QUALITY); tester.runTest(); } - } diff --git a/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java b/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java index c5d99e6ba..6453af57e 100644 --- a/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java +++ b/src/test/java/picard/sam/markduplicates/AbstractMarkDuplicatesCommandLineProgramTester.java @@ -50,7 +50,7 @@ */ abstract public class AbstractMarkDuplicatesCommandLineProgramTester extends SamFileTester { - final private File metricsFile; + final File metricsFile; final DuplicationMetrics expectedMetrics; public AbstractMarkDuplicatesCommandLineProgramTester(final ScoringStrategy duplicateScoringStrategy, SAMFileHeader.SortOrder sortOrder) { @@ -82,7 +82,7 @@ public AbstractMarkDuplicatesCommandLineProgramTester() { /** * Fill in expected duplication metrics directly from the input records given to this tester */ - private void updateExpectedDuplicationMetrics() { + public void updateExpectedDuplicationMetrics() { final FormatUtil formatter = new FormatUtil(); diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesTagRepresentativeReadIndexTest.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesTagRepresentativeReadIndexTest.java new file mode 100644 index 000000000..846800f8a --- /dev/null +++ b/src/test/java/picard/sam/markduplicates/MarkDuplicatesTagRepresentativeReadIndexTest.java @@ -0,0 +1,104 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.sam.markduplicates; + +import org.testng.annotations.Test; + +/** + * This class defines the individual test cases to run. The actual running of the test is done + * by MarkDuplicatesWithMateCigarTester (see getTester). + * @author hogstrom@broadinstitute.org + */ +public class MarkDuplicatesTagRepresentativeReadIndexTest extends AbstractMarkDuplicatesCommandLineProgramTest { + protected MarkDuplicatesTagRepresentativeReadIndexTester getTester() { + return new MarkDuplicatesTagRepresentativeReadIndexTester(); + } + + // This tests the creation of a single duplicate pair. The expected size of the duplicate set is 2 + // and the expected representative read is the first read defined in the text. The test fails if + // the 'DI' and 'DS' tags do not match the expectation. + @Test + public void testRepresentativeReadTag() { + final MarkDuplicatesTagRepresentativeReadIndexTester tester = getTester(); + tester.testRepresentativeReads = true; + tester.setExpectedOpticalDuplicate(1); + String representativeReadName = "RUNID:1:1:16020:13352"; + Integer representativeReadIndexInFileForward = 1; + tester.addMatePair(representativeReadName, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedRepresentativeIndexMap.put(representativeReadIndexInFileForward, representativeReadIndexInFileForward); + tester.expectedRepresentativeIndexMap.put(3, representativeReadIndexInFileForward); + tester.expectedSetSizeMap.put(representativeReadName,2); + tester.addMatePair("RUNID:1:1:15993:13361", 1, 485253, 485253, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedRepresentativeIndexMap.put(0, representativeReadIndexInFileForward); + tester.expectedRepresentativeIndexMap.put(2, representativeReadIndexInFileForward); + tester.expectedSetSizeMap.put("RUNID:1:1:15993:13361",2); + tester.runTest(); + } + + // This tests the creation of a two duplicate sets of size 2 and 3 respectively. The test fails if + // the 'DI' and 'DS' tags do not match the expectation. + @Test + public void testMultiRepresentativeReadTags() { + final MarkDuplicatesTagRepresentativeReadIndexTester tester = getTester(); + tester.testRepresentativeReads = true; + tester.setExpectedOpticalDuplicate(3); + // Duplicate set: size 2 - all optical + String representativeReadName1 = "RUNID:1:1:16020:13352"; + Integer representativeReadIndexInFileForward = 1; + tester.addMatePair(representativeReadName1, 1, 485253, 485253, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedRepresentativeIndexMap.put(representativeReadIndexInFileForward, representativeReadIndexInFileForward); + tester.expectedRepresentativeIndexMap.put(3, representativeReadIndexInFileForward); + tester.expectedSetSizeMap.put(representativeReadName1,2); + tester.addMatePair("RUNID:1:1:15993:13361", 1, 485253, 485253, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedRepresentativeIndexMap.put(0, representativeReadIndexInFileForward); + tester.expectedRepresentativeIndexMap.put(2, representativeReadIndexInFileForward); + tester.expectedSetSizeMap.put("RUNID:1:1:15993:13361",2); + + // Duplicate set: size 3 - all optical + String representativeReadName2 = "RUNID:1:1:15993:13360"; + Integer representativeReadIndexInFileForward2 = 6; + tester.addMatePair(representativeReadName2, 1, 485299, 485299, false, false, false, false, "45M", "45M", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedRepresentativeIndexMap.put(representativeReadIndexInFileForward2, representativeReadIndexInFileForward2); + tester.expectedRepresentativeIndexMap.put(9, representativeReadIndexInFileForward2); + tester.expectedSetSizeMap.put(representativeReadName2,3); + tester.addMatePair("RUNID:1:1:15993:13365", 1, 485299, 485299, false, false, true, true, "44M1S", "44M1S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedRepresentativeIndexMap.put(7, representativeReadIndexInFileForward2); + tester.expectedRepresentativeIndexMap.put(10, representativeReadIndexInFileForward2); + tester.expectedSetSizeMap.put("RUNID:1:1:15993:13365",3); + tester.addMatePair("RUNID:1:1:15993:13370", 1, 485299, 485299, false, false, true, true, "43M2S", "43M2S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedRepresentativeIndexMap.put(8, representativeReadIndexInFileForward2); + tester.expectedRepresentativeIndexMap.put(11, representativeReadIndexInFileForward2); + tester.expectedSetSizeMap.put("RUNID:1:1:15993:13370",3); + + // Add non-duplicate read + tester.addMatePair("RUNID:1:1:15993:13375", 1, 485255, 485255, false, false, false, false, "43M2S", "43M2S", false, true, false, false, false, DEFAULT_BASE_QUALITY); + tester.expectedRepresentativeIndexMap.put(4, null); + tester.expectedRepresentativeIndexMap.put(5, null); + tester.expectedSetSizeMap.put("RUNID:1:1:15993:13375",null); + + tester.runTest(); + } + +} diff --git a/src/test/java/picard/sam/markduplicates/MarkDuplicatesTagRepresentativeReadIndexTester.java b/src/test/java/picard/sam/markduplicates/MarkDuplicatesTagRepresentativeReadIndexTester.java new file mode 100644 index 000000000..81903529a --- /dev/null +++ b/src/test/java/picard/sam/markduplicates/MarkDuplicatesTagRepresentativeReadIndexTester.java @@ -0,0 +1,122 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.sam.markduplicates; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SamReader; +import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.util.CloserUtil; +import htsjdk.samtools.util.TestUtil; +import org.testng.Assert; +import picard.PicardException; +import picard.cmdline.CommandLineProgram; +import picard.sam.DuplicationMetrics; + +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * This class is an extension of AbstractMarkDuplicatesCommandLineProgramTester used to test MarkDuplicatesWithMateCigar with SAM files generated on the fly. + * This performs the underlying tests defined by classes such as see AbstractMarkDuplicatesCommandLineProgramTest and MarkDuplicatesWithMateCigarTest. + * @author hogstrom@broadinstitute.org + */ +public class MarkDuplicatesTagRepresentativeReadIndexTester extends AbstractMarkDuplicatesCommandLineProgramTester { + + final public Map expectedRepresentativeIndexMap = new HashMap<>(); + final public Map expectedSetSizeMap = new HashMap<>(); + public boolean testRepresentativeReads = false; + public MarkDuplicatesTagRepresentativeReadIndexTester() { + + addArg("TAGGING_POLICY=All"); + addArg("TAG_DUPLICATE_SET_MEMBERS=true"); + } + + @Override + protected CommandLineProgram getProgram() { return new MarkDuplicates(); } + + @Override + public void test() { + try { + updateExpectedDuplicationMetrics(); + // Read the output and check the duplicate flag + int outputRecords = 0; + int indexInFile = 0; + final SamReader reader = SamReaderFactory.makeDefault().open(getOutput()); + for (final SAMRecord record : reader) { + outputRecords++; + + final String key = samRecordToDuplicatesFlagsKey(record); + Assert.assertTrue(this.duplicateFlags.containsKey(key),"DOES NOT CONTAIN KEY: " + key); + final boolean value = this.duplicateFlags.get(key); + this.duplicateFlags.remove(key); + if (value != record.getDuplicateReadFlag()) { + System.err.println("Mismatching read:"); + System.err.print(record.getSAMString()); + } + Assert.assertEquals(record.getDuplicateReadFlag(), value); + if (testRepresentativeReads) { + if (expectedRepresentativeIndexMap.containsKey(indexInFile) && expectedSetSizeMap.containsKey(record.getReadName())){ + Assert.assertEquals(record.getAttribute("DI"), expectedRepresentativeIndexMap.get(indexInFile)); + Assert.assertEquals(record.getAttribute("DS"), expectedSetSizeMap.get(record.getReadName())); + } + } + indexInFile+=1; + } + CloserUtil.close(reader); + + // Ensure the program output the same number of records as were read in + Assert.assertEquals(outputRecords, this.getNumberOfRecords(), ("saw " + outputRecords + " output records, vs. " + this.getNumberOfRecords() + " input records")); + + // Check the values written to metrics.txt against our input expectations + final MetricsFile> metricsOutput = new MetricsFile<>(); + try{ + metricsOutput.read(new FileReader(metricsFile)); + } + catch (final FileNotFoundException ex) { + throw new PicardException("Metrics file not found: " + ex); + } + final List g = metricsOutput.getMetrics(); + // expect getMetrics to return a collection with a single duplicateMetrics object + Assert.assertEquals(metricsOutput.getMetrics().size(), 1); + final DuplicationMetrics observedMetrics = metricsOutput.getMetrics().get(0); + Assert.assertEquals(observedMetrics.UNPAIRED_READS_EXAMINED, expectedMetrics.UNPAIRED_READS_EXAMINED, "UNPAIRED_READS_EXAMINED does not match expected"); + Assert.assertEquals(observedMetrics.READ_PAIRS_EXAMINED, expectedMetrics.READ_PAIRS_EXAMINED, "READ_PAIRS_EXAMINED does not match expected"); + Assert.assertEquals(observedMetrics.UNMAPPED_READS, expectedMetrics.UNMAPPED_READS, "UNMAPPED_READS does not match expected"); + Assert.assertEquals(observedMetrics.UNPAIRED_READ_DUPLICATES, expectedMetrics.UNPAIRED_READ_DUPLICATES, "UNPAIRED_READ_DUPLICATES does not match expected"); + Assert.assertEquals(observedMetrics.READ_PAIR_DUPLICATES, expectedMetrics.READ_PAIR_DUPLICATES, "READ_PAIR_DUPLICATES does not match expected"); + Assert.assertEquals(observedMetrics.READ_PAIR_OPTICAL_DUPLICATES, expectedMetrics.READ_PAIR_OPTICAL_DUPLICATES, "READ_PAIR_OPTICAL_DUPLICATES does not match expected"); + Assert.assertEquals(observedMetrics.PERCENT_DUPLICATION, expectedMetrics.PERCENT_DUPLICATION, "PERCENT_DUPLICATION does not match expected"); + Assert.assertEquals(observedMetrics.ESTIMATED_LIBRARY_SIZE, expectedMetrics.ESTIMATED_LIBRARY_SIZE, "ESTIMATED_LIBRARY_SIZE does not match expected"); + Assert.assertEquals(observedMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS, expectedMetrics.SECONDARY_OR_SUPPLEMENTARY_RDS, "SECONDARY_OR_SUPPLEMENTARY_RDS does not match expected"); + } finally { + TestUtil.recursiveDelete(getOutputDir()); + } + } + +}