diff --git a/src/main/java/picard/sam/AbstractAlignmentMerger.java b/src/main/java/picard/sam/AbstractAlignmentMerger.java index 484a9b568..51cbb0477 100644 --- a/src/main/java/picard/sam/AbstractAlignmentMerger.java +++ b/src/main/java/picard/sam/AbstractAlignmentMerger.java @@ -59,6 +59,7 @@ import java.io.File; import java.util.ArrayList; import java.util.List; +import java.util.Optional; /** * Abstract class that coordinates the general task of taking in a set of alignment information, @@ -86,6 +87,7 @@ public static final int MAX_RECORDS_IN_RAM = 500000; private static final char[] RESERVED_ATTRIBUTE_STARTS = {'X', 'Y', 'Z'}; + private int crossSpeciesReads = 0; private final Log log = Log.getInstance(AbstractAlignmentMerger.class); private final ProgressLogger progress = new ProgressLogger(this.log, 1000000, "Merged", "records"); @@ -113,6 +115,8 @@ private boolean keepAlignerProperPairFlags = false; private boolean addMateCigar = false; private boolean unmapContaminantReads = false; + private UnmappingReadStrategy unmappingReadsStrategy = UnmappingReadStrategy.DO_NOT_CHANGE; + private final SamRecordFilter alignmentFilter = new SamRecordFilter() { public boolean filterOut(final SAMRecord record) { @@ -156,6 +160,29 @@ void close() { } } + public enum UnmappingReadStrategy { + // Leave on record, and copy to tag + COPY_TO_TAG(false, true), + // Leave on record, but do not create additional tag + DO_NOT_CHANGE(false, false), + // Add tag with information, and remove from standard fields in record + MOVE_TO_TAG(true, true); + + private final boolean resetMappingInformation, populatePATag; + + UnmappingReadStrategy(final boolean resetMappingInformation, final boolean populatePATag) { + this.resetMappingInformation = resetMappingInformation; + this.populatePATag = populatePATag; + } + + public boolean isResetMappingInformation() { + return resetMappingInformation; + } + + public boolean isPopulatePaTag() { + return populatePATag; + } + } protected abstract CloseableIterator getQuerynameSortedAlignedRecords(); @@ -163,6 +190,43 @@ void close() { protected boolean isContaminant(final HitsForInsert hits) { return false; } // default implementation + /** constructor with a default setting for unmappingReadsStrategy. + * + * see full constructor for parameters + * + * + */ + public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile, + final File referenceFasta, final boolean clipAdapters, + final boolean bisulfiteSequence, final boolean alignedReadsOnly, + final SAMProgramRecord programRecord, final List attributesToRetain, + final List attributesToRemove, + final Integer read1BasesTrimmed, final Integer read2BasesTrimmed, + final List expectedOrientations, + final SAMFileHeader.SortOrder sortOrder, + final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy, + final boolean addMateCigar, + final boolean unmapContaminantReads) { + this(unmappedBamFile, + targetBamFile, + referenceFasta, + clipAdapters, + bisulfiteSequence, + alignedReadsOnly, + programRecord, + attributesToRetain, + attributesToRemove, + read1BasesTrimmed, + read2BasesTrimmed, + expectedOrientations, + sortOrder, + primaryAlignmentSelectionStrategy, + addMateCigar, + unmapContaminantReads, + UnmappingReadStrategy.DO_NOT_CHANGE); + } + + /** * Constructor * @@ -192,8 +256,10 @@ void close() { * @param primaryAlignmentSelectionStrategy What to do when there are multiple primary alignments, or multiple * alignments but none primary, for a read or read pair. * @param addMateCigar True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include. - * @param unmapContaminantReads If true, identify reads having the signature of contamination from a foreign organism (i.e. mostly clipped bases), + * @param unmapContaminantReads If true, identify reads having the signature of cross-species contamination (i.e. mostly clipped bases), * and mark them as unmapped. + * @param unmappingReadsStrategy An enum describing how to deal with reads whose mapping information are being removed (currently this happens due to cross-species + * contamination). Ignored unless unmapContaminantReads is true. */ public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta, final boolean clipAdapters, @@ -205,7 +271,8 @@ public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamF final SAMFileHeader.SortOrder sortOrder, final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy, final boolean addMateCigar, - final boolean unmapContaminantReads) { + final boolean unmapContaminantReads, + final UnmappingReadStrategy unmappingReadsStrategy) { IOUtil.assertFileIsReadable(unmappedBamFile); IOUtil.assertFileIsWritable(targetBamFile); IOUtil.assertFileIsReadable(referenceFasta); @@ -255,6 +322,7 @@ public AbstractAlignmentMerger(final File unmappedBamFile, final File targetBamF this.addMateCigar = addMateCigar; this.unmapContaminantReads = unmapContaminantReads; + this.unmappingReadsStrategy = unmappingReadsStrategy; } /** Allows the caller to override the maximum records in RAM. */ @@ -515,7 +583,9 @@ public static void fixNmMdAndUq(final SAMRecord record, final ReferenceSequenceF private void addIfNotFiltered(final Sink out, final SAMRecord rec) { if (includeSecondaryAlignments || !rec.getNotPrimaryAlignmentFlag()) { out.add(rec); - this.progress.record(rec); + if (this.progress.record(rec) && crossSpeciesReads > 0) { + log.info(String.format("%d Reads have been unmapped due to being suspected of being Cross-species contamination.", crossSpeciesReads)); + } } } @@ -537,10 +607,20 @@ private HitsForInsert nextAligned() { return null; } + private int numCrossSpeciesContaminantWarnings = 0; + /** * Copies alignment info from aligned to unaligned read, clips as appropriate, and sets PG ID. * May also un-map the resulting read if the alignment is bad (e.g. no unclipped bases). * + * Depending on the value of unmappingReadsStrategy this function will potentially + * + * - reset the mapping information (MOVE_TO_TAG) + * - copy the original mapping information to a tag "PA" (MOVE_TO_TAG, or COPY_TO_TAG) + * a third possibility for unmappingReadsStrategy is to do nothig (DO_NOT_CHANGE) + * + * This is because the CRAM format will reset mapping information for reads that are flagged as unaligned. + * * @param unaligned Original SAMRecord, and object into which values are copied. * @param aligned Holds alignment info that will be copied into unaligned. * @param isContaminant Should this read be unmapped due to contamination? @@ -555,17 +635,49 @@ private void transferAlignmentInfoToFragment(final SAMRecord unaligned, final SA log.warn("Record mapped off end of reference; making unmapped: " + aligned); SAMUtils.makeReadUnmapped(unaligned); } else if (isContaminant) { - log.warn("Record looks like foreign contamination; making unmapped: " + aligned); - // NB: for reads that look like contamination, just set unmapped flag and zero MQ but keep other flags as-is. - // this maintains the sort order so that downstream analyses can use them for calculating evidence - // of contamination vs other causes (e.g. structural variants) + + crossSpeciesReads++; + + if (unmappingReadsStrategy.isPopulatePaTag()) { + unaligned.setAttribute("PA", encodeMappingInformation(aligned)); + } + + if (unmappingReadsStrategy.isResetMappingInformation()) { + unaligned.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); + unaligned.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); + unaligned.setCigar(null); + unaligned.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); + } + unaligned.setReadUnmappedFlag(true); + // Unmapped read cannot have non-zero mapping quality and remain valid unaligned.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); - unaligned.setAttribute(SAMTag.FT.name(), "Cross-species contamination"); + + // if there already is a comment, add second comment with a | separator: + Optional optionalComment = Optional.ofNullable(unaligned.getStringAttribute(SAMTag.CO.name())); + unaligned.setAttribute(optionalComment.map(s -> s + " | ").orElse("") + + SAMTag.CO.name(), "Cross-species contamination"); } } /** + * Encodes mapping information from a record into a string according to the format sepcified in the + * Sam-Spec under the SA tag. No protection against missing values (for cigar, and NM tag). + * (Might make sense to move this to htsJDK.) + * + * @param rec SAMRecord whose alignment information will be encoded + * @return String encoding rec's alignment information according to SA tag in the SAM spec + */ + static private String encodeMappingInformation(SAMRecord rec) { + return String.join(",", + rec.getContig(), + ((Integer) rec.getAlignmentStart()).toString(), + rec.getCigarString(), + ((Integer) rec.getMappingQuality()).toString(), + rec.getStringAttribute(SAMTag.NM.name()))+";"; + } + + /** * Copies alignment info from aligned to unaligned read, if there is an alignment, and sets mate information. * * @param firstUnaligned Original first of pair, into which alignment and pair info will be written. diff --git a/src/main/java/picard/sam/MergeBamAlignment.java b/src/main/java/picard/sam/MergeBamAlignment.java index b51b1664b..2ed07d872 100644 --- a/src/main/java/picard/sam/MergeBamAlignment.java +++ b/src/main/java/picard/sam/MergeBamAlignment.java @@ -206,6 +206,9 @@ @Option(doc = "If UNMAP_CONTAMINANT_READS is set, require this many unclipped bases or else the read will be marked as contaminant.") public int MIN_UNCLIPPED_BASES = 32; + @Option(doc = "How to deal with alignment information in reads that are being unmapped (e.g. due to cross-species contamination.) Currently ignored unless UNMAP_CONTAMINANT_READS = true", optional = true) + public AbstractAlignmentMerger.UnmappingReadStrategy UNMAPPED_READ_STRATEGY = AbstractAlignmentMerger.UnmappingReadStrategy.DO_NOT_CHANGE; + private static final Log log = Log.getInstance(MergeBamAlignment.class); /** @@ -260,7 +263,7 @@ protected int doWork() { ATTRIBUTES_TO_RETAIN, ATTRIBUTES_TO_REMOVE, READ1_TRIM, READ2_TRIM, READ1_ALIGNED_BAM, READ2_ALIGNED_BAM, EXPECTED_ORIENTATIONS, SORT_ORDER, PRIMARY_ALIGNMENT_STRATEGY.newInstance(), ADD_MATE_CIGAR, UNMAP_CONTAMINANT_READS, - MIN_UNCLIPPED_BASES); + MIN_UNCLIPPED_BASES, UNMAPPED_READ_STRATEGY); merger.setClipOverlappingReads(CLIP_OVERLAPPING_READS); merger.setMaxRecordsInRam(MAX_RECORDS_IN_RAM); merger.setKeepAlignerProperPairFlags(ALIGNER_PROPER_PAIR_FLAGS); diff --git a/src/main/java/picard/sam/SamAlignmentMerger.java b/src/main/java/picard/sam/SamAlignmentMerger.java index a716c4bb6..69278076b 100644 --- a/src/main/java/picard/sam/SamAlignmentMerger.java +++ b/src/main/java/picard/sam/SamAlignmentMerger.java @@ -13,10 +13,8 @@ import htsjdk.samtools.SamPairUtil; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; -import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.filter.OverclippedReadFilter; import htsjdk.samtools.util.CloseableIterator; -import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.DelegatingIterator; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; @@ -49,48 +47,8 @@ private final OverclippedReadFilter contaminationFilter; /** - * Constructor + * Constructor with a default value for unmappingReadStrategy * - * @param unmappedBamFile The BAM file that was used as the input to the aligner, which will - * include info on all the reads that did not map. Required. - * @param targetBamFile The file to which to write the merged SAM records. Required. - * @param referenceFasta The reference sequence for the map files. Required. - * @param programRecord Program record for taget file SAMRecords created. - * @param clipAdapters Whether adapters marked in unmapped BAM file should be marked as - * soft clipped in the merged bam. Required. - * @param bisulfiteSequence Whether the reads are bisulfite sequence (used when calculating the - * NM and UQ tags). Required. - * @param alignedReadsOnly Whether to output only those reads that have alignment data - * @param alignedSamFile The SAM file(s) with alignment information. Optional. If this is - * not provided, then read1AlignedSamFile and read2AlignedSamFile must be. - * @param maxGaps The maximum number of insertions or deletions permitted in an - * alignment. Alignments with more than this many gaps will be ignored. - * -1 means to allow any number of gaps. - * @param attributesToRetain attributes from the alignment record that should be - * retained when merging, overridden by attributesToRemove if they share - * common tags. - * @param attributesToRemove attributes from the alignment record that should be - * removed when merging. This overrides attributesToRetain if they share - * common tags. - * @param read1BasesTrimmed The number of bases trimmed from start of read 1 prior to alignment. Optional. - * @param read2BasesTrimmed The number of bases trimmed from start of read 2 prior to alignment. Optional. - * @param read1AlignedSamFile The alignment records for read1. Used when the two ends of a read are - * aligned separately. This is optional, but must be specified if - * alignedSamFile is not. - * @param read2AlignedSamFile The alignment records for read1. Used when the two ends of a read are - * aligned separately. This is optional, but must be specified if - * alignedSamFile is not. - * @param expectedOrientations A List of SamPairUtil.PairOrientations that are expected for - * aligned pairs. Used to determine the properPair flag. - * @param sortOrder The order in which the merged records should be output. If null, - * output will be coordinate-sorted - * @param primaryAlignmentSelectionStrategy How to handle multiple alignments for a fragment or read pair, - * in which none are primary, or more than one is marked primary - * @param addMateCigar True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include. - * - * @param unmapContaminantReads If true, identify reads having the signature of contamination from a foreign organism (i.e. mostly clipped bases), - * and mark them as unmapped. - * @param minUnclippedBases If unmapContaminantReads is set, require this many unclipped bases or else the read will be marked as contaminant. */ public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta, final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence, @@ -105,10 +63,94 @@ public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final boolean addMateCigar, final boolean unmapContaminantReads, final int minUnclippedBases) { + this(unmappedBamFile, + targetBamFile, + referenceFasta, + programRecord, + clipAdapters, + bisulfiteSequence, + alignedReadsOnly, + alignedSamFile, + maxGaps, + attributesToRetain, + attributesToRemove, + read1BasesTrimmed, + read2BasesTrimmed, + read1AlignedSamFile, + read2AlignedSamFile, + expectedOrientations, + sortOrder, + primaryAlignmentSelectionStrategy, + addMateCigar, + unmapContaminantReads, + minUnclippedBases, + UnmappingReadStrategy.DO_NOT_CHANGE); + } + + /** + * Constructor + * + * @param unmappedBamFile The BAM file that was used as the input to the aligner, which will + * include info on all the reads that did not map. Required. + * @param targetBamFile The file to which to write the merged SAM records. Required. + * @param referenceFasta The reference sequence for the map files. Required. + * @param programRecord Program record for taget file SAMRecords created. + * @param clipAdapters Whether adapters marked in unmapped BAM file should be marked as + * soft clipped in the merged bam. Required. + * @param bisulfiteSequence Whether the reads are bisulfite sequence (used when calculating the + * NM and UQ tags). Required. + * @param alignedReadsOnly Whether to output only those reads that have alignment data + * @param alignedSamFile The SAM file(s) with alignment information. Optional. If this is + * not provided, then read1AlignedSamFile and read2AlignedSamFile must be. + * @param maxGaps The maximum number of insertions or deletions permitted in an + * alignment. Alignments with more than this many gaps will be ignored. + * -1 means to allow any number of gaps. + * @param attributesToRetain attributes from the alignment record that should be + * retained when merging, overridden by attributesToRemove if they share + * common tags. + * @param attributesToRemove attributes from the alignment record that should be + * removed when merging. This overrides attributesToRetain if they share + * common tags. + * @param read1BasesTrimmed The number of bases trimmed from start of read 1 prior to alignment. Optional. + * @param read2BasesTrimmed The number of bases trimmed from start of read 2 prior to alignment. Optional. + * @param read1AlignedSamFile The alignment records for read1. Used when the two ends of a read are + * aligned separately. This is optional, but must be specified if + * alignedSamFile is not. + * @param read2AlignedSamFile The alignment records for read1. Used when the two ends of a read are + * aligned separately. This is optional, but must be specified if + * alignedSamFile is not. + * @param expectedOrientations A List of SamPairUtil.PairOrientations that are expected for + * aligned pairs. Used to determine the properPair flag. + * @param sortOrder The order in which the merged records should be output. If null, + * output will be coordinate-sorted + * @param primaryAlignmentSelectionStrategy How to handle multiple alignments for a fragment or read pair, + * in which none are primary, or more than one is marked primary + * @param addMateCigar True if we are to add or maintain the mate CIGAR (MC) tag, false if we are to remove or not include. + * @param unmapContaminantReads If true, identify reads having the signature of cross-species contamination (i.e. mostly clipped bases), + * and mark them as unmapped. + * @param minUnclippedBases If unmapContaminantReads is set, require this many unclipped bases or else the read will be marked as contaminant. + * @param unmappingReadStrategy An enum describing how to deal with reads whose mapping information are being removed (currently this happens due to cross-species + * contamination). Ignored unless unmapContaminantReads is true. + */ + public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, final File referenceFasta, + final SAMProgramRecord programRecord, final boolean clipAdapters, final boolean bisulfiteSequence, + final boolean alignedReadsOnly, + final List alignedSamFile, final int maxGaps, final List attributesToRetain, + final List attributesToRemove, + final Integer read1BasesTrimmed, final Integer read2BasesTrimmed, + final List read1AlignedSamFile, final List read2AlignedSamFile, + final List expectedOrientations, + final SortOrder sortOrder, + final PrimaryAlignmentSelectionStrategy primaryAlignmentSelectionStrategy, + final boolean addMateCigar, + final boolean unmapContaminantReads, + final int minUnclippedBases, + final UnmappingReadStrategy unmappingReadStrategy) { super(unmappedBamFile, targetBamFile, referenceFasta, clipAdapters, bisulfiteSequence, alignedReadsOnly, programRecord, attributesToRetain, attributesToRemove, read1BasesTrimmed, - read2BasesTrimmed, expectedOrientations, sortOrder, primaryAlignmentSelectionStrategy, addMateCigar, unmapContaminantReads); + read2BasesTrimmed, expectedOrientations, sortOrder, primaryAlignmentSelectionStrategy, addMateCigar, + unmapContaminantReads, unmappingReadStrategy); if ((alignedSamFile == null || alignedSamFile.isEmpty()) && (read1AlignedSamFile == null || read1AlignedSamFile.isEmpty() || @@ -118,16 +160,10 @@ public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, } if (alignedSamFile != null) { - for (final File f : alignedSamFile) { - IOUtil.assertFileIsReadable(f); - } + alignedSamFile.forEach(IOUtil::assertFileIsReadable); } else { - for (final File f : read1AlignedSamFile) { - IOUtil.assertFileIsReadable(f); - } - for (final File f : read2AlignedSamFile) { - IOUtil.assertFileIsReadable(f); - } + read1AlignedSamFile.forEach(IOUtil::assertFileIsReadable); + read2AlignedSamFile.forEach(IOUtil::assertFileIsReadable); } this.alignedSamFile = alignedSamFile; @@ -137,7 +173,7 @@ public SamAlignmentMerger(final File unmappedBamFile, final File targetBamFile, this.minUnclippedBases = minUnclippedBases; this.contaminationFilter = new OverclippedReadFilter(minUnclippedBases, false); - log.info("Processing SAM file(s): " + alignedSamFile != null ? alignedSamFile : read1AlignedSamFile + "," + read2AlignedSamFile); + log.info("Processing SAM file(s): " + ((alignedSamFile != null) ? alignedSamFile : (read1AlignedSamFile + "," + read2AlignedSamFile))); } diff --git a/src/test/java/picard/sam/MergeBamAlignmentTest.java b/src/test/java/picard/sam/MergeBamAlignmentTest.java index 968cc94ec..c368d7bc9 100644 --- a/src/test/java/picard/sam/MergeBamAlignmentTest.java +++ b/src/test/java/picard/sam/MergeBamAlignmentTest.java @@ -289,9 +289,7 @@ else if (sam.getReadName().equals("both_reads_present_only_first_aligns") || } else { throw new Exception("Unexpected read name: " + sam.getReadName()); } - } - } @Test(dataProvider="data") @@ -304,7 +302,7 @@ public void testSortingOnSamAlignmentMerger(final File unmapped, final File alig false, Arrays.asList(aligned), 1, null, null, null, null, null, null, Arrays.asList(SamPairUtil.PairOrientation.FR), coordinateSorted ? SAMFileHeader.SortOrder.coordinate : SAMFileHeader.SortOrder.queryname, - new BestMapqPrimaryAlignmentSelectionStrategy(), false, false, 30); + new BestMapqPrimaryAlignmentSelectionStrategy(), false, false, 30, AbstractAlignmentMerger.UnmappingReadStrategy.DO_NOT_CHANGE); merger.mergeAlignment(Defaults.REFERENCE_FASTA); Assert.assertEquals(sorted, !merger.getForceSort()); @@ -1282,7 +1280,7 @@ private void testBestFragmentMapqStrategy(final String testName, final int[] fir Assert.assertEquals(numSecondRecords, Math.max(1, secondMapQs.length)); } } - + private void doMergeAlignment(final File unmappedBam, final List alignedBams, final List read1AlignedBams, final List read2AlignedBams, final Integer read1Trim, final Integer read2Trim, final boolean alignReadsOnly, final boolean clipAdapters, final boolean isBisulfiteSequence, final int maxInsOrDels, @@ -1293,8 +1291,26 @@ private void doMergeAlignment(final File unmappedBam, final List alignedBa final Boolean includeSecondary, final Boolean unmapContaminantReads, final SAMFileHeader.SortOrder sortOrder) { + doMergeAlignment(unmappedBam, alignedBams, read1AlignedBams, read2AlignedBams, read1Trim, read2Trim, + alignReadsOnly, clipAdapters, isBisulfiteSequence, maxInsOrDels, + progRecordId, progGroupVersion, progGroupCommandLine, progGroupName, pairedRun, refSeq, output, + expectedOrientation, primaryAlignmentStrategy, attributesToRetain, includeSecondary, unmapContaminantReads, + sortOrder, null); + } + + private void doMergeAlignment(final File unmappedBam, final List alignedBams, + final List read1AlignedBams, final List read2AlignedBams, final Integer read1Trim, final Integer read2Trim, + final boolean alignReadsOnly, final boolean clipAdapters, final boolean isBisulfiteSequence, final int maxInsOrDels, + final String progRecordId, final String progGroupVersion, final String progGroupCommandLine, final String progGroupName, + final boolean pairedRun, final File refSeq, final File output, + final SamPairUtil.PairOrientation expectedOrientation, final MergeBamAlignment.PrimaryAlignmentStrategy primaryAlignmentStrategy, + final String attributesToRetain, + final Boolean includeSecondary, + final Boolean unmapContaminantReads, + final SAMFileHeader.SortOrder sortOrder, + final AbstractAlignmentMerger.UnmappingReadStrategy unmappingReadStrategy) { - final List args = new ArrayList(Arrays.asList( + final List args = new ArrayList<>(Arrays.asList( "UNMAPPED_BAM=" + unmappedBam.getAbsolutePath(), "ALIGNED_READS_ONLY=" + alignReadsOnly, "CLIP_ADAPTERS=" + clipAdapters, @@ -1352,6 +1368,9 @@ private void doMergeAlignment(final File unmappedBam, final List alignedBa if (unmapContaminantReads != null) { args.add("UNMAP_CONTAMINANT_READS=" + unmapContaminantReads); } + if (unmappingReadStrategy != null) { + args.add("UNMAPPED_READ_STRATEGY=" + unmappingReadStrategy); + } if (sortOrder != null) { args.add("SORT_ORDER=" + sortOrder.name()); } @@ -1657,11 +1676,21 @@ private MostDistantStrategyAlignmentSpec(final boolean expectedPrimary, final St }; } - @Test - public void testContaminationDetection() throws IOException { + @DataProvider(name="UnmappedReadStrategies") + public Object[][] UnmappedReadStrategiesProvider() { + return new Object[][] { + {AbstractAlignmentMerger.UnmappingReadStrategy.DO_NOT_CHANGE, "contam.expected.NO_CHANGE.sam"}, + {null, "contam.expected.NO_CHANGE.sam"}, + {AbstractAlignmentMerger.UnmappingReadStrategy.COPY_TO_TAG, "contam.expected.COPY_TO_TAG.sam"}, + {AbstractAlignmentMerger.UnmappingReadStrategy.MOVE_TO_TAG, "contam.expected.MOVE_TO_TAG.sam"} + }; + } + + @Test(dataProvider = "UnmappedReadStrategies") + public void testContaminationDetection(final AbstractAlignmentMerger.UnmappingReadStrategy strategy, final String basename) throws IOException { final File unmappedSam = new File(TEST_DATA_DIR, "contam.unmapped.sam"); final File alignedSam = new File(TEST_DATA_DIR, "contam.aligned.sam"); - final File expectedSam = new File(TEST_DATA_DIR, "contam.expected.sam"); + final File expectedSam = new File(TEST_DATA_DIR, basename); final File refFasta = new File(TEST_DATA_DIR, "cliptest.fasta"); final File mergedSam = File.createTempFile("merged", ".sam"); mergedSam.deleteOnExit(); @@ -1671,7 +1700,7 @@ public void testContaminationDetection() throws IOException { false, true, false, 1, "0", "1.0", "align!", "myAligner", true, refFasta, mergedSam, - null, null, null, null, true, null); + null, null, null, null, true, null, strategy); assertSamValid(mergedSam); IOUtil.assertFilesEqual(expectedSam, mergedSam); diff --git a/testdata/picard/sam/MergeBamAlignment/contam.expected.COPY_TO_TAG.sam b/testdata/picard/sam/MergeBamAlignment/contam.expected.COPY_TO_TAG.sam new file mode 100644 index 000000000..76ef1af05 --- /dev/null +++ b/testdata/picard/sam/MergeBamAlignment/contam.expected.COPY_TO_TAG.sam @@ -0,0 +1,16 @@ +@HD VN:1.5 SO:coordinate +@SQ SN:chr1 LN:1000 UR:file:testdata/net/sf/picard/sam/MergeBamAlignment/cliptest.fasta M5:17522ddd273279f4595f50fea9864734 +@RG ID:0 SM:Hi,Mom! PL:ILLUMINA +@PG ID:0 VN:1.0 CL:align! PN:myAligner +frag_multiple_primary_1 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0 +frag_multiple_primary_2 256 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240 +frag_primary_clipped 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0 +frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240 +r1_clipped_r2_clipped 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_perfect 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_unmapped 77 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_unmapped 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 +r1_clipped_r2_clipped 157 * 0 0 20S10M20S * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PA:Z:chr1,51,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_perfect 157 * 0 0 50M * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PA:Z:chr1,51,50M,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination diff --git a/testdata/picard/sam/MergeBamAlignment/contam.expected.MOVE_TO_TAG.sam b/testdata/picard/sam/MergeBamAlignment/contam.expected.MOVE_TO_TAG.sam new file mode 100644 index 000000000..af6f9e6c8 --- /dev/null +++ b/testdata/picard/sam/MergeBamAlignment/contam.expected.MOVE_TO_TAG.sam @@ -0,0 +1,16 @@ +@HD VN:1.5 SO:coordinate +@SQ SN:chr1 LN:1000 UR:file:testdata/net/sf/picard/sam/MergeBamAlignment/cliptest.fasta M5:17522ddd273279f4595f50fea9864734 +@RG ID:0 SM:Hi,Mom! PL:ILLUMINA +@PG ID:0 VN:1.0 CL:align! PN:myAligner +frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0 +frag_multiple_primary_2 256 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240 +frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0 +frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240 +frag_multiple_primary_1 4 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +frag_primary_clipped 4 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_clipped 109 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_perfect 109 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_unmapped 77 * 0 0 * * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PA:Z:chr1,1,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_unmapped 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 +r1_clipped_r2_clipped 157 * 0 0 * * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PA:Z:chr1,51,20S10M20S,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_perfect 157 * 0 0 * * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PA:Z:chr1,51,50M,30,null; PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination diff --git a/testdata/picard/sam/MergeBamAlignment/contam.expected.sam b/testdata/picard/sam/MergeBamAlignment/contam.expected.NO_CHANGE.sam similarity index 82% rename from testdata/picard/sam/MergeBamAlignment/contam.expected.sam rename to testdata/picard/sam/MergeBamAlignment/contam.expected.NO_CHANGE.sam index 3e2892262..2a3352fc2 100644 --- a/testdata/picard/sam/MergeBamAlignment/contam.expected.sam +++ b/testdata/picard/sam/MergeBamAlignment/contam.expected.NO_CHANGE.sam @@ -2,15 +2,15 @@ @SQ SN:chr1 LN:1000 UR:file:testdata/net/sf/picard/sam/MergeBamAlignment/cliptest.fasta M5:17522ddd273279f4595f50fea9864734 @RG ID:0 SM:Hi,Mom! PL:ILLUMINA @PG ID:0 VN:1.0 CL:align! PN:myAligner -frag_multiple_primary_1 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination +frag_multiple_primary_1 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination frag_multiple_primary_2 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0 frag_multiple_primary_2 256 chr1 1 15 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240 -frag_primary_clipped 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination +frag_primary_clipped 4 chr1 1 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination frag_secondary_clipped 0 chr1 1 30 50M * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:50 PG:Z:0 RG:Z:0 NM:i:0 UQ:i:0 frag_secondary_clipped 256 chr1 1 30 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? MD:Z:0T0T0C0A0T1C0T0G1 PG:Z:0 RG:Z:0 NM:i:8 UQ:i:240 -r1_clipped_r2_clipped 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination -r1_clipped_r2_perfect 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination -r1_clipped_r2_unmapped 77 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination +r1_clipped_r2_clipped 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_perfect 109 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_unmapped 77 * 0 0 20S10M20S * 0 0 TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination r1_clipped_r2_unmapped 141 * 0 0 * * 0 0 TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 -r1_clipped_r2_clipped 157 * 0 0 20S10M20S * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination -r1_clipped_r2_perfect 157 * 0 0 50M * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 FT:Z:Cross-species contamination +r1_clipped_r2_clipped 157 * 0 0 20S10M20S * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination +r1_clipped_r2_perfect 157 * 0 0 50M * 0 0 TGGAGTGTTAACGTACTCTATTATTGTATTGTTTTTTTTTTGCCCTTAAA ?????????????????????????????????????????????????? PG:Z:0 RG:Z:0 CO:Z:Cross-species contamination