diff --git a/src/main/java/picard/analysis/RnaSeqMetrics.java b/src/main/java/picard/analysis/RnaSeqMetrics.java index 07e299e5e..c0f7f5376 100644 --- a/src/main/java/picard/analysis/RnaSeqMetrics.java +++ b/src/main/java/picard/analysis/RnaSeqMetrics.java @@ -66,6 +66,36 @@ /** Number of aligned reads that are mapped to the incorrect strand. 0 if library is not strand-specific. */ public long INCORRECT_STRAND_READS; + /** The number of reads that support the model where R1 is on the strand of transcription and R2 is on the + * opposite strand. + */ + public long NUM_R1_TRANSCRIPT_STRAND_READS; + + /** + * The fraction of reads that support the model where R2 is on the strand of transcription and R1 is on the opposite + * strand. + */ + public long NUM_R2_TRANSCRIPT_STRAND_READS; + + /** + * The fraction of reads for which the transcription strand model could not be inferred. + */ + public long NUM_UNEXPLAINED_READS; + + + /** The fraction of reads that support the model where R1 is on the strand of transcription and R2 is on the + * opposite strand. For unpaired reads, it is the fraction of reads that are on the transcription strand (out of all + * the reads). + */ + public double PCT_R1_TRANSCRIPT_STRAND_READS; + + /** + * The fraction of reads that support the model where R2 is on the strand of transcription and R1 is on the opposite + * strand. For unpaired reads, it is the fraction of reads that are on opposite strand than that of the the + * transcription strand (out of all the reads). + */ + public double PCT_R2_TRANSCRIPT_STRAND_READS; + /** Fraction of PF_ALIGNED_BASES that mapped to regions encoding ribosomal RNA, RIBOSOMAL_BASES/PF_ALIGNED_BASES */ public Double PCT_RIBOSOMAL_BASES; diff --git a/src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java b/src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java index 149e0ffd9..3ffb4ffe2 100644 --- a/src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java +++ b/src/main/java/picard/analysis/directed/RnaSeqMetricsCollector.java @@ -1,10 +1,6 @@ package picard.analysis.directed; -import htsjdk.samtools.AlignmentBlock; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.*; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.CoordMath; import htsjdk.samtools.util.Histogram; @@ -232,25 +228,64 @@ public void acceptRecord(SAMRecord rec) { // Strand-specificity is tallied on read basis rather than base at a time. A read that aligns to more than one // gene is not counted. - if (!rec.getNotPrimaryAlignmentFlag() && overlapsExon && strandSpecificity != StrandSpecificity.NONE && overlappingGenes.size() == 1) { - final boolean negativeTranscriptionStrand = overlappingGenes.iterator().next().isNegativeStrand(); - final boolean negativeReadStrand = rec.getReadNegativeStrandFlag(); - final boolean readAndTranscriptStrandsAgree = negativeReadStrand == negativeTranscriptionStrand; - final boolean readOneOrUnpaired = !rec.getReadPairedFlag() || rec.getFirstOfPairFlag(); - final boolean firstReadExpectedToAgree = strandSpecificity == StrandSpecificity.FIRST_READ_TRANSCRIPTION_STRAND; - final boolean thisReadExpectedToAgree = readOneOrUnpaired == firstReadExpectedToAgree; - // If the read strand is the same as the strand of the transcript, and the end is the one that is supposed to agree, - // then the strand specificity for this read is correct. - // -- OR -- - // If the read strand is not the same as the strand of the transcript, and the end is not the one that is supposed - // to agree, then the strand specificity for this read is correct. - if (readAndTranscriptStrandsAgree == thisReadExpectedToAgree) { - ++metrics.CORRECT_STRAND_READS; - } else { - ++metrics.INCORRECT_STRAND_READS; + if (!rec.getSupplementaryAlignmentFlag() && overlapsExon && overlappingGenes.size() == 1) { + final Gene gene = overlappingGenes.iterator().next(); + final boolean negativeTranscriptionStrand = gene.isNegativeStrand(); + final boolean readOneOrUnpaired = !rec.getReadPairedFlag() || rec.getFirstOfPairFlag(); + final boolean negativeReadStrand = rec.getReadNegativeStrandFlag(); + if (strandSpecificity != StrandSpecificity.NONE) { + final boolean readAndTranscriptStrandsAgree = negativeReadStrand == negativeTranscriptionStrand; + final boolean firstReadExpectedToAgree = strandSpecificity == StrandSpecificity.FIRST_READ_TRANSCRIPTION_STRAND; + final boolean thisReadExpectedToAgree = readOneOrUnpaired == firstReadExpectedToAgree; + // If the read strand is the same as the strand of the transcript, and the end is the one that is supposed to agree, + // then the strand specificity for this read is correct. + // -- OR -- + // If the read strand is not the same as the strand of the transcript, and the end is not the one that is supposed + // to agree, then the strand specificity for this read is correct. + if (readAndTranscriptStrandsAgree == thisReadExpectedToAgree) { + ++metrics.CORRECT_STRAND_READS; + } else { + ++metrics.INCORRECT_STRAND_READS; + } } - } + // Count templates only once rather than individual reads/records + if (readOneOrUnpaired) { + // Check that for paired end reads they are in the proper orientation (FR) and that the entire + // template is enclosed in the gene. + final boolean properOrientation; + final int leftMostAlignedBase, rightMostAlignedBase; + if (rec.getReadPairedFlag()) { + if (rec.getMateUnmappedFlag()) { + properOrientation = false; // mate is unmapped! + leftMostAlignedBase = rightMostAlignedBase = 0; + } else { + // Get the alignment length of the mate. Approximate it with the current read length if no + // mate cigar is found. + final Cigar mateCigar = SAMUtils.getMateCigar(rec); + final int mateReferenceLength = (mateCigar == null) ? rec.getReadLength() : mateCigar.getReferenceLength(); + final int mateAlignmentEnd = CoordMath.getEnd(rec.getMateAlignmentStart(), mateReferenceLength); + properOrientation = SamPairUtil.getPairOrientation(rec) == SamPairUtil.PairOrientation.FR; + leftMostAlignedBase = Math.min(rec.getAlignmentStart(), rec.getMateAlignmentStart()); + rightMostAlignedBase = Math.max(rec.getAlignmentEnd(), mateAlignmentEnd); + } + } else { + properOrientation = true; // can only be false for paired end reads + leftMostAlignedBase = rec.getAlignmentStart(); + rightMostAlignedBase = rec.getAlignmentEnd(); + } + + if (properOrientation && CoordMath.encloses(gene.getStart(), gene.getEnd(), leftMostAlignedBase, rightMostAlignedBase)) { + if (negativeReadStrand == negativeTranscriptionStrand) { + ++metrics.NUM_R1_TRANSCRIPT_STRAND_READS; + } else { + ++metrics.NUM_R2_TRANSCRIPT_STRAND_READS; + } + } else { + ++metrics.NUM_UNEXPLAINED_READS; + } + } + } } protected int getNumAlignedBases(SAMRecord rec) { @@ -277,6 +312,12 @@ public void finish() { if (metrics.CORRECT_STRAND_READS > 0 || metrics.INCORRECT_STRAND_READS > 0) { metrics.PCT_CORRECT_STRAND_READS = metrics.CORRECT_STRAND_READS/(double)(metrics.CORRECT_STRAND_READS + metrics.INCORRECT_STRAND_READS); } + + final long readsExamined = metrics.NUM_R1_TRANSCRIPT_STRAND_READS + metrics.NUM_R2_TRANSCRIPT_STRAND_READS; + if (0 < readsExamined) { + metrics.PCT_R1_TRANSCRIPT_STRAND_READS = metrics.NUM_R1_TRANSCRIPT_STRAND_READS / (double) readsExamined; + metrics.PCT_R2_TRANSCRIPT_STRAND_READS = metrics.NUM_R2_TRANSCRIPT_STRAND_READS / (double) readsExamined; + } } @Override diff --git a/src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java b/src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java index d58779166..fd25a8503 100644 --- a/src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java +++ b/src/test/java/picard/analysis/CollectRnaSeqMetricsTest.java @@ -23,12 +23,7 @@ */ package picard.analysis; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileWriter; -import htsjdk.samtools.SAMFileWriterFactory; -import htsjdk.samtools.SAMReadGroupRecord; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMRecordSetBuilder; +import htsjdk.samtools.*; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalList; @@ -48,7 +43,7 @@ public String getCommandLineProgramName() { } @Test - public void basic() throws Exception { + public void testBasic() throws Exception { final String sequence = "chr1"; final String ignoredSequence = "chrM"; @@ -92,7 +87,7 @@ public void basic() throws Exception { "REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(), "RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(), "STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND", - "IGNORE_SEQUENCE=" +ignoredSequence + "IGNORE_SEQUENCE=" + ignoredSequence }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -110,6 +105,11 @@ public void basic() throws Exception { Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3); Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4); Assert.assertEquals(metrics.IGNORED_READS, 1); + Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1); + Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2); + Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 2); + Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.333333); + Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.666667); } @Test @@ -165,7 +165,7 @@ public void testMultiLevel() throws Exception { "REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(), "RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(), "STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND", - "IGNORE_SEQUENCE=" +ignoredSequence, + "IGNORE_SEQUENCE=" + ignoredSequence, "LEVEL=null", "LEVEL=SAMPLE", "LEVEL=LIBRARY" @@ -187,6 +187,11 @@ public void testMultiLevel() throws Exception { Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3); Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 4); Assert.assertEquals(metrics.IGNORED_READS, 1); + Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1); + Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2); + Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 2); + Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.333333); + Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.666667); } else if (metrics.LIBRARY.equals("foo")) { Assert.assertEquals(metrics.PF_ALIGNED_BASES, 216); @@ -199,7 +204,11 @@ else if (metrics.LIBRARY.equals("foo")) { Assert.assertEquals(metrics.CORRECT_STRAND_READS, 3); Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2); Assert.assertEquals(metrics.IGNORED_READS, 0); - + Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 0); + Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2); + Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 1); + Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.0); + Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 1.0); } else if (metrics.LIBRARY.equals("bar")) { Assert.assertEquals(metrics.PF_ALIGNED_BASES, 180); @@ -212,11 +221,104 @@ else if (metrics.LIBRARY.equals("bar")) { Assert.assertEquals(metrics.CORRECT_STRAND_READS, 0); Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 2); Assert.assertEquals(metrics.IGNORED_READS, 1); - + Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 1); + Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 0); + Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 1); + Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 1.0); + Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.0); } } } + @Test + public void testTranscriptionStrandMetrics() throws Exception { + final String sequence = "chr1"; + final String ignoredSequence = "chrM"; + + // Create some alignments that hit the ribosomal sequence, various parts of the gene, and intergenic. + final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate); + // Set seed so that strandedness is consistent among runs. + builder.setRandomSeed(0); + builder.setReadLength(50); + final int sequenceIndex = builder.getHeader().getSequenceIndex(sequence); + + // Reads that start *before* the gene: count as unexamined + builder.addPair("pair_prior_1", sequenceIndex, 45, 50, false, false, "50M", "50M", true, false, -1); + builder.addPair("pair_prior_2", sequenceIndex, 49, 50, false, false, "50M", "50M", true, false, -1); + + // Read that is enclosed in the gene, but one end does not map: count as unexamined + builder.addPair("read_one_end_unmapped", sequenceIndex, 50, 51, false, true, "50M", null, false, false, -1); + + // Reads that are enclosed in the gene, paired and frag: one count per template + builder.addFrag("frag_enclosed_forward", sequenceIndex, 150, false); + builder.addFrag("frag_enclosed_reverse", sequenceIndex, 150, true); + builder.addPair("pair_enclosed_forward", sequenceIndex, 150, 200, false, false, "50M", "50M", false, true, -1); + builder.addPair("pair_enclosed_reverse", sequenceIndex, 200, 150, false, false, "50M", "50M", true, false, -1); + + // Read that starts *after* the gene: not counted + builder.addPair("pair_after_1", sequenceIndex, 545, 550, false, false, "50M", "50M", true, false, -1); + builder.addPair("pair_after_2", sequenceIndex, 549, 550, false, false, "50M", "50M", true, false, -1); + + // A read that uses the read length instead of the mate cigar + builder.addFrag("frag_enclosed_forward_no_mate_cigar", sequenceIndex, 150, false).setAttribute(SAMTag.MC.name(), null); + + // Duplicate reads are counted + builder.addFrag("frag_duplicate", sequenceIndex, 150, false).setDuplicateReadFlag(true); + + // These reads should be ignored. + builder.addFrag("frag_secondary", sequenceIndex, 150, false).setNotPrimaryAlignmentFlag(true); + builder.addFrag("frag_supplementary", sequenceIndex, 150, false).setSupplementaryAlignmentFlag(true); + builder.addFrag("frag_qc_failure", sequenceIndex, 150, false).setReadFailsVendorQualityCheckFlag(true); + + final File samFile = File.createTempFile("tmp.collectRnaSeqMetrics.", ".sam"); + samFile.deleteOnExit(); + + final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(builder.getHeader(), false, samFile); + for (final SAMRecord rec: builder.getRecords()) samWriter.addAlignment(rec); + samWriter.close(); + + // Create an interval list with one ribosomal interval. + final Interval rRnaInterval = new Interval(sequence, 300, 520, true, "rRNA"); + final IntervalList rRnaIntervalList = new IntervalList(builder.getHeader()); + rRnaIntervalList.add(rRnaInterval); + final File rRnaIntervalsFile = File.createTempFile("tmp.rRna.", ".interval_list"); + rRnaIntervalsFile.deleteOnExit(); + rRnaIntervalList.write(rRnaIntervalsFile); + + // Generate the metrics. + final File metricsFile = File.createTempFile("tmp.", ".rna_metrics"); + metricsFile.deleteOnExit(); + + final String[] args = new String[] { + "INPUT=" + samFile.getAbsolutePath(), + "OUTPUT=" + metricsFile.getAbsolutePath(), + "REF_FLAT=" + getRefFlatFile(sequence).getAbsolutePath(), + "RIBOSOMAL_INTERVALS=" + rRnaIntervalsFile.getAbsolutePath(), + "STRAND_SPECIFICITY=SECOND_READ_TRANSCRIPTION_STRAND", + "IGNORE_SEQUENCE=" + ignoredSequence + }; + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile> output = new MetricsFile>(); + output.read(new FileReader(metricsFile)); + + final RnaSeqMetrics metrics = output.getMetrics().get(0); + Assert.assertEquals(metrics.PF_ALIGNED_BASES, 900); + Assert.assertEquals(metrics.PF_BASES, 950); + Assert.assertEquals(metrics.RIBOSOMAL_BASES.longValue(), 0L); + Assert.assertEquals(metrics.CODING_BASES, 471); + Assert.assertEquals(metrics.UTR_BASES, 125); + Assert.assertEquals(metrics.INTRONIC_BASES, 98); + Assert.assertEquals(metrics.INTERGENIC_BASES, 206); + Assert.assertEquals(metrics.CORRECT_STRAND_READS, 7); + Assert.assertEquals(metrics.INCORRECT_STRAND_READS, 6); + Assert.assertEquals(metrics.IGNORED_READS, 0); + Assert.assertEquals(metrics.NUM_R1_TRANSCRIPT_STRAND_READS, 4); + Assert.assertEquals(metrics.NUM_R2_TRANSCRIPT_STRAND_READS, 2); + Assert.assertEquals(metrics.NUM_UNEXPLAINED_READS, 3); + Assert.assertEquals(metrics.PCT_R1_TRANSCRIPT_STRAND_READS, 0.666667); + Assert.assertEquals(metrics.PCT_R2_TRANSCRIPT_STRAND_READS, 0.333333); + } public File getRefFlatFile(String sequence) throws Exception { // Create a refFlat file with a single gene containing two exons, one of which is overlapped by the @@ -242,5 +344,4 @@ public File getRefFlatFile(String sequence) throws Exception { return refFlatFile; } - }