diff --git a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java new file mode 100644 index 000000000..f84f300a5 --- /dev/null +++ b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java @@ -0,0 +1,219 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +package picard.analysis; + +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.AbstractLocusInfo; +import htsjdk.samtools.util.AbstractRecordAndOffset; +import htsjdk.samtools.util.Histogram; +import htsjdk.samtools.util.IntervalList; +import htsjdk.samtools.util.SequenceUtil; +import picard.filter.CountingFilter; +import picard.filter.CountingPairedFilter; + +/** + * Class for collecting data on reference coverage, base qualities and excluded bases from one AbstractLocusInfo object for + * CollectWgsMetrics. + *

+ * The shared code for forming the result for CollectWgsMetrics is abstracted into this class. + * Classes that extend this collector implement their logic in the addInfo() method. + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. + */ + +public abstract class AbstractWgsMetricsCollector { + + /** + * The source CollectWgsMetrics object + */ + final CollectWgsMetrics collectWgsMetrics; + /** Count of sites with a given depth of coverage. Includes all but quality 2 bases. + * We draw depths from this histogram when we calculate the theoretical het sensitivity. + */ + protected final long[] unfilteredDepthHistogramArray; + /** Count of bases observed with a given base quality. Includes all but quality 2 bases. + * We draw bases from this histogram when we calculate the theoretical het sensitivity. + */ + protected final long[] unfilteredBaseQHistogramArray; + /** + * Count of sites with a given depth of coverage. + * Excludes bases with quality below MINIMUM_BASE_QUALITY (default 20). + */ + protected final long[] highQualityDepthHistogramArray; + /** + * Number of aligned bases that were filtered out because they were of low base quality (default is < 20). + */ + long basesExcludedByBaseq = 0; + /** + * Number of aligned bases that were filtered out because they were the second observation from an insert with overlapping reads. + */ + long basesExcludedByOverlap = 0; + /** + * Number of aligned bases that were filtered out because they would have raised coverage above the capped value (default cap = 250x). + */ + long basesExcludedByCapping = 0; + /** + * Positions with coverage exceeding this value are treated as if they had coverage at this value + */ + protected final int coverageCap; + + protected final IntervalList intervals; + /** + * This value indicates that processing will stop after the amount of genomic bases specified in the metric. 
+ */ + private final boolean usingStopAfter; + /** + * The number of processed genomic bases + */ + protected long counter = 0; + + /** + * Creates a collector and initializes the inner data structures + * + * @param collectWgsMetrics CollectWgsMetrics, that creates this collector + * @param coverageCap coverage cap + */ + AbstractWgsMetricsCollector(CollectWgsMetrics collectWgsMetrics, final int coverageCap, final IntervalList intervals) { + if (coverageCap <= 0) { + throw new IllegalArgumentException("Coverage cap must be positive."); + } + this.collectWgsMetrics = collectWgsMetrics; + unfilteredDepthHistogramArray = new long[coverageCap + 1]; + highQualityDepthHistogramArray = new long[coverageCap + 1]; + unfilteredBaseQHistogramArray = new long[Byte.MAX_VALUE]; + this.coverageCap = coverageCap; + this.intervals = intervals; + this.usingStopAfter = collectWgsMetrics.STOP_AFTER > 0; + } + + /** + * Accumulates the data from AbstractLocusInfo in inner structures + * @param info {@link htsjdk.samtools.util.AbstractLocusInfo} with aligned to reference position reads + * @param ref {@link htsjdk.samtools.reference.ReferenceSequence} + * @param referenceBaseN true if current the value of reference base represents a no call + */ + public abstract void addInfo(final AbstractLocusInfo info, final ReferenceSequence ref, boolean referenceBaseN); + + /** + * Adds collected metrics and depth histogram to file + * @param file MetricsFile for result of collector's work + * @param dupeFilter counting filter for duplicate reads + * @param mapqFilter counting filter for mapping quality + * @param pairFilter counting filter for reads without a mapped mate pair + */ + public void addToMetricsFile(final MetricsFile file, + final boolean includeBQHistogram, + final CountingFilter dupeFilter, + final CountingFilter mapqFilter, + final CountingPairedFilter pairFilter) { + final CollectWgsMetrics.WgsMetrics + metrics = getMetrics(dupeFilter, mapqFilter, pairFilter); + + // add them to 
the file + file.addMetric(metrics); + file.addHistogram(getHighQualityDepthHistogram()); + if (includeBQHistogram) addBaseQHistogram(file); + } + + protected void addBaseQHistogram(final MetricsFile file) { + file.addHistogram(getUnfilteredBaseQHistogram()); + } + + protected Histogram getHighQualityDepthHistogram() { + return getHistogram(highQualityDepthHistogramArray, "coverage", "high_quality_coverage_count"); + } + + protected Histogram getUnfilteredDepthHistogram() { + return getHistogram(unfilteredDepthHistogramArray, "coverage", "unfiltered_coverage_count"); + } + + protected Histogram getUnfilteredBaseQHistogram() { + return getHistogram(unfilteredBaseQHistogramArray, "baseq", "unfiltered_baseq_count"); + } + + protected Histogram getHistogram(final long[] array, final String binLabel, final String valueLabel) { + final Histogram histogram = new Histogram<>(binLabel, valueLabel); + for (int i = 0; i < array.length; ++i) { + histogram.increment(i, array[i]); + } + return histogram; + } + + /** + * Creates CollectWgsMetrics.WgsMetrics - the object holding the result of CollectWgsMetrics + * + * @param dupeFilter counting filter for duplicate reads + * @param mapqFilter counting filter for mapping quality + * @param pairFilter counting filter for reads without a mapped mate pair + * @return CollectWgsMetrics.WgsMetrics with set fields + */ + protected CollectWgsMetrics.WgsMetrics getMetrics(final CountingFilter dupeFilter, + final CountingFilter mapqFilter, + final CountingPairedFilter pairFilter) { + return collectWgsMetrics.generateWgsMetrics( + this.intervals, + getHighQualityDepthHistogram(), + getUnfilteredDepthHistogram(), + collectWgsMetrics.getBasesExcludedBy(mapqFilter), + collectWgsMetrics.getBasesExcludedBy(dupeFilter), + collectWgsMetrics.getBasesExcludedBy(pairFilter), + basesExcludedByBaseq, + basesExcludedByOverlap, + basesExcludedByCapping, + coverageCap, + getUnfilteredBaseQHistogram(), + collectWgsMetrics.SAMPLE_SIZE + ); + } + + /** + * 
@return true if the number of processed loci exceeded the threshold, otherwise false + */ + boolean isTimeToStop(final long processedLoci) { + return usingStopAfter && processedLoci > collectWgsMetrics.STOP_AFTER - 1; + } + + /** + * Sets the counter to the current number of processed loci. The counter must be updated + * from outside, since we are skipping no-call reference positions outside of the collector + * + * @param counter number of processed loci + */ + public void setCounter(long counter) { + this.counter = counter; + } + + /** + * Checks if reference base at given position is unknown. + * + * @param position to check the base + * @param ref reference sequence + * @return true if reference base at position represents a no call, otherwise false + */ + boolean isReferenceBaseN(final int position, final ReferenceSequence ref) { + final byte base = ref.getBases()[position - 1]; + return SequenceUtil.isNoCall(base); + } +} diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index ab7df819f..645adb3d7 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -47,16 +47,16 @@ import java.io.File; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashSet; import java.util.List; -import java.util.stream.LongStream; import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; /** * Computes a number of metrics that are useful for evaluating coverage and performance of whole genome sequencing experiments. - * + * Two algorithms are available for these metrics: default and fast. The fast algorithm is enabled by the USE_FAST_ALGORITHM option. + * The fast algorithm works better for regions of a BAM file with coverage of at least 10 reads per locus; + * for lower coverage the algorithms perform the same. 
* @author tfennell */ @CommandLineProgramProperties( @@ -118,6 +118,12 @@ @Option(doc="Sample Size used for Theoretical Het Sensitivity sampling. Default is 10000.", optional = true) public int SAMPLE_SIZE=10000; + @Option(doc = "If true, fast algorithm is used.") + public boolean USE_FAST_ALGORITHM = false; + + @Option(doc = "Average read length in the file. Default is 150.", optional = true) + public int READ_LENGTH = 150; + @Option(doc = "An interval list file that contains the positions to restrict the assessment. Please note that " + "all bases of reads that overlap these intervals will be considered, even if some of those bases extend beyond the boundaries of " + "the interval. The ideal use case for this argument is to use it to restrict the calculation to a subset of (whole) contigs. To " + @@ -437,7 +443,7 @@ protected int doWork() { final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci"); final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); final SamReader in = getSamReader(); - final SamLocusIterator iterator = getLocusIterator(in); + final AbstractLocusIterator iterator = getLocusIterator(in); final List filters = new ArrayList<>(); final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY); @@ -451,49 +457,26 @@ protected int doWork() { filters.add(pairFilter); } iterator.setSamFilters(filters); - iterator.setEmitUncoveredLoci(true); iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases - iterator.setQualityScoreCutoff(0); // Handled separately because we want to count bases iterator.setIncludeNonPfReads(false); - iterator.setMaxReadsToAccumulatePerLocus(LOCUS_ACCUMULATION_CAP); - - final WgsMetricsCollector collector = getCollector(COVERAGE_CAP, getIntervalsToExamine()); - - final boolean usingStopAfter = STOP_AFTER > 0; - final long stopAfter = STOP_AFTER - 1; - long counter = 0; - // Loop through all the loci - 
while (iterator.hasNext()) { - final SamLocusIterator.LocusInfo info = iterator.next(); - final ReferenceSequence ref = refWalker.get(info.getSequenceIndex()); - - // Check that the reference is not N - final byte base = ref.getBases()[info.getPosition() - 1]; - if (SequenceUtil.isNoCall(base)) continue; - - // add to the collector - collector.addInfo(info); - - // Record progress and perhaps stop - progress.record(info.getSequenceName(), info.getPosition()); - if (usingStopAfter && ++counter > stopAfter) break; - } - - // check that we added the same number of bases to the raw coverage histogram and the base quality histograms - final long sumBaseQ= Arrays.stream(collector.unfilteredBaseQHistogramArray).sum(); - final long sumDepthHisto = LongStream.rangeClosed(0, collector.coverageCap).map(i -> (i * collector.unfilteredDepthHistogramArray[(int) i])).sum(); - if (sumBaseQ != sumDepthHisto) { - log.error("Coverage and baseQ distributions contain different amount of bases!"); - } + final AbstractWgsMetricsCollector collector = getCollector(COVERAGE_CAP, getIntervalsToExamine()); + final WgsMetricsProcessor processor = getWgsMetricsProcessor(progress, refWalker, iterator, collector); + processor.processFile(); final MetricsFile out = getMetricsFile(); - collector.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, mapqFilter, pairFilter); + processor.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, mapqFilter, pairFilter); out.write(OUTPUT); return 0; } + private WgsMetricsProcessorImpl getWgsMetricsProcessor( + ProgressLogger progress, ReferenceSequenceFileWalker refWalker, + AbstractLocusIterator> iterator, AbstractWgsMetricsCollector collector) { + return new WgsMetricsProcessorImpl<>(iterator, refWalker, collector, progress); + } + /** Gets the intervals over which we will calculate metrics. 
*/ protected IntervalList getIntervalsToExamine() { final IntervalList intervals; @@ -547,7 +530,7 @@ protected WgsMetrics generateWgsMetrics(final IntervalList intervals, ); } - private WgsMetrics generateWgsMetrics(final IntervalList intervals, + WgsMetrics generateWgsMetrics(final IntervalList intervals, final Histogram highQualityDepthHistogram, final Histogram unfilteredDepthHistogram, final long basesExcludedByMapq, @@ -596,48 +579,49 @@ protected long getBasesExcludedBy(final CountingFilter filter) { return filter.getFilteredBases(); } - protected SamLocusIterator getLocusIterator(final SamReader in) { - return (INTERVALS != null) ? new SamLocusIterator(in, IntervalList.fromFile(INTERVALS)) : new SamLocusIterator(in); + /** + * Creates {@link htsjdk.samtools.util.AbstractLocusIterator} implementation according to {@link this#USE_FAST_ALGORITHM} value. + * + * @param in inner {@link htsjdk.samtools.SamReader} + * @return if {@link this#USE_FAST_ALGORITHM} is enabled, returns {@link htsjdk.samtools.util.EdgeReadIterator} implementation, + * otherwise default algorithm is used and {@link htsjdk.samtools.util.SamLocusIterator} is returned. + */ + protected AbstractLocusIterator getLocusIterator(final SamReader in) { + if (USE_FAST_ALGORITHM) { + return (INTERVALS != null) ? new EdgeReadIterator(in, IntervalList.fromFile(INTERVALS)) : new EdgeReadIterator(in); + } + SamLocusIterator iterator = (INTERVALS != null) ? new SamLocusIterator(in, IntervalList.fromFile(INTERVALS)) : new SamLocusIterator(in); + iterator.setMaxReadsToAccumulatePerLocus(LOCUS_ACCUMULATION_CAP); + iterator.setEmitUncoveredLoci(true); + iterator.setQualityScoreCutoff(0); + return iterator; } /** + * Creates {@link picard.analysis.AbstractWgsMetricsCollector} implementation according to {@link this#USE_FAST_ALGORITHM} value. + * * @param coverageCap the maximum depth/coverage to consider. * @param intervals the intervals over which metrics are collected. 
- * @return + * @return if {@link this#USE_FAST_ALGORITHM} is enabled, returns {@link picard.analysis.FastWgsMetricsCollector} implementation, + * otherwise default algorithm is used and {@link picard.analysis.CollectWgsMetrics.WgsMetricsCollector} is returned. */ - protected WgsMetricsCollector getCollector(final int coverageCap, final IntervalList intervals) { - return new WgsMetricsCollector(coverageCap, intervals); + protected AbstractWgsMetricsCollector getCollector(final int coverageCap, final IntervalList intervals) { + return USE_FAST_ALGORITHM ? new FastWgsMetricsCollector(this, coverageCap, intervals) : + new WgsMetricsCollector(this, coverageCap, intervals); } - protected class WgsMetricsCollector { + protected static class WgsMetricsCollector extends AbstractWgsMetricsCollector { - // Count of sites with a given depth of coverage. Includes all but quality 2 bases. - // We draw depths from this histogram when we calculate the theoretical het sensitivity. - protected final long[] unfilteredDepthHistogramArray; - - // Count of bases observed with a given base quality. Includes all but quality 2 bases. - // We draw bases from this histogram when we calculate the theoretical het sensitivity. - protected final long[] unfilteredBaseQHistogramArray; - - // Count of sites with a given depth of coverage. Excludes bases with quality below MINIMUM_BASE_QUALITY (default 20). 
- protected final long[] highQualityDepthHistogramArray; - - private long basesExcludedByBaseq = 0; - private long basesExcludedByOverlap = 0; - private long basesExcludedByCapping = 0; - protected final IntervalList intervals; - protected final int coverageCap; - - public WgsMetricsCollector(final int coverageCap, final IntervalList intervals) { - unfilteredDepthHistogramArray = new long[coverageCap + 1]; - highQualityDepthHistogramArray = new long[coverageCap + 1]; - unfilteredBaseQHistogramArray = new long[Byte.MAX_VALUE]; - this.coverageCap = coverageCap; - this.intervals = intervals; + public WgsMetricsCollector(final CollectWgsMetrics metrics, final int coverageCap, final IntervalList intervals) { + super(metrics, coverageCap, intervals); } - public void addInfo(final SamLocusIterator.LocusInfo info) { + @Override + public void addInfo(final AbstractLocusInfo info, final ReferenceSequence ref, boolean referenceBaseN) { + if (referenceBaseN) { + return; + } // Figure out the coverage while not counting overlapping reads twice, and excluding various things final HashSet readNames = new HashSet<>(info.getRecordAndOffsets().size()); int pileupSize = 0; @@ -653,78 +637,15 @@ public void addInfo(final SamLocusIterator.LocusInfo info) { unfilteredDepth++; } - if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY || + if (recs.getBaseQuality() < collectWgsMetrics.MINIMUM_BASE_QUALITY || SequenceUtil.isNoCall(recs.getReadBase())) { ++basesExcludedByBaseq; continue; } if (!readNames.add(recs.getRecord().getReadName())) { ++basesExcludedByOverlap; continue; } - pileupSize++; } - final int highQualityDepth = Math.min(pileupSize, coverageCap); if (highQualityDepth < pileupSize) basesExcludedByCapping += pileupSize - coverageCap; highQualityDepthHistogramArray[highQualityDepth]++; unfilteredDepthHistogramArray[unfilteredDepth]++; } - - public void addToMetricsFile(final MetricsFile file, - final boolean includeBQHistogram, - final CountingFilter dupeFilter, - final 
CountingFilter mapqFilter, - final CountingPairedFilter pairFilter) { - final WgsMetrics metrics = getMetrics(dupeFilter, mapqFilter, pairFilter); - - // add them to the file - file.addMetric(metrics); - file.addHistogram(getHighQualityDepthHistogram()); - if (includeBQHistogram) addBaseQHistogram(file); - - - } - - protected void addBaseQHistogram(final MetricsFile file) { - file.addHistogram(getUnfilteredBaseQHistogram()); - } - - protected Histogram getHighQualityDepthHistogram() { - return getHistogram(highQualityDepthHistogramArray, "coverage", "high_quality_coverage_count"); - } - - protected Histogram getUnfilteredDepthHistogram(){ - return getHistogram(unfilteredDepthHistogramArray, "coverage", "unfiltered_coverage_count"); - } - - protected Histogram getUnfilteredBaseQHistogram() { - return getHistogram(unfilteredBaseQHistogramArray, "baseq", "unfiltered_baseq_count"); - } - - protected Histogram getHistogram(final long[] array, final String binLabel, final String valueLabel) { - final Histogram histogram = new Histogram<>(binLabel, valueLabel); - for (int i = 0; i < array.length; ++i) { - histogram.increment(i, array[i]); - } - return histogram; - } - - protected WgsMetrics getMetrics(final CountingFilter dupeFilter, - final CountingFilter mapqFilter, - final CountingPairedFilter pairFilter) { - return generateWgsMetrics( - this.intervals, - getHighQualityDepthHistogram(), - getUnfilteredDepthHistogram(), - getBasesExcludedBy(mapqFilter), - getBasesExcludedBy(dupeFilter), - getBasesExcludedBy(pairFilter), - basesExcludedByBaseq, - basesExcludedByOverlap, - basesExcludedByCapping, - coverageCap, - getUnfilteredBaseQHistogram(), - SAMPLE_SIZE - ); - } - - } } - diff --git a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java index 2c872ff43..17e013afd 100644 --- a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java +++ 
b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java @@ -27,7 +27,11 @@ import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.metrics.MetricsFile; -import htsjdk.samtools.util.*; +import htsjdk.samtools.util.Histogram; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.IntervalList; +import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; @@ -125,7 +129,7 @@ protected int doWork() { // Initialize the SamReader, so the header is available prior to super.doWork, for getIntervalsToExamine call. */ getSamReader(); - this.collector = new WgsMetricsWithNonZeroCoverageCollector(COVERAGE_CAP, getIntervalsToExamine()); + this.collector = new WgsMetricsWithNonZeroCoverageCollector(this, COVERAGE_CAP, getIntervalsToExamine()); super.doWork(); final List readGroups = getSamFileHeader().getReadGroups(); @@ -187,8 +191,9 @@ protected WgsMetricsCollector getCollector(final int coverageCap, final Interval Histogram highQualityDepthHistogram; Histogram highQualityDepthHistogramNonZero; - public WgsMetricsWithNonZeroCoverageCollector(final int coverageCap, final IntervalList intervals) { - super(coverageCap, intervals); + public WgsMetricsWithNonZeroCoverageCollector(final CollectWgsMetricsWithNonZeroCoverage metrics, + final int coverageCap, final IntervalList intervals) { + super(metrics, coverageCap, intervals); } @Override @@ -239,5 +244,4 @@ public boolean areHistogramsEmpty() { return (highQualityDepthHistogram.isEmpty() || highQualityDepthHistogramNonZero.isEmpty()); } } - -} \ No newline at end of file +} diff --git a/src/main/java/picard/analysis/CounterManager.java b/src/main/java/picard/analysis/CounterManager.java new file mode 100644 index 000000000..112536b13 --- /dev/null +++ b/src/main/java/picard/analysis/CounterManager.java @@ -0,0 +1,194 @@ +/* + * The 
MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Class for managing a list of Counters of integer, + * provides methods to access data from Counters with respect to an offset. + * Each Counter represents a certain region of a processed sequence starting from offset and + * can accumulate some value for each locus of this sequence, for instance, read coverage. + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. + */ + +public class CounterManager { + + /** + * Length of inner Counter arrays + */ + private final int arrayLength; + /** + * Proposed length of processed reads + */ + private final int readLength; + /** + * Current offset from the start of the processed sequence. 
If offset = 100, this means that the value in + * Counter arrays at index 0 represents the 100th position of the sequence. + */ + private int offset = 0; + /** + * List of Counter arrays for managing + */ + private final ArrayList arrays = new ArrayList<>(2); + + /** + * Constructor creates new CounterManager without any Counters; + * counters are added to CounterManager via the newCounter() method + * + * @param arrayLength length of inner Counter arrays + * @param readLength proposed length of processed reads + */ + public CounterManager(final int arrayLength, int readLength) { + this.arrayLength = arrayLength; + this.readLength = readLength; + } + + /** + * Method added for use in unit tests + * + * @return offset from the reference sequence, that is represented by index 0 of {@link picard.analysis.CounterManager.Counter} arrays. + */ + int getOffset() { + return offset; + } + + /** + * Method added for use in unit tests + * + * @param offset from the reference sequence, that will be represented by index 0 of {@link picard.analysis.CounterManager.Counter} arrays. + */ + void setOffset(int offset) { + this.offset = offset; + } + + /** + * Method checks that new locus position is not out of bounds of Counter arrays and there is enough + * space in them to hold information on at least one more read of length {@link this#readLength}. + * If there is no free space, but there is accumulated information in Counter arrays after new locus position + * that we may need, the arrays are rebased, so that 0 index of arrays represents new locus position. + * Otherwise, we just clear the arrays. + * + * @param locusPosition position in the reference sequence, + * that will be represented by index 0 of {@link picard.analysis.CounterManager.Counter} arrays. 
+ */ + public void checkOutOfBounds(int locusPosition) { + if (locusPosition - offset + readLength >= arrayLength) { + if (locusPosition - offset < arrayLength) { + rebase(locusPosition); + } else { + clear(); + offset = locusPosition; + } + } + } + + /** + * Rebases inner Counter arrays so that 0 index of array represents the new locus position + * + * @param locusPosition position in the reference sequence, + * that will be represented by index 0 of {@link picard.analysis.CounterManager.Counter} arrays. + */ + private void rebase(int locusPosition) { + if (locusPosition < offset) { + throw new IllegalArgumentException("Position in the reference sequence is lesser than offset."); + } + for (Counter counter : arrays) { + final int[] array = counter.array; + final int skipLength = locusPosition - offset; + System.arraycopy(array, skipLength, array, 0, arrayLength - skipLength); + Arrays.fill(array, arrayLength - skipLength, arrayLength, 0); + } + offset = locusPosition; + } + + /** + * Clears all inner Counter arrays + */ + public void clear() { + for (Counter counter : arrays) { + final int[] array = counter.array; + Arrays.fill(array, 0); + } + offset = 0; + } + + /** + * Creates a new Counter object and adds it to the list of managed Counters. + * + * @return {@link picard.analysis.CounterManager.Counter}, that will be managed by current {@link picard.analysis.CounterManager} + */ + public Counter newCounter() { + final Counter counter = new Counter(arrayLength); + arrays.add(counter); + return counter; + } + + /** + * Class represents an integer array with methods to increment and get the values from it with respect + * to offset of outer {@link picard.analysis.CounterManager}. 
+ */ + public class Counter { + + /** + * Inner array to store accumulated values + */ + private final int[] array; + + /** + * @param arrayLength length of the inner array + */ + private Counter(int arrayLength) { + array = new int[arrayLength]; + } + + /** + * Increments value corresponding to reference sequence index + * + * @param index in the reference sequence + */ + public void increment(int index) { + checkIndex(index); + array[index - offset]++; + } + + /** + * @param index in the reference sequence + * @return value from the inner array, corresponding to input reference index + */ + public int get(int index) { + checkIndex(index); + return array[index - offset]; + } + + private void checkIndex(int index) { + if ((index - offset) < 0 || (index - offset) >= array.length) { + throw new ArrayIndexOutOfBoundsException("The requested index " + index + " is out of counter bounds. " + + "Possible cause of exception can be wrong READ_LENGTH parameter (much smaller than actual read length)"); + } + } + } +} diff --git a/src/main/java/picard/analysis/FastWgsMetricsCollector.java b/src/main/java/picard/analysis/FastWgsMetricsCollector.java new file mode 100644 index 000000000..989474d66 --- /dev/null +++ b/src/main/java/picard/analysis/FastWgsMetricsCollector.java @@ -0,0 +1,195 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.AbstractLocusInfo; +import htsjdk.samtools.util.AbstractRecordAndOffset; +import htsjdk.samtools.util.EdgingRecordAndOffset; +import htsjdk.samtools.util.IntervalList; +import htsjdk.samtools.util.SequenceUtil; + +import java.util.HashSet; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.HashMap; + +/** + * Class represents fast algorithm for collecting data from AbstractLocusInfo + * with a list of aligned {@link htsjdk.samtools.util.EdgingRecordAndOffset} objects. According to the algorithm + * we receive only two {@link htsjdk.samtools.util.EdgingRecordAndOffset} objects for each alignment block of a read: + * one for the start of block and one for the end. When meeting a {@link htsjdk.samtools.util.EdgingRecordAndOffset} + * with type {@link htsjdk.samtools.util.EdgingRecordAndOffset.Type#BEGIN}, all information from the alignment block is accumulated in the collector, + * read name is added to a map holding the names of processed reads for detecting overlapping positions. + * When meeting a {@link htsjdk.samtools.util.EdgingRecordAndOffset} with type {@link htsjdk.samtools.util.EdgingRecordAndOffset.Type#END}, + * the read name is removed from the map with names of processed reads. + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. 
+ */ + +public class FastWgsMetricsCollector extends AbstractWgsMetricsCollector { + + /** + * Index of the sequence of the previous processed AbstractLocusInfo + */ + private int previousSequenceIndex; + + /** + * Manager for {@link this#pileupSize} Counter. + */ + private final CounterManager counterManager; + /** + * Counter, accumulating the data on coverage for the calculation of base quality histogram. + */ + private final CounterManager.Counter pileupSize; + /** + * Counter, accumulating the data on unfiltered coverage (includes all but quality 2 bases) + * for the calculation of theoretical het sensitivity. + */ + private final CounterManager.Counter unfilteredDepthSize; + + /** + * Map, holding information on currently processed reads, that possibly have overlapping regions. + */ + private Map> readsNames; + + /** + * Determines the size of created {@link picard.analysis.CounterManager.Counter} objects. The bigger {@link picard.analysis.CounterManager.Counter} objects + * are created, the less rebasing in {@link picard.analysis.CounterManager.Counter} will occur. 
+ */ + private final int ARRAY_SIZE_PER_READ_LENGTH = 2000; + + /** + * Creates a collector and initializes the inner data structures + * + * @param collectWgsMetrics CollectWgsMetrics, that creates this collector + * @param coverageCap coverage cap + */ + public FastWgsMetricsCollector(CollectWgsMetrics collectWgsMetrics, int coverageCap, final IntervalList intervals) { + super(collectWgsMetrics, coverageCap, intervals); + this.previousSequenceIndex = -1; + this.counterManager = new CounterManager(collectWgsMetrics.READ_LENGTH * ARRAY_SIZE_PER_READ_LENGTH, collectWgsMetrics.READ_LENGTH); + this.pileupSize = counterManager.newCounter(); + this.unfilteredDepthSize = counterManager.newCounter(); + } + + @Override + public void addInfo(final AbstractLocusInfo info, final ReferenceSequence ref, boolean referenceBaseN) { + prepareCollector(info); + for (final EdgingRecordAndOffset record : info.getRecordAndOffsets()) { + final String readName = record.getReadName(); + Optional> recordsAndOffsetsForName = Optional.ofNullable((Set)readsNames.get(readName)); + if (record.getType() == EdgingRecordAndOffset.Type.BEGIN) { + processRecord(info.getPosition(), ref, record, recordsAndOffsetsForName.orElse(new HashSet<>())); + } else { + recordsAndOffsetsForName.ifPresent( + edgingRecordAndOffsets -> removeRecordFromMap(record, + edgingRecordAndOffsets)); + } + } + if (!referenceBaseN) { + final int readNamesSize = pileupSize.get(info.getPosition()); + final int highQualityDepth = Math.min(readNamesSize, coverageCap); + if (highQualityDepth < readNamesSize) { + basesExcludedByCapping += readNamesSize - coverageCap; + } + highQualityDepthHistogramArray[highQualityDepth]++; + unfilteredDepthHistogramArray[unfilteredDepthSize.get(info.getPosition())]++; + } + } + + private void processRecord(int position, ReferenceSequence ref, EdgingRecordAndOffset record, Set recordsAndOffsetsForName) { + long processedLoci = counter; + readsNames.put(record.getReadName(), 
recordsAndOffsetsForName); + final byte[] qualities = record.getBaseQualities(); + final byte[] bases = record.getRecord().getReadBases(); + for (int i = 0; i < record.getLength(); i++) { + final int index = i + position; + if (isReferenceBaseN(index, ref)) { + continue; + } + final byte quality = qualities[i + record.getOffset()]; + if (quality <= 2) { + basesExcludedByBaseq++; + } else { + if (unfilteredDepthSize.get(index) < coverageCap) { + unfilteredBaseQHistogramArray[quality]++; + unfilteredDepthSize.increment(index); + } + if (quality < collectWgsMetrics.MINIMUM_BASE_QUALITY || SequenceUtil.isNoCall(bases[i + record.getOffset()])){ + basesExcludedByBaseq++; + } else { + final int bsq = excludeByQuality(recordsAndOffsetsForName, index); + if (recordsAndOffsetsForName.size() - bsq > 0) { + basesExcludedByOverlap++; + } else { + pileupSize.increment(index); + } + } + } + if (isTimeToStop(++processedLoci)) { + break; + } + } + recordsAndOffsetsForName.add(record); + } + + private void removeRecordFromMap(EdgingRecordAndOffset record, Set recordsAndOffsetsForName) { + if (recordsAndOffsetsForName.size() == 1) { + readsNames.remove(record.getReadName()); + } else { + recordsAndOffsetsForName.remove(record.getStart()); + } + } + + /** + * Prepares the accumulator objects to process a new {@link htsjdk.samtools.util.AbstractLocusInfo}. + * If we switch to a new sequence, all accumulators are cleared. 
+ * + * @param info the next {@link htsjdk.samtools.util.AbstractLocusInfo} to process + */ + private void prepareCollector(AbstractLocusInfo info) { + if (readsNames == null) { + readsNames = new HashMap<>(); + } + if (previousSequenceIndex != info.getSequenceIndex()) { + readsNames.clear(); + counterManager.clear(); + previousSequenceIndex = info.getSequenceIndex(); + } + counterManager.checkOutOfBounds(info.getPosition()); + } + + private int excludeByQuality(final Set setForName, int position) { + int bsq = 0; + for (EdgingRecordAndOffset recordAndOffset : setForName) { + if (position - recordAndOffset.getRefPos() >= recordAndOffset.getLength() + || recordAndOffset.getBaseQuality(position) < collectWgsMetrics.MINIMUM_BASE_QUALITY) { + bsq++; + } + } + return bsq; + } +} diff --git a/src/main/java/picard/analysis/WgsMetricsProcessor.java b/src/main/java/picard/analysis/WgsMetricsProcessor.java new file mode 100644 index 000000000..c4415c56d --- /dev/null +++ b/src/main/java/picard/analysis/WgsMetricsProcessor.java @@ -0,0 +1,57 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + + +import htsjdk.samtools.metrics.MetricsFile; +import picard.filter.CountingFilter; +import picard.filter.CountingPairedFilter; + +/** + * Interface for processing data and generate result for CollectWgsMetrics + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. + */ +public interface WgsMetricsProcessor { + + /** + * Method processes the input data and accumulates result data + */ + void processFile(); + + /** + * Adds result metric's data to input file + * + * @param file MetricsFile for result of collector's work + * @param includeBQHistogram include base quality histogram + * @param dupeFilter counting filter for duplicate reads + * @param mapqFilter counting filter for mapping quality + * @param pairFilter counting filter for reads without a mapped mate pair + */ + void addToMetricsFile(final MetricsFile file, + final boolean includeBQHistogram, + final CountingFilter dupeFilter, + final CountingFilter mapqFilter, + final CountingPairedFilter pairFilter); +} diff --git a/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java b/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java new file mode 100644 index 000000000..d987f1685 --- /dev/null +++ b/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java @@ -0,0 +1,129 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and 
to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFileWalker; +import htsjdk.samtools.util.AbstractLocusInfo; +import htsjdk.samtools.util.AbstractLocusIterator; +import htsjdk.samtools.util.AbstractRecordAndOffset; +import htsjdk.samtools.util.ProgressLogger; +import htsjdk.samtools.util.Log; +import picard.filter.CountingFilter; +import picard.filter.CountingPairedFilter; + +import java.util.Arrays; +import java.util.stream.LongStream; + +/** + * Implementation of {@link picard.analysis.WgsMetricsProcessor} that gets input data from a given iterator + * and processes it with a help of collector + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. 
+ */ +public class WgsMetricsProcessorImpl implements WgsMetricsProcessor { + /** + * Source of input data + */ + private final AbstractLocusIterator> iterator; + /** + * Accumulates the data from iterator + */ + private final AbstractWgsMetricsCollector collector; + /** + * ReferenceWalker for a processed reference sequence + */ + private final ReferenceSequenceFileWalker refWalker; + /** + * Logger for the progress of work + */ + private final ProgressLogger progress; + + private final Log log = Log.getInstance(WgsMetricsProcessorImpl.class); + + /** + * @param iterator input {@link htsjdk.samtools.util.AbstractLocusIterator} + * @param refWalker over processed reference file + * @param collector input {@link picard.analysis.AbstractWgsMetricsCollector} + * @param progress logger + */ + public WgsMetricsProcessorImpl(AbstractLocusIterator> iterator, + ReferenceSequenceFileWalker refWalker, + AbstractWgsMetricsCollector collector, + ProgressLogger progress) { + this.iterator = iterator; + this.collector = collector; + this.refWalker = refWalker; + this.progress = progress; + } + + /** + * Method gets the data from iterator for each locus and processes it with the help of collector. 
+ */ + @Override + public void processFile() { + long counter = 0; + + while (iterator.hasNext()) { + final AbstractLocusInfo info = iterator.next(); + final ReferenceSequence ref = refWalker.get(info.getSequenceIndex()); + boolean referenceBaseN = collector.isReferenceBaseN(info.getPosition(), ref); + collector.addInfo(info, ref, referenceBaseN); + if (referenceBaseN) { + continue; + } + + progress.record(info.getSequenceName(), info.getPosition()); + if (collector.isTimeToStop(++counter)) { + break; + } + collector.setCounter(counter); + } + // check that we added the same number of bases to the raw coverage histogram and the base quality histograms + final long sumBaseQ = Arrays.stream(collector.unfilteredBaseQHistogramArray).sum(); + final long sumDepthHisto = LongStream.rangeClosed(0, collector.coverageCap).map(i -> (i * collector.unfilteredDepthHistogramArray[(int) i])).sum(); + if (sumBaseQ != sumDepthHisto) { + log.error("Coverage and baseQ distributions contain different amount of bases!"); + } + } + + /** + * Adds result metric's data to input file + * + * @param file MetricsFile for result of collector's work + * @param includeBQHistogram include base quality histogram + * @param dupeFilter counting filter for duplicate reads + * @param mapqFilter counting filter for mapping quality + * @param pairFilter counting filter for reads without a mapped mate pair + */ + @Override + public void addToMetricsFile(MetricsFile file, + boolean includeBQHistogram, + CountingFilter dupeFilter, + CountingFilter mapqFilter, + CountingPairedFilter pairFilter) { + collector.addToMetricsFile(file, includeBQHistogram, dupeFilter, mapqFilter, pairFilter); + } +} diff --git a/src/test/java/picard/analysis/AbstractWgsMetricsCollectorTest.java b/src/test/java/picard/analysis/AbstractWgsMetricsCollectorTest.java new file mode 100644 index 000000000..e2bcc4f8f --- /dev/null +++ b/src/test/java/picard/analysis/AbstractWgsMetricsCollectorTest.java @@ -0,0 +1,107 @@ +/* + * The MIT 
License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +package picard.analysis; + +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.AbstractLocusInfo; +import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertFalse; +import static picard.analysis.CollectWgsMetricsTestUtils.createIntervalList; + +public class AbstractWgsMetricsCollectorTest { + + @Test + public void testForCollectorWithoutData(){ + long[] templateQualHistogram = new long[127]; + long[] templateHistogramArray = new long[11]; + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + AbstractWgsMetricsCollector collector = new AbstractWgsMetricsCollector(collectWgsMetrics, + 10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + assertEquals(templateHistogramArray, collector.highQualityDepthHistogramArray); + assertEquals(templateQualHistogram, collector.unfilteredBaseQHistogramArray); + assertEquals(0, collector.basesExcludedByCapping); + assertEquals(0, collector.basesExcludedByOverlap); + assertEquals(0, collector.basesExcludedByBaseq); + } + + @Test (expectedExceptions = IllegalArgumentException.class) + public void testForExceptionWithNegativeCoverage(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + new AbstractWgsMetricsCollector(collectWgsMetrics, -10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + } + + @Test + public void testForSetCounter(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + AbstractWgsMetricsCollector collector = new AbstractWgsMetricsCollector(collectWgsMetrics, + 10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + long counter = 20; + 
collector.setCounter(counter); + assertEquals(20, collector.counter); + } + + @Test + public void testForStop(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + collectWgsMetrics.STOP_AFTER = 10; + AbstractWgsMetricsCollector collector = new AbstractWgsMetricsCollector(collectWgsMetrics, + 10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + assertTrue(collector.isTimeToStop(10)); + assertFalse(collector.isTimeToStop(2)); + } + + @Test + public void testForRefBaseN(){ + byte[] refBasis = {'A', 'C', 'C', 'T', 'A', 'N', 'G', 'T', 'N', 'N'}; + ReferenceSequence ref = new ReferenceSequence("test", 0, refBasis); + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + AbstractWgsMetricsCollector collector = new AbstractWgsMetricsCollector(collectWgsMetrics, + 10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + assertTrue(collector.isReferenceBaseN(6, ref)); + assertFalse(collector.isReferenceBaseN(1, ref)); + } +} diff --git a/src/test/java/picard/analysis/CollectWgsMetricsTest.java b/src/test/java/picard/analysis/CollectWgsMetricsTest.java index b282d73db..1a24eeae6 100644 --- a/src/test/java/picard/analysis/CollectWgsMetricsTest.java +++ b/src/test/java/picard/analysis/CollectWgsMetricsTest.java @@ -75,12 +75,14 @@ public String getCommandLineProgramName() { final String referenceFile = "testdata/picard/quality/chrM.reference.fasta"; return new Object[][] { - {tempSamFile, outfile, referenceFile} + {tempSamFile, outfile, referenceFile, "false"}, + {tempSamFile, outfile, referenceFile, "true"}, }; } @Test(dataProvider = "wgsDataProvider") - public void testMetricsFromWGS(final File input, final File outfile, final String referenceFile) throws IOException { + public void testMetricsFromWGS(final File input, final File outfile, final String referenceFile, + 
final String useFastAlgorithm) throws IOException { outfile.deleteOnExit(); final int sampleSize = 1000; @@ -88,7 +90,8 @@ public void testMetricsFromWGS(final File input, final File outfile, final Strin "INPUT=" + input.getAbsolutePath(), "OUTPUT=" + outfile.getAbsolutePath(), "REFERENCE_SEQUENCE=" + referenceFile, - "SAMPLE_SIZE=" + sampleSize + "SAMPLE_SIZE=" + sampleSize, + "USE_FAST_ALGORITHM=" + useFastAlgorithm }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -193,12 +196,19 @@ void setupBuilder() throws IOException { sorter.instanceMain(args); //create output files for tests - outfile = File.createTempFile("testWgsMetrics", ".txt"); + outfile = File.createTempFile("testWgsMetrics", ".txt", TEST_DIR); outfile.deleteOnExit(); } - @Test - public void testLargeIntervals() throws IOException { + @DataProvider(name = "wgsAlgorithm") + public Object[][] wgsAlgorithm() { + return new Object[][] { + {"false"}, + {"true"}, + }; + } + @Test(dataProvider = "wgsAlgorithm") + public void testLargeIntervals(final String useFastAlgorithm) throws IOException { final File input = new File(TEST_DIR, "forMetrics.sam"); final File outfile = File.createTempFile("test", ".wgs_metrics"); final File ref = new File(TEST_DIR, "merger.fasta"); @@ -210,7 +220,8 @@ public void testLargeIntervals() throws IOException { "OUTPUT=" + outfile.getAbsolutePath(), "REFERENCE_SEQUENCE=" + ref.getAbsolutePath(), "INTERVALS=" + intervals.getAbsolutePath(), - "SAMPLE_SIZE=" + sampleSize + "SAMPLE_SIZE=" + sampleSize, + "USE_FAST_ALGORITHM=" + useFastAlgorithm }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -225,8 +236,8 @@ public void testLargeIntervals() throws IOException { } } - @Test - public void testExclusions() throws IOException { + @Test(dataProvider = "wgsAlgorithm") + public void testExclusions(final String useFastAlgorithm) throws IOException { final File reference = new File("testdata/picard/sam/merger.fasta"); final File tempSamFile = 
File.createTempFile("CollectWgsMetrics", ".bam", TEST_DIR); tempSamFile.deleteOnExit(); @@ -273,7 +284,8 @@ public void testExclusions() throws IOException { "OUTPUT=" + outfile.getAbsolutePath(), "REFERENCE_SEQUENCE=" + reference.getAbsolutePath(), "INCLUDE_BQ_HISTOGRAM=true", - "COVERAGE_CAP=3" + "COVERAGE_CAP=3", + "USE_FAST_ALGORITHM=" + useFastAlgorithm }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -289,8 +301,8 @@ public void testExclusions() throws IOException { Assert.assertEquals((long) highQualityDepthHistogram.get(3).getValue(), 2*10); } - @Test - public void testPoorQualityBases() throws IOException { + @Test(dataProvider = "wgsAlgorithm") + public void testPoorQualityBases(final String useFastAlgorithm) throws IOException { final File reference = new File("testdata/picard/quality/chrM.reference.fasta"); final File testSamFile = File.createTempFile("CollectWgsMetrics", ".bam", TEST_DIR); testSamFile.deleteOnExit(); @@ -336,7 +348,8 @@ public void testPoorQualityBases() throws IOException { "OUTPUT=" + outfile.getAbsolutePath(), "REFERENCE_SEQUENCE=" + reference.getAbsolutePath(), "INCLUDE_BQ_HISTOGRAM=true", - "COVERAGE_CAP=3" + "COVERAGE_CAP=3", + "USE_FAST_ALGORITHM=" + useFastAlgorithm }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -349,4 +362,47 @@ public void testPoorQualityBases() throws IOException { Assert.assertEquals(metrics.PCT_EXC_BASEQ, 0.5); Assert.assertEquals(metrics.PCT_EXC_CAPPED, 0.0); } + + @Test(dataProvider = "wgsAlgorithm") + public void testGiantDeletion(final String useFastAlgorithm) throws IOException { + final File reference = new File("testdata/picard/quality/chrM.reference.fasta"); + final File testSamFile = File.createTempFile("CollectWgsMetrics", ".bam", TEST_DIR); + testSamFile.deleteOnExit(); + + final SAMRecordSetBuilder setBuilder = CollectWgsMetricsTestUtils.createTestSAMBuilder(reference, READ_GROUP_ID, SAMPLE, PLATFORM, LIBRARY); + setBuilder.setReadLength(10); + for (int i = 0; i < 3; i++){ + 
setBuilder.addPair("Read:" + i, 0, 1, 30, false, false, "5M10000D5M", "1000D10M", false, true, 40); + } + + final SAMFileWriter writer = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(setBuilder.getHeader(), false, testSamFile); + + for (final SAMRecord record : setBuilder) { + writer.addAlignment(record); + } + + writer.close(); + + final File outfile = File.createTempFile("testGiantDeletion", ".txt"); + outfile.deleteOnExit(); + + final String[] args = new String[] { + "INPUT=" + testSamFile.getAbsolutePath(), + "OUTPUT=" + outfile.getAbsolutePath(), + "REFERENCE_SEQUENCE=" + reference.getAbsolutePath(), + "INCLUDE_BQ_HISTOGRAM=true", + "COVERAGE_CAP=3", + "USE_FAST_ALGORITHM=" + useFastAlgorithm + }; + + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile output = new MetricsFile<>(); + output.read(new FileReader(outfile)); + + final CollectWgsMetrics.WgsMetrics metrics = output.getMetrics().get(0); + + Assert.assertEquals(metrics.PCT_EXC_BASEQ, 0.0); + Assert.assertEquals(metrics.PCT_EXC_CAPPED, 0.0); + } } diff --git a/src/test/java/picard/analysis/CollectWgsMetricsTestUtils.java b/src/test/java/picard/analysis/CollectWgsMetricsTestUtils.java index 468a3853f..e629cac6e 100644 --- a/src/test/java/picard/analysis/CollectWgsMetricsTestUtils.java +++ b/src/test/java/picard/analysis/CollectWgsMetricsTestUtils.java @@ -26,10 +26,24 @@ import htsjdk.samtools.*; +import htsjdk.samtools.filter.SamRecordFilter; +import htsjdk.samtools.filter.SecondaryAlignmentFilter; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileWalker; +import htsjdk.samtools.util.AbstractLocusIterator; +import htsjdk.samtools.util.EdgeReadIterator; +import htsjdk.samtools.util.IntervalList; import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; +import picard.filter.CountingDuplicateFilter; +import picard.filter.CountingFilter; +import 
picard.filter.CountingMapQFilter; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.stream.IntStream; @@ -38,6 +52,14 @@ */ public class CollectWgsMetricsTestUtils { + + private static final String sqHeaderLN20 = "@HD\tSO:coordinate\tVN:1.0\n@SQ\tSN:chrM\tAS:HG18\tLN:20\n"; + private static final String s1 = "3851612\t16\tchrM\t1\t255\t3M2D10M\t*\t0\t0\tACCTACGTTCAAT\tDDDDDDDDDDDDD\n"; + private static final String sqHeaderLN100 = "@HD\tSO:coordinate\tVN:1.0\n@SQ\tSN:chrM\tAS:HG18\tLN:100\n"; + private static final String s2 = "3851612\t16\tchrM\t2\t255\t10M1D5M\t*\t0\t0\tCCTACGTTCAATATT\tDDDDDDDDDDDDDDD\n"; + static final String exampleSamOneRead = sqHeaderLN20+s1; + static final String exampleSamTwoReads = sqHeaderLN100+s1+s2; + protected static SAMRecordSetBuilder createTestSAMBuilder(final File reference, final String readGroupId, final String sample, @@ -96,6 +118,97 @@ private static void createTestSAM(String testSamName) throws IOException { writer.close(); } + static IntervalList createIntervalList() { + return new IntervalList(new SAMFileHeader()); + } + + static AbstractLocusIterator createReadEndsIterator(String exampleSam){ + final List filters = new ArrayList<>(); + final CountingFilter dupeFilter = new CountingDuplicateFilter(); + final CountingFilter mapqFilter = new CountingMapQFilter(0); + filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice + filters.add(mapqFilter); + filters.add(dupeFilter); + SamReader samReader = createSamReader(exampleSam); + AbstractLocusIterator iterator = new EdgeReadIterator(samReader); + iterator.setSamFilters(filters); + iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases + iterator.setIncludeNonPfReads(false); + return iterator; + } + + static SamReader createSamReader(String samExample) { + ByteArrayInputStream 
inputStream = new ByteArrayInputStream(samExample.getBytes()); + return SamReaderFactory.makeDefault().open(SamInputResource.of(inputStream)); + } + + static ReferenceSequenceFileWalker getReferenceSequenceFileWalker(String referenceString){ + ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(referenceString); + return new ReferenceSequenceFileWalker(referenceSequenceFile); + } + + static ReferenceSequenceFileWalker getReferenceSequenceFileWalker(){ + String referenceString = ">ref\nACCTACGTTCAATATTCTTC"; + return getReferenceSequenceFileWalker(referenceString); + } + + static ReferenceSequenceFile createReferenceSequenceFile(String referenceString) { + final SAMSequenceRecord record = new SAMSequenceRecord("ref", referenceString.length()); + final SAMSequenceDictionary dictionary = new SAMSequenceDictionary(); + dictionary.addSequence(record); + return new ReferenceSequenceFile() { + + boolean done = false; + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return dictionary; + } + + @Override + public ReferenceSequence nextSequence() { + if (!done) { + done = true; + return getSequence(record.getSequenceName()); + } + return null; + } + + @Override + public void reset() { + done=false; + } + + @Override + public boolean isIndexed() { + return false; + } + + @Override + public ReferenceSequence getSequence(String contig) { + if (contig.equals(record.getSequenceName())) { + return new ReferenceSequence(record.getSequenceName(), 0, referenceString.getBytes()); + } else { + return null; + } + } + + @Override + public ReferenceSequence getSubsequenceAt(String contig, long start, long stop) { + return null; + } + + @Override + public String toString() { + return null; + } + + @Override + public void close() throws IOException { + + } + }; + } + // call main (from IDE for example) to createTestSAM(), which creates a test SAM file public static void main(String[] args) { try { createTestSAM("TestSam"); } catch(IOException e) { 
; } diff --git a/src/test/java/picard/analysis/CounterManagerTest.java b/src/test/java/picard/analysis/CounterManagerTest.java new file mode 100644 index 000000000..2077860ff --- /dev/null +++ b/src/test/java/picard/analysis/CounterManagerTest.java @@ -0,0 +1,95 @@ +package picard.analysis; + +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +public class CounterManagerTest { + + private CounterManager testCounterManager; + private CounterManager.Counter testCounter; + private int arrayLength = 10; + private int readLength = 10; + private int OFFSET = 7; + private CounterManager secondTestCounterManager; + private CounterManager.Counter secondCounter; + + @BeforeTest + public void setUp() { + secondTestCounterManager = new CounterManager(arrayLength, readLength); + secondTestCounterManager.setOffset(OFFSET); + secondCounter = secondTestCounterManager.newCounter(); + testCounterManager = new CounterManager(arrayLength, readLength); + testCounterManager.setOffset(OFFSET); + testCounter = testCounterManager.newCounter(); + + for (int i = 0; i < arrayLength; i++) { + testCounter.increment(i + OFFSET); + secondCounter.increment(i + OFFSET); + } + testCounter.increment(OFFSET); + secondCounter.increment(OFFSET); + } + + @Test + public void testCounterInc() { + Assert.assertEquals(2, testCounter.get(OFFSET), "Test method increment:"); + } + + @Test + public void testForClearCounter() { + testCounterManager.clear(); + Assert.assertEquals(0, testCounter.get(OFFSET), "The value of the array with index 0 must be 0 after clear manager:"); + } + + @Test + public void testForCorrectOffsetAfterRebase() { + secondTestCounterManager.checkOutOfBounds(11); + Assert.assertEquals(11, secondTestCounterManager.getOffset(), "After rebase offset must be new int"); + } + + @Test + public void testForCorrectCounterAfterRebase() { + secondTestCounterManager.checkOutOfBounds(11); + Assert.assertEquals(1, secondCounter.get(11), "The 
value of the array with index 0 must be 1 after rebase:"); + } + + @Test + public void testForOutOfBoundCounter() { + secondTestCounterManager.checkOutOfBounds(44); + Assert.assertEquals(44, secondTestCounterManager.getOffset(), "New offset after clear must be 44:"); + } + + @Test + public void testForCleanCounterAfter() { + testCounterManager.checkOutOfBounds(88); + Assert.assertEquals(0, testCounter.get(88), "The value of the array with index 0 must be 1 after clean:"); + } + + @Test + public void testForCheckIncrement(){ + CounterManager testCounterManager = new CounterManager(arrayLength, readLength); + testCounterManager.setOffset(0); + CounterManager.Counter counter = testCounterManager.newCounter(); + for (int i=0; i<10; i++){ + counter.increment(1); + } + int[] testArray = new int[arrayLength]; + for (int i = 0; i< arrayLength; i++){ + testArray[i] = counter.get(i); + } + int[]templateArray = new int[arrayLength]; + templateArray[1] = 10; + Assert.assertEquals(templateArray, testArray); + } + + @Test(expectedExceptions = IndexOutOfBoundsException.class) + public void testForWrongIndexInInc(){ + testCounter.increment(40); + } + + @Test(expectedExceptions = IndexOutOfBoundsException.class) + public void testForWrongIndexInGet(){ + testCounter.get(40); + } +} diff --git a/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java b/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java new file mode 100644 index 000000000..84675170b --- /dev/null +++ b/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java @@ -0,0 +1,215 @@ +package picard.analysis; + + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.AbstractLocusInfo; +import htsjdk.samtools.util.AbstractLocusIterator; +import htsjdk.samtools.util.EdgingRecordAndOffset; +import org.testng.annotations.BeforeTest; +import 
org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static picard.analysis.CollectWgsMetricsTestUtils.createIntervalList; +import static picard.analysis.CollectWgsMetricsTestUtils.createReadEndsIterator; +import static picard.analysis.CollectWgsMetricsTestUtils.exampleSamTwoReads; + + +public class FastWgsMetricsCollectorTest { + private byte[] highQualities = {30, 30, 30 ,30, 30, 30, 30, 30, 30, 30, 50, 50, 50 ,50, 50, 50, 50, 50, 50, 60, 60, 60, 70 ,70, 70, 80, 80, 90, 90, 90}; + private byte[] qualities = {2, 2, 3 ,3, 3, 4, 4, 4, 4, 1}; + private byte[] refBases = {'A', 'C', 'C', 'T', 'A', 'C', 'G', 'T', 'T', 'C', 'A', 'A', 'T', 'A', 'T', 'T', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'D' , 'G', 'T', 'C', 'D','A', 'G', 'T', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'T', 'T', 'C', 'G', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'D' , 'G', 'T', 'C', 'D','A', 'G', 'T', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'T', 'T', 'C', 'G', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'D' , 'G', 'T', 'C', 'D', 'G', 'A', 'G', 'T', 'C', 'D' , 'G', 'T', 'C', 'D'}; + private SAMRecord record; + private SAMSequenceRecord sequence; + private SAMRecord secondRecord; + private SAMRecord thirdRecord; + private ReferenceSequence ref; + + @BeforeTest + public void setUp(){ + String referenceString = ">chrM\nACCTACGTTCAATATTCTTCACCTACGTTCAATATTCTTCACCTACGTTCAATATTCTTCACCTACGTTCAATATTCTTCACCTACGTTCAATATTCTTC"; + ref = new ReferenceSequence("chrM", 0, referenceString.getBytes()); + sequence = new SAMSequenceRecord("chrM", 100); + record = new SAMRecord(new SAMFileHeader()); + record.setReadName("test"); + record.setBaseQualities(qualities); + record.setReadBases(refBases); + secondRecord = generateRecord("test1"); + thirdRecord = generateRecord("test2"); + } + + private SAMRecord generateRecord(String name) { + SAMRecord record = new SAMRecord(new SAMFileHeader()); + record.setReadName(name); + record.setBaseQualities(highQualities); + 
record.setReadBases(refBases); + return record; + } + + @Test + public void testAddInfoForQuality() throws Exception { + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 10); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(record, 0, 10, 0); + firstInfo.add(record1); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(record, 0, 10, 0); + firstInfo.add(record2); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + assertEquals(20, collector.basesExcludedByBaseq); + } + + @Test + public void testAddInfoForOverlap(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 5); + AbstractLocusInfo thirdInfo = new AbstractLocusInfo<>(sequence, 10); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 10, 0); + firstInfo.add(record1); + thirdInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 5, 5, 0); + secondInfo.add(record2); + thirdInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + collector.addInfo(thirdInfo, ref, false); + assertEquals(5, collector.basesExcludedByOverlap, "Excluded by overlap:"); + } + + @Test + 
public void testAddInfoForCapping(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 1, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 10); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 10, 0); + firstInfo.add(record1); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 10, 0); + firstInfo.add(record2); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + assertEquals(1, collector.basesExcludedByCapping, "Excluded by capping:"); + } + + @Test + public void testForExcludedForQualityHistogramArray(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + collectWgsMetrics.INCLUDE_BQ_HISTOGRAM = true; + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 30); + EdgingRecordAndOffset record = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 30, 0); + firstInfo.add(record); + firstInfo.add(EdgingRecordAndOffset.createEndRecord(record)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + long[] expectedResult = new long[127]; + expectedResult[30] = 10; expectedResult[50] = 9; expectedResult[60] = 3; + expectedResult[70] = 3; expectedResult[80] = 2; expectedResult[90] = 3; + assertEquals(expectedResult, collector.unfilteredBaseQHistogramArray); + } + + @Test + public void testForBaseQualityHetSens(){ + CollectWgsMetrics collectWgsMetrics = new 
CollectWgsMetrics(); + collectWgsMetrics.INCLUDE_BQ_HISTOGRAM = true; + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 10, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo thirdInfo = new AbstractLocusInfo<>(sequence, 1); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 30, 0); + firstInfo.add(record1); + thirdInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(record, 0, 10, 0); + firstInfo.add(record2); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + collector.addInfo(thirdInfo, ref, false); + + long[] expectedResult = new long[127]; + expectedResult[30] = 10; expectedResult[50] = 9; expectedResult[60] = 3; + expectedResult[70] = 3; expectedResult[80] = 2; expectedResult[90] = 3; + expectedResult[1] = 1; expectedResult[2] = 2; expectedResult[3] = 3; + expectedResult[4] = 4; + } + + @Test + public void testForHistogramArray(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 10, createIntervalList()); + long[] templateHistogramArray = {0,1,3,0,0,0,0,0,0,0,0}; + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 10); + AbstractLocusInfo thirdInfo = new AbstractLocusInfo<>(sequence, 20); + AbstractLocusInfo fourthInfo = new AbstractLocusInfo<>(sequence, 30); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(record, 0, 10, 0); + firstInfo.add(record1); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = 
EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 30, 0); + firstInfo.add(record2); + fourthInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + EdgingRecordAndOffset record3 = EdgingRecordAndOffset.createBeginRecord(thirdRecord, 0, 20, 0); + firstInfo.add(record3); + thirdInfo.add(EdgingRecordAndOffset.createEndRecord(record3)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + collector.addInfo(thirdInfo, ref, false); + collector.addInfo(fourthInfo, ref, false); + assertEquals(templateHistogramArray, collector.unfilteredDepthHistogramArray); + } + + @Test + public void testForCollectorWithoutData(){ + long[] templateQualHistogram = new long[127]; + long[] templateHistogramArray = new long[11]; + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 10, createIntervalList()); + assertEquals(templateHistogramArray, collector.unfilteredDepthHistogramArray); + assertEquals(templateQualHistogram, collector.unfilteredBaseQHistogramArray); + assertEquals(0, collector.basesExcludedByCapping); + assertEquals(0, collector.basesExcludedByOverlap); + assertEquals(0, collector.basesExcludedByBaseq); + } + + @Test + public void testAddInfoWithoutOverlap(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 5); + AbstractLocusInfo thirdInfo = new AbstractLocusInfo<>(sequence, 6); + AbstractLocusInfo fourthInfo = new AbstractLocusInfo<>(sequence, 10); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 5, 0); + firstInfo.add(record1); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = 
EdgingRecordAndOffset.createBeginRecord(secondRecord, 6, 5, 6); + thirdInfo.add(record2); + fourthInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + assertEquals(0, collector.basesExcludedByOverlap, "Excluded by overlap:"); + } + + @Test + public void testForComplicatedCigar(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusIterator sli = createReadEndsIterator(exampleSamTwoReads); + while(sli.hasNext()) { + AbstractLocusInfo info = sli.next(); + collector.addInfo(info, ref, false); + } + assertEquals(11, collector.basesExcludedByOverlap, "Excluded by overlap:"); + } +} diff --git a/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java b/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java new file mode 100644 index 000000000..9bd9e20b8 --- /dev/null +++ b/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java @@ -0,0 +1,86 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileWalker; +import htsjdk.samtools.util.AbstractLocusIterator; +import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.ProgressLogger; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static picard.analysis.CollectWgsMetricsTestUtils.createIntervalList; +import static picard.analysis.CollectWgsMetricsTestUtils.createReadEndsIterator; +import static picard.analysis.CollectWgsMetricsTestUtils.exampleSamOneRead; +import static picard.analysis.CollectWgsMetricsTestUtils.getReferenceSequenceFileWalker; +import static picard.analysis.CollectWgsMetricsTestUtils.createReferenceSequenceFile; + +public class WgsMetricsProcessorImplTest { + private ProgressLogger progress; + + @BeforeTest + public void setUp(){ + progress = new ProgressLogger(Log.getInstance(WgsMetricsProcessorImpl.class)); + } + + @Test + public void testForProcessFile(){ + AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead); + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + WgsMetricsProcessorImpl wgsMetricsProcessor = + new WgsMetricsProcessorImpl(iterator, getReferenceSequenceFileWalker(), collector, progress); + 
wgsMetricsProcessor.processFile(); + assertEquals(20, collector.counter); + } + + @Test + public void testForFilteredBases(){ + AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead); + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + String secondReferenceString = ">ref\nNNNNNNNNNNAATATTCTTC"; + ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(secondReferenceString); + ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(referenceSequenceFile); + WgsMetricsProcessorImpl wgsMetricsProcessor = + new WgsMetricsProcessorImpl(iterator, refWalker, collector, progress); + wgsMetricsProcessor.processFile(); + assertEquals(10, collector.counter); + } + + @Test + public void testForExitAfter(){ + AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead); + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + collectWgsMetrics.STOP_AFTER = 16; + AbstractWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, null); + WgsMetricsProcessorImpl wgsMetricsProcessor = + new WgsMetricsProcessorImpl(iterator, getReferenceSequenceFileWalker(), collector, progress); + wgsMetricsProcessor.processFile(); + assertEquals(15, collector.counter); + } +}