From a3cf5a35df8deac92b7a4fb3ccbc102dd6e4cd72 Mon Sep 17 00:00:00 2001 From: Zaal Date: Tue, 10 Jan 2017 12:01:29 +0300 Subject: [PATCH 1/3] New CollectWGS Algorithm --- .../analysis/AbstractWgsMetricsCollector.java | 293 +++++++++++++++++++++ .../java/picard/analysis/CollectWgsMetrics.java | 183 ++++--------- .../CollectWgsMetricsWithNonZeroCoverage.java | 9 +- src/main/java/picard/analysis/CounterManager.java | 189 +++++++++++++ .../picard/analysis/FastWgsMetricsCollector.java | 192 ++++++++++++++ .../java/picard/analysis/WgsMetricsProcessor.java | 57 ++++ .../picard/analysis/WgsMetricsProcessorImpl.java | 126 +++++++++ .../analysis/AbstractWgsMetricsCollectorTest.java | 107 ++++++++ .../picard/analysis/CollectWgsMetricsTest.java | 30 ++- .../analysis/CollectWgsMetricsTestUtils.java | 113 ++++++++ .../java/picard/analysis/CounterManagerTest.java | 95 +++++++ .../analysis/FastWgsMetricsCollectorTest.java | 212 +++++++++++++++ .../analysis/WgsMetricsProcessorImplTest.java | 83 ++++++ 13 files changed, 1545 insertions(+), 144 deletions(-) create mode 100644 src/main/java/picard/analysis/AbstractWgsMetricsCollector.java create mode 100644 src/main/java/picard/analysis/CounterManager.java create mode 100644 src/main/java/picard/analysis/FastWgsMetricsCollector.java create mode 100644 src/main/java/picard/analysis/WgsMetricsProcessor.java create mode 100644 src/main/java/picard/analysis/WgsMetricsProcessorImpl.java create mode 100644 src/test/java/picard/analysis/AbstractWgsMetricsCollectorTest.java create mode 100644 src/test/java/picard/analysis/CounterManagerTest.java create mode 100644 src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java create mode 100644 src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java diff --git a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java new file mode 100644 index 000000000..422db6346 --- /dev/null +++ 
b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java @@ -0,0 +1,293 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +package picard.analysis; + +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.AbstractLocusInfo; +import htsjdk.samtools.util.AbstractRecordAndOffset; +import htsjdk.samtools.util.Histogram; +import htsjdk.samtools.util.IntervalList; +import htsjdk.samtools.util.QualityUtil; +import htsjdk.samtools.util.SequenceUtil; +import picard.filter.CountingFilter; +import picard.filter.CountingPairedFilter; +import picard.util.MathUtil; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +/** + * Class for collecting data on reference coverage, base qualities and excluded bases from one AbstractLocusInfo object for + * CollectWgsMetrics. + *

+ * The shared code for forming result for CollectWgsMetrics is abstracted into this class. + * Classes that extend this collector implement their logic in addInfo() method. + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. + */ + +public abstract class AbstractWgsMetricsCollector { + + /** + * The source CollectWgsMetrics object + */ + final CollectWgsMetrics collectWgsMetrics; + /** Count of sites with a given depth of coverage. Includes all but quality 2 bases. + * We draw depths from this histogram when we calculate the theoretical het sensitivity. + */ + protected final long[] unfilteredDepthHistogramArray; + /** Count of bases observed with a given base quality. Includes all but quality 2 bases. + * We draw bases from this histogram when we calculate the theoretical het sensitivity. + */ + protected final long[] unfilteredBaseQHistogramArray; + /** + * Count of sites with a given depth of coverage. + * Excludes bases with quality below MINIMUM_BASE_QUALITY (default 20). + */ + protected final long[] highQualityDepthHistogramArray; + /** + * Number of aligned bases that were filtered out because they were of low base quality (default is < 20). + */ + long basesExcludedByBaseq = 0; + /** + * Number of aligned bases that were filtered out because they were the second observation from an insert with overlapping reads. + */ + long basesExcludedByOverlap = 0; + /** + * Number of aligned bases that were filtered out because they would have raised coverage above the capped value (default cap = 250x). + */ + long basesExcludedByCapping = 0; + /** + * Positions with coverage exceeding this value are treated as if they had coverage at this value + */ + protected final int coverageCap; + + protected final IntervalList intervals; + /** + * This value indicates that processing will stop after the number of genomic bases specified in the STOP_AFTER option.
+ */ + private final boolean usingStopAfter; + /** + * The number of processed genomic bases + */ + protected long counter = 0; + + ReadNamesCollectionFactory collectionFactory = new ReadNamesCollectionFactoryImpl(); + /** + * Creates a collector and initializes the inner data structures + * + * @param collectWgsMetrics CollectWgsMetrics, that creates this collector + * @param coverageCap coverage cap + */ + AbstractWgsMetricsCollector(CollectWgsMetrics collectWgsMetrics, final int coverageCap, final IntervalList intervals) { + if (coverageCap <= 0) { + throw new IllegalArgumentException("Coverage cap must be positive."); + } + this.collectWgsMetrics = collectWgsMetrics; + unfilteredDepthHistogramArray = new long[coverageCap + 1]; + highQualityDepthHistogramArray = new long[coverageCap + 1]; + unfilteredBaseQHistogramArray = new long[Byte.MAX_VALUE]; + this.coverageCap = coverageCap; + this.intervals = intervals; + this.usingStopAfter = collectWgsMetrics.STOP_AFTER > 0; + } + + /** + * Accumulates the data from AbstractLocusInfo in inner structures + * @param info AbstractLocusInfo with aligned to reference position reads + * @param ref ReferenceSequence + * @param referenceBaseN true if current the value of reference base represents a no call + */ + public abstract void addInfo(final AbstractLocusInfo info, final ReferenceSequence ref, boolean referenceBaseN); + + /** + * Adds collected metrics and depth histogram to file + * + * @param file MetricsFile for result of collector's work + * @param dupeFilter counting filter for duplicate reads + * @param mapqFilter counting filter for mapping quality + * @param pairFilter counting filter for reads without a mapped mate pair + */ + public void addToMetricsFile(final MetricsFile file, + final boolean includeBQHistogram, + final CountingFilter dupeFilter, + final CountingFilter mapqFilter, + final CountingPairedFilter pairFilter) { + final CollectWgsMetrics.WgsMetrics + metrics = getMetrics(dupeFilter, mapqFilter, 
pairFilter); + + // add them to the file + file.addMetric(metrics); + file.addHistogram(getHighQualityDepthHistogram()); + if (includeBQHistogram) addBaseQHistogram(file); + } + + protected void addBaseQHistogram(final MetricsFile file) { + file.addHistogram(getUnfilteredBaseQHistogram()); + } + + protected Histogram getHighQualityDepthHistogram() { + return getHistogram(highQualityDepthHistogramArray, "coverage", "high_quality_coverage_count"); + } + + protected Histogram getUnfilteredDepthHistogram(){ + return getHistogram(unfilteredDepthHistogramArray, "coverage", "unfiltered_coverage_count"); + } + + protected Histogram getUnfilteredBaseQHistogram() { + return getHistogram(unfilteredBaseQHistogramArray, "baseq", "unfiltered_baseq_count"); + } + + protected Histogram getHistogram(final long[] array, final String binLabel, final String valueLabel) { + final Histogram histogram = new Histogram<>(binLabel, valueLabel); + for (int i = 0; i < array.length; ++i) { + histogram.increment(i, array[i]); + } + return histogram; + } + + /** + * Creates CollectWgsMetrics.WgsMetrics - the object holding the result of CollectWgsMetrics + * + * @param dupeFilter counting filter for duplicate reads + * @param mapqFilter counting filter for mapping quality + * @param pairFilter counting filter for reads without a mapped mate pair + * @return CollectWgsMetrics.WgsMetrics with set fields + */ + protected CollectWgsMetrics.WgsMetrics getMetrics(final CountingFilter dupeFilter, + final CountingFilter mapqFilter, + final CountingPairedFilter pairFilter) { + return collectWgsMetrics.generateWgsMetrics( + this.intervals, + getHighQualityDepthHistogram(), + getUnfilteredDepthHistogram(), + collectWgsMetrics.getBasesExcludedBy(mapqFilter), + collectWgsMetrics.getBasesExcludedBy(dupeFilter), + collectWgsMetrics.getBasesExcludedBy(pairFilter), + basesExcludedByBaseq, + basesExcludedByOverlap, + basesExcludedByCapping, + coverageCap, + getUnfilteredBaseQHistogram(), + 
collectWgsMetrics.SAMPLE_SIZE + ); + } + + /** + * @return true, of number of processed loci exceeded the threshold, otherwise false + */ + boolean isTimeToStop(final long processedLoci) { + return usingStopAfter && processedLoci > collectWgsMetrics.STOP_AFTER - 1; + } + + /** + * Sets the counter to the current number of processed loci. Counter, must be updated + * from outside, since we are skipping a no call reference positions outside of the collector + * + * @param counter number of processed loci + */ + public void setCounter(long counter) { + this.counter = counter; + } + + /** + * Checks if reference base at given position is unknown. + * + * @param position to check the base + * @param ref reference sequence + * @return true if reference base at position represents a no call, otherwise false + */ + boolean isReferenceBaseN(final int position, final ReferenceSequence ref) { + final byte base = ref.getBases()[position - 1]; + return SequenceUtil.isNoCall(base); + } + + public void setCollectionFactory(ReadNamesCollectionFactory collectionFactory) { + this.collectionFactory = collectionFactory; + } + + interface ReadNamesCollection { + boolean add(String readName); + Set put(String readName, Set recordsAndOffsetsForName); + Set get(String readName); + Set remove(String readName); + int size(); + void clear(); + } + + interface ReadNamesCollectionFactory { + ReadNamesCollection createCollection(); + } + + private static class ReadNamesCollectionFactoryImpl implements ReadNamesCollectionFactory { + + @Override + public ReadNamesCollection createCollection() { + return new ReadNamesCollectionImpl(); + } + } + + private static class ReadNamesCollectionImpl implements ReadNamesCollection { + + private Map> readNames; + private final Set dummySet = new HashSet<>(); + + ReadNamesCollectionImpl() { + this.readNames = new HashMap<>(); + } + + @Override + public boolean add(String readName) { + return readNames.put(readName, dummySet) == null; + } + + @Override + 
public Set put(String readName, Set recordsAndOffsetsForName) { + return readNames.put(readName, recordsAndOffsetsForName); + } + + @Override + public Set get(String readName) { + return readNames.get(readName); + } + + @Override + public Set remove(String readName) { + return readNames.remove(readName); + } + + @Override + public int size() { + return readNames.size(); + } + + @Override + public void clear() { + readNames.clear(); + } + } +} diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index ab7df819f..6418c7a88 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -56,7 +56,9 @@ /** * Computes a number of metrics that are useful for evaluating coverage and performance of whole genome sequencing experiments. - * + * Two algorithms are available for this metrics: default and fast. The fast algorithm is enabled by USE_FAST_ALGORITHM option. + * The fast algorithm works better for regions of BAM file with coverage at least 10 reads per locus, + * for lower coverage the algorithms perform the same. * @author tfennell */ @CommandLineProgramProperties( @@ -118,6 +120,12 @@ @Option(doc="Sample Size used for Theoretical Het Sensitivity sampling. Default is 10000.", optional = true) public int SAMPLE_SIZE=10000; + @Option(doc = "If true, fast algorithm is used") + public boolean USE_FAST_ALGORITHM = false; + + @Option(doc = "Average read length in the file. Default is 150.", optional = true) + public int READ_LENGTH = 150; + @Option(doc = "An interval list file that contains the positions to restrict the assessment. Please note that " + "all bases of reads that overlap these intervals will be considered, even if some of those bases extend beyond the boundaries of " + "the interval. The ideal use case for this argument is to use it to restrict the calculation to a subset of (whole) contigs. 
To " + @@ -437,7 +445,7 @@ protected int doWork() { final ProgressLogger progress = new ProgressLogger(log, 10000000, "Processed", "loci"); final ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); final SamReader in = getSamReader(); - final SamLocusIterator iterator = getLocusIterator(in); + final AbstractLocusIterator iterator = getLocusIterator(in); final List filters = new ArrayList<>(); final CountingFilter mapqFilter = new CountingMapQFilter(MINIMUM_MAPPING_QUALITY); @@ -451,49 +459,26 @@ protected int doWork() { filters.add(pairFilter); } iterator.setSamFilters(filters); - iterator.setEmitUncoveredLoci(true); iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases - iterator.setQualityScoreCutoff(0); // Handled separately because we want to count bases iterator.setIncludeNonPfReads(false); - iterator.setMaxReadsToAccumulatePerLocus(LOCUS_ACCUMULATION_CAP); - - final WgsMetricsCollector collector = getCollector(COVERAGE_CAP, getIntervalsToExamine()); - - final boolean usingStopAfter = STOP_AFTER > 0; - final long stopAfter = STOP_AFTER - 1; - long counter = 0; - // Loop through all the loci - while (iterator.hasNext()) { - final SamLocusIterator.LocusInfo info = iterator.next(); - final ReferenceSequence ref = refWalker.get(info.getSequenceIndex()); - - // Check that the reference is not N - final byte base = ref.getBases()[info.getPosition() - 1]; - if (SequenceUtil.isNoCall(base)) continue; - - // add to the collector - collector.addInfo(info); - - // Record progress and perhaps stop - progress.record(info.getSequenceName(), info.getPosition()); - if (usingStopAfter && ++counter > stopAfter) break; - } - - // check that we added the same number of bases to the raw coverage histogram and the base quality histograms - final long sumBaseQ= Arrays.stream(collector.unfilteredBaseQHistogramArray).sum(); - final long sumDepthHisto = LongStream.rangeClosed(0, 
collector.coverageCap).map(i -> (i * collector.unfilteredDepthHistogramArray[(int) i])).sum(); - if (sumBaseQ != sumDepthHisto) { - log.error("Coverage and baseQ distributions contain different amount of bases!"); - } + final AbstractWgsMetricsCollector collector = getCollector(COVERAGE_CAP, getIntervalsToExamine()); + final WgsMetricsProcessor processor = getWgsMetricsProcessor(progress, refWalker, iterator, collector); + processor.processFile(); final MetricsFile out = getMetricsFile(); - collector.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, mapqFilter, pairFilter); + processor.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, mapqFilter, pairFilter); out.write(OUTPUT); return 0; } + private WgsMetricsProcessorImpl getWgsMetricsProcessor( + ProgressLogger progress, ReferenceSequenceFileWalker refWalker, + AbstractLocusIterator> iterator, AbstractWgsMetricsCollector collector) { + return new WgsMetricsProcessorImpl<>(iterator, refWalker, collector, progress); + } + /** Gets the intervals over which we will calculate metrics. */ protected IntervalList getIntervalsToExamine() { final IntervalList intervals; @@ -547,7 +532,7 @@ protected WgsMetrics generateWgsMetrics(final IntervalList intervals, ); } - private WgsMetrics generateWgsMetrics(final IntervalList intervals, + WgsMetrics generateWgsMetrics(final IntervalList intervals, final Histogram highQualityDepthHistogram, final Histogram unfilteredDepthHistogram, final long basesExcludedByMapq, @@ -596,48 +581,49 @@ protected long getBasesExcludedBy(final CountingFilter filter) { return filter.getFilteredBases(); } - protected SamLocusIterator getLocusIterator(final SamReader in) { - return (INTERVALS != null) ? new SamLocusIterator(in, IntervalList.fromFile(INTERVALS)) : new SamLocusIterator(in); + /** + * Creates AbstractLocusIterator implementation according to USE_FAST_ALGORITHM value. 
+ * + * @param in inner SAMReader + * @return if USE_FAST_ALGORITHM is enabled, returns EdgeReadIterator implementation, + * otherwise default algorithm is used and SamLocusIterator is returned. + */ + protected AbstractLocusIterator getLocusIterator(final SamReader in) { + if (USE_FAST_ALGORITHM) { + return (INTERVALS != null) ? new EdgeReadIterator(in, IntervalList.fromFile(INTERVALS)) : new EdgeReadIterator(in); + } + SamLocusIterator iterator = (INTERVALS != null) ? new SamLocusIterator(in, IntervalList.fromFile(INTERVALS)) : new SamLocusIterator(in); + iterator.setMaxReadsToAccumulatePerLocus(LOCUS_ACCUMULATION_CAP); + iterator.setEmitUncoveredLoci(true); + iterator.setQualityScoreCutoff(0); + return iterator; } /** + * Creates AbstractWgsMetricsCollector implementation according to USE_FAST_ALGORITHM value. + * * @param coverageCap the maximum depth/coverage to consider. * @param intervals the intervals over which metrics are collected. - * @return + * @return if USE_FAST_ALGORITHM is enabled, returns FastWgsMetricsCollector implementation, + * otherwise default algorithm is used and WgsMetricsCollector is returned. */ - protected WgsMetricsCollector getCollector(final int coverageCap, final IntervalList intervals) { - return new WgsMetricsCollector(coverageCap, intervals); + protected AbstractWgsMetricsCollector getCollector(final int coverageCap, final IntervalList intervals) { + return USE_FAST_ALGORITHM ? new FastWgsMetricsCollector(this, coverageCap, intervals) : + new WgsMetricsCollector(this, coverageCap, intervals); } - protected class WgsMetricsCollector { + protected static class WgsMetricsCollector extends AbstractWgsMetricsCollector { - // Count of sites with a given depth of coverage. Includes all but quality 2 bases. - // We draw depths from this histogram when we calculate the theoretical het sensitivity. - protected final long[] unfilteredDepthHistogramArray; - - // Count of bases observed with a given base quality. 
Includes all but quality 2 bases. - // We draw bases from this histogram when we calculate the theoretical het sensitivity. - protected final long[] unfilteredBaseQHistogramArray; - - // Count of sites with a given depth of coverage. Excludes bases with quality below MINIMUM_BASE_QUALITY (default 20). - protected final long[] highQualityDepthHistogramArray; - - private long basesExcludedByBaseq = 0; - private long basesExcludedByOverlap = 0; - private long basesExcludedByCapping = 0; - protected final IntervalList intervals; - protected final int coverageCap; - - public WgsMetricsCollector(final int coverageCap, final IntervalList intervals) { - unfilteredDepthHistogramArray = new long[coverageCap + 1]; - highQualityDepthHistogramArray = new long[coverageCap + 1]; - unfilteredBaseQHistogramArray = new long[Byte.MAX_VALUE]; - this.coverageCap = coverageCap; - this.intervals = intervals; + public WgsMetricsCollector(final CollectWgsMetrics metrics, final int coverageCap, final IntervalList intervals) { + super(metrics, coverageCap, intervals); } - public void addInfo(final SamLocusIterator.LocusInfo info) { + @Override + public void addInfo(final AbstractLocusInfo info, final ReferenceSequence ref, boolean referenceBaseN) { + if (referenceBaseN) { + return; + } // Figure out the coverage while not counting overlapping reads twice, and excluding various things final HashSet readNames = new HashSet<>(info.getRecordAndOffsets().size()); int pileupSize = 0; @@ -653,78 +639,15 @@ public void addInfo(final SamLocusIterator.LocusInfo info) { unfilteredDepth++; } - if (recs.getBaseQuality() < MINIMUM_BASE_QUALITY || + if (recs.getBaseQuality() < collectWgsMetrics.MINIMUM_BASE_QUALITY || SequenceUtil.isNoCall(recs.getReadBase())) { ++basesExcludedByBaseq; continue; } if (!readNames.add(recs.getRecord().getReadName())) { ++basesExcludedByOverlap; continue; } - pileupSize++; } - final int highQualityDepth = Math.min(pileupSize, coverageCap); if (highQualityDepth < pileupSize) 
basesExcludedByCapping += pileupSize - coverageCap; highQualityDepthHistogramArray[highQualityDepth]++; unfilteredDepthHistogramArray[unfilteredDepth]++; } - - public void addToMetricsFile(final MetricsFile file, - final boolean includeBQHistogram, - final CountingFilter dupeFilter, - final CountingFilter mapqFilter, - final CountingPairedFilter pairFilter) { - final WgsMetrics metrics = getMetrics(dupeFilter, mapqFilter, pairFilter); - - // add them to the file - file.addMetric(metrics); - file.addHistogram(getHighQualityDepthHistogram()); - if (includeBQHistogram) addBaseQHistogram(file); - - - } - - protected void addBaseQHistogram(final MetricsFile file) { - file.addHistogram(getUnfilteredBaseQHistogram()); - } - - protected Histogram getHighQualityDepthHistogram() { - return getHistogram(highQualityDepthHistogramArray, "coverage", "high_quality_coverage_count"); - } - - protected Histogram getUnfilteredDepthHistogram(){ - return getHistogram(unfilteredDepthHistogramArray, "coverage", "unfiltered_coverage_count"); - } - - protected Histogram getUnfilteredBaseQHistogram() { - return getHistogram(unfilteredBaseQHistogramArray, "baseq", "unfiltered_baseq_count"); - } - - protected Histogram getHistogram(final long[] array, final String binLabel, final String valueLabel) { - final Histogram histogram = new Histogram<>(binLabel, valueLabel); - for (int i = 0; i < array.length; ++i) { - histogram.increment(i, array[i]); - } - return histogram; - } - - protected WgsMetrics getMetrics(final CountingFilter dupeFilter, - final CountingFilter mapqFilter, - final CountingPairedFilter pairFilter) { - return generateWgsMetrics( - this.intervals, - getHighQualityDepthHistogram(), - getUnfilteredDepthHistogram(), - getBasesExcludedBy(mapqFilter), - getBasesExcludedBy(dupeFilter), - getBasesExcludedBy(pairFilter), - basesExcludedByBaseq, - basesExcludedByOverlap, - basesExcludedByCapping, - coverageCap, - getUnfilteredBaseQHistogram(), - SAMPLE_SIZE - ); - } - - } } - diff 
--git a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java index 2c872ff43..857b65b51 100644 --- a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java +++ b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java @@ -125,7 +125,7 @@ protected int doWork() { // Initialize the SamReader, so the header is available prior to super.doWork, for getIntervalsToExamine call. */ getSamReader(); - this.collector = new WgsMetricsWithNonZeroCoverageCollector(COVERAGE_CAP, getIntervalsToExamine()); + this.collector = new WgsMetricsWithNonZeroCoverageCollector(this, COVERAGE_CAP, getIntervalsToExamine()); super.doWork(); final List readGroups = getSamFileHeader().getReadGroups(); @@ -187,8 +187,9 @@ protected WgsMetricsCollector getCollector(final int coverageCap, final Interval Histogram highQualityDepthHistogram; Histogram highQualityDepthHistogramNonZero; - public WgsMetricsWithNonZeroCoverageCollector(final int coverageCap, final IntervalList intervals) { - super(coverageCap, intervals); + public WgsMetricsWithNonZeroCoverageCollector(final CollectWgsMetricsWithNonZeroCoverage metrics, + final int coverageCap, final IntervalList intervals) { + super(metrics, coverageCap, intervals); } @Override @@ -240,4 +241,4 @@ public boolean areHistogramsEmpty() { } } -} \ No newline at end of file +} diff --git a/src/main/java/picard/analysis/CounterManager.java b/src/main/java/picard/analysis/CounterManager.java new file mode 100644 index 000000000..44835613e --- /dev/null +++ b/src/main/java/picard/analysis/CounterManager.java @@ -0,0 +1,189 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights 
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Class for managing a list of Counters of integer, + * provides methods to access data from Counters with respect to an offset. + * Each Counter represents a certain region of a processed sequence starting from offset and + * can accumulate some value for each locus of this sequence, for instance, read coverage. + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. + */ + +public class CounterManager { + + /** + * Length of inner Counter arrays + */ + private final int arrayLength; + /** + * Proposed length of processed reads + */ + private final int readLength; + /** + * Current offset from the start of the processed sequence. If offset = 100, this means, that value in + * Counter arrays at index 0, represents the 100th position of the sequence. 
+ */ + private int offset = 0; + /** + * List of Counter arrays for managing + */ + private final ArrayList arrays = new ArrayList<>(2); + + /** + * Constructor creates new CounterManager without any Counters, + * counters are added to CounterManager via newCounter() method + * + * @param arrayLength length of inner Counter arrays + * @param readLength proposed length of processed reads + */ + public CounterManager(final int arrayLength, int readLength) { + this.arrayLength = arrayLength; + this.readLength = readLength; + } + + /** + * Method added for use in unit tests + * + * @return offset from the reference sequence, that is represented by index 0 of Counter arrays. + */ + int getOffset() { + return offset; + } + + /** + * Method added for use in unit tests + * + * @param offset from the reference sequence, that will be represented by index 0 of Counter arrays. + */ + void setOffset(int offset) { + this.offset = offset; + } + + /** + * Method checks that new locus position is not out of bounds of Counter arrays and there is enough + * space in them to hold information on at least one more read of length readLength. + * If there is no free space, but there is accumulated information in Counter arrays after new locus position + * that we may need, the arrays are rebased, so that 0 index of arrays represents new locus position. + * In other case, we just clear the arrays. + * + * @param locusPosition position in the reference sequence, + * that will be represented by index 0 of Counter arrays. + */ + public void checkOutOfBounds(int locusPosition) { + if (locusPosition - offset + readLength >= arrayLength) { + if (locusPosition - offset < arrayLength) { + rebase(locusPosition); + } else { + clear(); + offset = locusPosition; + } + } + } + + /** + * Rebases inner Counter arrays so that 0 index of array represents the new locus position + * + * @param locusPosition position in the reference sequence, + * that will be represented by index 0 of Counter arrays. 
+ */ + private void rebase(int locusPosition) { + for (Counter counter : arrays) { + int[] array = counter.array; + System.arraycopy(array, locusPosition - offset, array, 0, arrayLength - locusPosition + offset); + Arrays.fill(array, arrayLength - locusPosition + offset, arrayLength, 0); + } + offset = locusPosition; + } + + /** + * Clears all inner Counter arrays + */ + public void clear() { + for (Counter counter : arrays) { + int[] array = counter.array; + Arrays.fill(array, 0); + } + offset = 0; + } + + /** + * Creates a new Counter object and adds it to the list of managed Counters. + * + * @return Counter, that will be managed by current CounterManager + */ + public Counter newCounter() { + Counter counter = new Counter(arrayLength); + arrays.add(counter); + return counter; + } + + /** + * Class represents an integer array with methods to increment and get the values form it with respect + * to offset of outer CounterManager. + */ + public class Counter { + + /** + * Inner array to store accumulated values + */ + private int[] array; + + /** + * @param arrayLength length of the inner array + */ + private Counter(int arrayLength) { + array = new int[arrayLength]; + } + + /** + * Increments value corresponding to reference sequence index + * + * @param index in the reference sequence + */ + public void inc(int index) { + checkIndex(index); + array[index - offset]++; + } + + /** + * @param index in the reference sequence + * @return value from the inner array, corresponding to input reference index + */ + public int get(int index) { + checkIndex(index); + return array[index - offset]; + } + + private void checkIndex(int index) { + if ((index - offset) < 0 || (index - offset) >= array.length) { + throw new ArrayIndexOutOfBoundsException("The requested index is out of counter bounds."); + } + } + } +} diff --git a/src/main/java/picard/analysis/FastWgsMetricsCollector.java b/src/main/java/picard/analysis/FastWgsMetricsCollector.java new file mode 100644 index 
000000000..9142f6d7d --- /dev/null +++ b/src/main/java/picard/analysis/FastWgsMetricsCollector.java @@ -0,0 +1,192 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.AbstractLocusInfo; +import htsjdk.samtools.util.EdgingRecordAndOffset; +import htsjdk.samtools.util.IntervalList; +import htsjdk.samtools.util.SequenceUtil; + +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; + +/** + * Class represents fast algorithm for collecting data from AbstractLocusInfo + * with a list of aligned EdgingRecordAndOffset objects. According to the algorithm + * we receive only two EdgingRecordAndOffset objects for each alignment block of a read: + * one for the start of block and one for the end. 
When meeting a EdgingRecordAndOffset + * with type BEGIN, all information from the alignment block is accumulated in the collector, + * read name is added to a map holding the names of processed reads for detecting overlapping positions. + * When meeting a EdgingRecordAndOffset with type END, + * the read name is removed from the map with names of processed reads. + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. + */ + +public class FastWgsMetricsCollector extends AbstractWgsMetricsCollector { + + /** + * Index of the sequence of the previous processed AbstractLocusInfo + */ + private int previousSequenceIndex; + + /** + * Manager for pileupSize Counter. + */ + private final CounterManager counterManager; + /** + * Counter, accumulating the data on coverage for the calculation of base quality histogram. + */ + private final CounterManager.Counter pileupSize; + /** + * Counter, accumulating the data on unfiltered coverage (includes all but quality 2 bases) + * for the calculation of theoretical het sensitivity. + */ + private final CounterManager.Counter unfilteredDepthSize; + + /** + * Map, holding information on currently processed reads, that possibly have overlapping regions. + */ + private ReadNamesCollection readsNames; + + /** + * Determines the size of created Counter objects. The bigger Counter objects + * are created, the less rebasing in Counter will occur. 
+ */ + private final int ARRAY_SIZE_PER_READ_LENGTH = 2000; + + /** + * Creates a collector and initializes the inner data structures + * + * @param collectWgsMetrics CollectWgsMetrics, that creates this collector + * @param coverageCap coverage cap + */ + public FastWgsMetricsCollector(CollectWgsMetrics collectWgsMetrics, int coverageCap, final IntervalList intervals) { + super(collectWgsMetrics, coverageCap, intervals); + this.previousSequenceIndex = -1; + this.counterManager = new CounterManager(collectWgsMetrics.READ_LENGTH * ARRAY_SIZE_PER_READ_LENGTH, collectWgsMetrics.READ_LENGTH); + this.pileupSize = counterManager.newCounter(); + this.unfilteredDepthSize = counterManager.newCounter(); + } + + @Override + public void addInfo(final AbstractLocusInfo info, final ReferenceSequence ref, boolean referenceBaseN) { + prepareCollector(info); + for (final EdgingRecordAndOffset record : info.getRecordAndOffsets()) { + final String readName = record.getReadName(); + Optional> recordsAndOffsetsForName = Optional.ofNullable((Set)readsNames.get(readName)); + if (record.getType() == EdgingRecordAndOffset.Type.BEGIN) { + processRecord(info.getPosition(), ref, record, recordsAndOffsetsForName.orElse(new HashSet<>())); + } else { + recordsAndOffsetsForName.ifPresent( + edgingRecordAndOffsets -> removeRecordFromMap(record, + edgingRecordAndOffsets)); + } + } + if (!referenceBaseN) { + final int readNamesSize = pileupSize.get(info.getPosition()); + final int highQualityDepth = Math.min(readNamesSize, coverageCap); + if (highQualityDepth < readNamesSize) { + basesExcludedByCapping += readNamesSize - coverageCap; + } + highQualityDepthHistogramArray[highQualityDepth]++; + unfilteredDepthHistogramArray[unfilteredDepthSize.get(info.getPosition())]++; + } + } + + private void processRecord(int position, ReferenceSequence ref, EdgingRecordAndOffset record, Set recordsAndOffsetsForName) { + long processedLoci = counter; + readsNames.put(record.getReadName(), 
recordsAndOffsetsForName); + final byte[] qualities = record.getBaseQualities(); + final byte[] bases = record.getRecord().getReadBases(); + for (int i = 0; i < record.getLength(); i++) { + final int index = i + position; + if (isReferenceBaseN(index, ref)) { + continue; + } + final byte quality = qualities[i + record.getOffset()]; + if (quality <= 2) { + basesExcludedByBaseq++; + } else { + if (unfilteredDepthSize.get(index) < coverageCap) { + unfilteredBaseQHistogramArray[quality]++; + unfilteredDepthSize.inc(index); + } + if (quality < collectWgsMetrics.MINIMUM_BASE_QUALITY || SequenceUtil.isNoCall(bases[i + record.getOffset()])){ + basesExcludedByBaseq++; + } else { + final int bsq = excludeByQuality(recordsAndOffsetsForName, index); + if (recordsAndOffsetsForName.size() - bsq > 0) { + basesExcludedByOverlap++; + } else { + pileupSize.inc(index); + } + } + } + if (isTimeToStop(++processedLoci)) { + break; + } + } + recordsAndOffsetsForName.add(record); + } + + private void removeRecordFromMap(EdgingRecordAndOffset record, Set recordsAndOffsetsForName) { + if (recordsAndOffsetsForName.size() == 1) { + readsNames.remove(record.getReadName()); + } else { + recordsAndOffsetsForName.remove(record.getStart()); + } + } + + /** + * Prepares the accumulator objects to process a new AbstractLocusInfo. + * If we switch to a new sequence, all accumulators are cleared. 
+ * + * @param info the next AbstractLocusInfo to process + */ + private void prepareCollector(AbstractLocusInfo info) { + if (readsNames == null) { + readsNames = collectionFactory.createCollection(); + } + if (previousSequenceIndex != info.getSequenceIndex()) { + readsNames.clear(); + counterManager.clear(); + previousSequenceIndex = info.getSequenceIndex(); + } + counterManager.checkOutOfBounds(info.getPosition()); + } + + private int excludeByQuality(final Set setForName, int position) { + int bsq = 0; + for (EdgingRecordAndOffset recordAndOffset : setForName) { + if (position - recordAndOffset.getRefPos() >= recordAndOffset.getLength() + || recordAndOffset.getBaseQuality(position) < collectWgsMetrics.MINIMUM_BASE_QUALITY) { + bsq++; + } + } + return bsq; + } +} diff --git a/src/main/java/picard/analysis/WgsMetricsProcessor.java b/src/main/java/picard/analysis/WgsMetricsProcessor.java new file mode 100644 index 000000000..c4415c56d --- /dev/null +++ b/src/main/java/picard/analysis/WgsMetricsProcessor.java @@ -0,0 +1,57 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + + +import htsjdk.samtools.metrics.MetricsFile; +import picard.filter.CountingFilter; +import picard.filter.CountingPairedFilter; + +/** + * Interface for processing data and generate result for CollectWgsMetrics + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. + */ +public interface WgsMetricsProcessor { + + /** + * Method processes the input data and accumulates result data + */ + void processFile(); + + /** + * Adds result metric's data to input file + * + * @param file MetricsFile for result of collector's work + * @param includeBQHistogram include base quality histogram + * @param dupeFilter counting filter for duplicate reads + * @param mapqFilter counting filter for mapping quality + * @param pairFilter counting filter for reads without a mapped mate pair + */ + void addToMetricsFile(final MetricsFile file, + final boolean includeBQHistogram, + final CountingFilter dupeFilter, + final CountingFilter mapqFilter, + final CountingPairedFilter pairFilter); +} diff --git a/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java b/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java new file mode 100644 index 000000000..7c9b13733 --- /dev/null +++ b/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java @@ -0,0 +1,126 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and 
to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.analysis; + +import htsjdk.samtools.metrics.MetricsFile; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFileWalker; +import htsjdk.samtools.util.*; +import picard.PicardException; +import picard.filter.CountingFilter; +import picard.filter.CountingPairedFilter; + +import java.util.Arrays; +import java.util.stream.LongStream; + +/** + * Implementation of WgsMetricsProcessor that gets input data from a given iterator + * and processes it with a help of collector + * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. 
+ */ +public class WgsMetricsProcessorImpl implements WgsMetricsProcessor { + /** + * Source of input data + */ + private final AbstractLocusIterator> iterator; + /** + * Accumulates the data from iterator + */ + private final AbstractWgsMetricsCollector collector; + /** + * ReferenceWalker for a processed reference sequence + */ + private final ReferenceSequenceFileWalker refWalker; + /** + * Logger for the progress of work + */ + private final ProgressLogger progress; + + private final Log log = Log.getInstance(WgsMetricsProcessorImpl.class); + + /** + * @param iterator input AbstractLocusIterator + * @param refWalker over processed reference file + * @param collector input AbstractWgsMetricsCollector + * @param progress logger + */ + public WgsMetricsProcessorImpl(AbstractLocusIterator> iterator, + ReferenceSequenceFileWalker refWalker, + AbstractWgsMetricsCollector collector, + ProgressLogger progress) { + this.iterator = iterator; + this.collector = collector; + this.refWalker = refWalker; + this.progress = progress; + } + + /** + * Method gets the data from iterator for each locus and processes it with the help of collector. 
+ */ + @Override + public void processFile() { + long counter = 0; + + while (iterator.hasNext()) { + final AbstractLocusInfo info = iterator.next(); + final ReferenceSequence ref = refWalker.get(info.getSequenceIndex()); + boolean referenceBaseN = collector.isReferenceBaseN(info.getPosition(), ref); + collector.addInfo(info, ref, referenceBaseN); + if (referenceBaseN) { + continue; + } + + progress.record(info.getSequenceName(), info.getPosition()); + if (collector.isTimeToStop(++counter)) { + break; + } + collector.setCounter(counter); + } + // check that we added the same number of bases to the raw coverage histogram and the base quality histograms + final long sumBaseQ= Arrays.stream(collector.unfilteredBaseQHistogramArray).sum(); + final long sumDepthHisto = LongStream.rangeClosed(0, collector.coverageCap).map(i -> (i * collector.unfilteredDepthHistogramArray[(int) i])).sum(); + if (sumBaseQ != sumDepthHisto) { + log.error("Coverage and baseQ distributions contain different amount of bases!"); + } + } + + /** + * Adds result metric's data to input file + * + * @param file MetricsFile for result of collector's work + * @param includeBQHistogram include base quality histogram + * @param dupeFilter counting filter for duplicate reads + * @param mapqFilter counting filter for mapping quality + * @param pairFilter counting filter for reads without a mapped mate pair + */ + @Override + public void addToMetricsFile(MetricsFile file, + boolean includeBQHistogram, + CountingFilter dupeFilter, + CountingFilter mapqFilter, + CountingPairedFilter pairFilter) { + collector.addToMetricsFile(file, includeBQHistogram, dupeFilter, mapqFilter, pairFilter); + } +} diff --git a/src/test/java/picard/analysis/AbstractWgsMetricsCollectorTest.java b/src/test/java/picard/analysis/AbstractWgsMetricsCollectorTest.java new file mode 100644 index 000000000..e2bcc4f8f --- /dev/null +++ b/src/test/java/picard/analysis/AbstractWgsMetricsCollectorTest.java @@ -0,0 +1,107 @@ +/* + * The MIT 
License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +package picard.analysis; + +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.AbstractLocusInfo; +import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertFalse; +import static picard.analysis.CollectWgsMetricsTestUtils.createIntervalList; + +public class AbstractWgsMetricsCollectorTest { + + @Test + public void testForCollectorWithoutData(){ + long[] templateQualHistogram = new long[127]; + long[] templateHistogramArray = new long[11]; + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + AbstractWgsMetricsCollector collector = new AbstractWgsMetricsCollector(collectWgsMetrics, + 10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + assertEquals(templateHistogramArray, collector.highQualityDepthHistogramArray); + assertEquals(templateQualHistogram, collector.unfilteredBaseQHistogramArray); + assertEquals(0, collector.basesExcludedByCapping); + assertEquals(0, collector.basesExcludedByOverlap); + assertEquals(0, collector.basesExcludedByBaseq); + } + + @Test (expectedExceptions = IllegalArgumentException.class) + public void testForExceptionWithNegativeCoverage(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + new AbstractWgsMetricsCollector(collectWgsMetrics, -10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + } + + @Test + public void testForSetCounter(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + AbstractWgsMetricsCollector collector = new AbstractWgsMetricsCollector(collectWgsMetrics, + 10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + long counter = 20; + 
collector.setCounter(counter); + assertEquals(20, collector.counter); + } + + @Test + public void testForStop(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + collectWgsMetrics.STOP_AFTER = 10; + AbstractWgsMetricsCollector collector = new AbstractWgsMetricsCollector(collectWgsMetrics, + 10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + assertTrue(collector.isTimeToStop(10)); + assertFalse(collector.isTimeToStop(2)); + } + + @Test + public void testForRefBaseN(){ + byte[] refBasis = {'A', 'C', 'C', 'T', 'A', 'N', 'G', 'T', 'N', 'N'}; + ReferenceSequence ref = new ReferenceSequence("test", 0, refBasis); + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + AbstractWgsMetricsCollector collector = new AbstractWgsMetricsCollector(collectWgsMetrics, + 10, createIntervalList()) { + @Override + public void addInfo(AbstractLocusInfo info, ReferenceSequence ref, boolean referenceBaseN) { + } + }; + assertTrue(collector.isReferenceBaseN(6, ref)); + assertFalse(collector.isReferenceBaseN(1, ref)); + } +} diff --git a/src/test/java/picard/analysis/CollectWgsMetricsTest.java b/src/test/java/picard/analysis/CollectWgsMetricsTest.java index b282d73db..071a1d33e 100644 --- a/src/test/java/picard/analysis/CollectWgsMetricsTest.java +++ b/src/test/java/picard/analysis/CollectWgsMetricsTest.java @@ -75,12 +75,14 @@ public String getCommandLineProgramName() { final String referenceFile = "testdata/picard/quality/chrM.reference.fasta"; return new Object[][] { - {tempSamFile, outfile, referenceFile} + {tempSamFile, outfile, referenceFile, "false"}, + {tempSamFile, outfile, referenceFile, "true"}, }; } @Test(dataProvider = "wgsDataProvider") - public void testMetricsFromWGS(final File input, final File outfile, final String referenceFile) throws IOException { + public void testMetricsFromWGS(final File input, final File outfile, final String referenceFile, + 
final String useFastAlgorithm) throws IOException { outfile.deleteOnExit(); final int sampleSize = 1000; @@ -88,7 +90,8 @@ public void testMetricsFromWGS(final File input, final File outfile, final Strin "INPUT=" + input.getAbsolutePath(), "OUTPUT=" + outfile.getAbsolutePath(), "REFERENCE_SEQUENCE=" + referenceFile, - "SAMPLE_SIZE=" + sampleSize + "SAMPLE_SIZE=" + sampleSize, + "USE_FAST_ALGORITHM=" + useFastAlgorithm }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -193,12 +196,19 @@ void setupBuilder() throws IOException { sorter.instanceMain(args); //create output files for tests - outfile = File.createTempFile("testWgsMetrics", ".txt"); + outfile = File.createTempFile("testWgsMetrics", ".txt", TEST_DIR); outfile.deleteOnExit(); } - @Test - public void testLargeIntervals() throws IOException { + @DataProvider(name = "wgsAlgorithm") + public Object[][] wgsAlgorithm() { + return new Object[][] { + {"false"}, + {"true"}, + }; + } + @Test(dataProvider = "wgsAlgorithm") + public void testLargeIntervals(final String useFastAlgorithm) throws IOException { final File input = new File(TEST_DIR, "forMetrics.sam"); final File outfile = File.createTempFile("test", ".wgs_metrics"); final File ref = new File(TEST_DIR, "merger.fasta"); @@ -225,8 +235,8 @@ public void testLargeIntervals() throws IOException { } } - @Test - public void testExclusions() throws IOException { + @Test(dataProvider = "wgsAlgorithm") + public void testExclusions(final String useFastAlgorithm) throws IOException { final File reference = new File("testdata/picard/sam/merger.fasta"); final File tempSamFile = File.createTempFile("CollectWgsMetrics", ".bam", TEST_DIR); tempSamFile.deleteOnExit(); @@ -289,8 +299,8 @@ public void testExclusions() throws IOException { Assert.assertEquals((long) highQualityDepthHistogram.get(3).getValue(), 2*10); } - @Test - public void testPoorQualityBases() throws IOException { + @Test(dataProvider = "wgsAlgorithm") + public void testPoorQualityBases(final String 
useFastAlgorithm) throws IOException { final File reference = new File("testdata/picard/quality/chrM.reference.fasta"); final File testSamFile = File.createTempFile("CollectWgsMetrics", ".bam", TEST_DIR); testSamFile.deleteOnExit(); diff --git a/src/test/java/picard/analysis/CollectWgsMetricsTestUtils.java b/src/test/java/picard/analysis/CollectWgsMetricsTestUtils.java index 468a3853f..e629cac6e 100644 --- a/src/test/java/picard/analysis/CollectWgsMetricsTestUtils.java +++ b/src/test/java/picard/analysis/CollectWgsMetricsTestUtils.java @@ -26,10 +26,24 @@ import htsjdk.samtools.*; +import htsjdk.samtools.filter.SamRecordFilter; +import htsjdk.samtools.filter.SecondaryAlignmentFilter; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileWalker; +import htsjdk.samtools.util.AbstractLocusIterator; +import htsjdk.samtools.util.EdgeReadIterator; +import htsjdk.samtools.util.IntervalList; import htsjdk.variant.utils.SAMSequenceDictionaryExtractor; +import picard.filter.CountingDuplicateFilter; +import picard.filter.CountingFilter; +import picard.filter.CountingMapQFilter; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.stream.IntStream; @@ -38,6 +52,14 @@ */ public class CollectWgsMetricsTestUtils { + + private static final String sqHeaderLN20 = "@HD\tSO:coordinate\tVN:1.0\n@SQ\tSN:chrM\tAS:HG18\tLN:20\n"; + private static final String s1 = "3851612\t16\tchrM\t1\t255\t3M2D10M\t*\t0\t0\tACCTACGTTCAAT\tDDDDDDDDDDDDD\n"; + private static final String sqHeaderLN100 = "@HD\tSO:coordinate\tVN:1.0\n@SQ\tSN:chrM\tAS:HG18\tLN:100\n"; + private static final String s2 = "3851612\t16\tchrM\t2\t255\t10M1D5M\t*\t0\t0\tCCTACGTTCAATATT\tDDDDDDDDDDDDDDD\n"; + static final String exampleSamOneRead = sqHeaderLN20+s1; + static final String exampleSamTwoReads = 
sqHeaderLN100+s1+s2; + protected static SAMRecordSetBuilder createTestSAMBuilder(final File reference, final String readGroupId, final String sample, @@ -96,6 +118,97 @@ private static void createTestSAM(String testSamName) throws IOException { writer.close(); } + static IntervalList createIntervalList() { + return new IntervalList(new SAMFileHeader()); + } + + static AbstractLocusIterator createReadEndsIterator(String exampleSam){ + final List filters = new ArrayList<>(); + final CountingFilter dupeFilter = new CountingDuplicateFilter(); + final CountingFilter mapqFilter = new CountingMapQFilter(0); + filters.add(new SecondaryAlignmentFilter()); // Not a counting filter because we never want to count reads twice + filters.add(mapqFilter); + filters.add(dupeFilter); + SamReader samReader = createSamReader(exampleSam); + AbstractLocusIterator iterator = new EdgeReadIterator(samReader); + iterator.setSamFilters(filters); + iterator.setMappingQualityScoreCutoff(0); // Handled separately because we want to count bases + iterator.setIncludeNonPfReads(false); + return iterator; + } + + static SamReader createSamReader(String samExample) { + ByteArrayInputStream inputStream = new ByteArrayInputStream(samExample.getBytes()); + return SamReaderFactory.makeDefault().open(SamInputResource.of(inputStream)); + } + + static ReferenceSequenceFileWalker getReferenceSequenceFileWalker(String referenceString){ + ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(referenceString); + return new ReferenceSequenceFileWalker(referenceSequenceFile); + } + + static ReferenceSequenceFileWalker getReferenceSequenceFileWalker(){ + String referenceString = ">ref\nACCTACGTTCAATATTCTTC"; + return getReferenceSequenceFileWalker(referenceString); + } + + static ReferenceSequenceFile createReferenceSequenceFile(String referenceString) { + final SAMSequenceRecord record = new SAMSequenceRecord("ref", referenceString.length()); + final SAMSequenceDictionary dictionary = new 
SAMSequenceDictionary(); + dictionary.addSequence(record); + return new ReferenceSequenceFile() { + + boolean done = false; + @Override + public SAMSequenceDictionary getSequenceDictionary() { + return dictionary; + } + + @Override + public ReferenceSequence nextSequence() { + if (!done) { + done = true; + return getSequence(record.getSequenceName()); + } + return null; + } + + @Override + public void reset() { + done=false; + } + + @Override + public boolean isIndexed() { + return false; + } + + @Override + public ReferenceSequence getSequence(String contig) { + if (contig.equals(record.getSequenceName())) { + return new ReferenceSequence(record.getSequenceName(), 0, referenceString.getBytes()); + } else { + return null; + } + } + + @Override + public ReferenceSequence getSubsequenceAt(String contig, long start, long stop) { + return null; + } + + @Override + public String toString() { + return null; + } + + @Override + public void close() throws IOException { + + } + }; + } + // call main (from IDE for example) to createTestSAM(), which creates a test SAM file public static void main(String[] args) { try { createTestSAM("TestSam"); } catch(IOException e) { ; } diff --git a/src/test/java/picard/analysis/CounterManagerTest.java b/src/test/java/picard/analysis/CounterManagerTest.java new file mode 100644 index 000000000..c96ef0eeb --- /dev/null +++ b/src/test/java/picard/analysis/CounterManagerTest.java @@ -0,0 +1,95 @@ +package picard.analysis; + +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +public class CounterManagerTest { + + private CounterManager testCounterManager; + private CounterManager.Counter testCounter; + private int arrayLength = 10; + private int readLength = 10; + private int OFFSET = 7; + private CounterManager secondTestCounterManager; + private CounterManager.Counter secondCounter; + + @BeforeTest + public void setUp() { + secondTestCounterManager = new CounterManager(arrayLength, 
readLength); + secondTestCounterManager.setOffset(OFFSET); + secondCounter = secondTestCounterManager.newCounter(); + testCounterManager = new CounterManager(arrayLength, readLength); + testCounterManager.setOffset(OFFSET); + testCounter = testCounterManager.newCounter(); + + for (int i = 0; i < arrayLength; i++) { + testCounter.inc(i + OFFSET); + secondCounter.inc(i + OFFSET); + } + testCounter.inc(OFFSET); + secondCounter.inc(OFFSET); + } + + @Test + public void testCounterInc() { + Assert.assertEquals(2, testCounter.get(OFFSET), "Test method inc:"); + } + + @Test + public void testForClearCounter() { + testCounterManager.clear(); + Assert.assertEquals(0, testCounter.get(OFFSET), "The value of the array with index 0 must be 0 after clear manager:"); + } + + @Test + public void testForCorrectOffsetAfterRebase() { + secondTestCounterManager.checkOutOfBounds(11); + Assert.assertEquals(11, secondTestCounterManager.getOffset(), "After rebase offset must be new int"); + } + + @Test + public void testForCorrectCounterAfterRebase() { + secondTestCounterManager.checkOutOfBounds(11); + Assert.assertEquals(1, secondCounter.get(11), "The value of the array with index 0 must be 1 after rebase:"); + } + + @Test + public void testForOutOfBoundCounter() { + secondTestCounterManager.checkOutOfBounds(44); + Assert.assertEquals(44, secondTestCounterManager.getOffset(), "New offset after clear must be 44:"); + } + + @Test + public void testForCleanCounterAfter() { + testCounterManager.checkOutOfBounds(88); + Assert.assertEquals(0, testCounter.get(88), "The value of the array with index 0 must be 1 after clean:"); + } + + @Test + public void testForCheckIncrement(){ + CounterManager testCounterManager = new CounterManager(arrayLength, readLength); + testCounterManager.setOffset(0); + CounterManager.Counter counter = testCounterManager.newCounter(); + for (int i=0; i<10; i++){ + counter.inc(1); + } + int[] testArray = new int[arrayLength]; + for (int i = 0; i< arrayLength; i++){ + 
testArray[i] = counter.get(i); + } + int[]templateArray = new int[arrayLength]; + templateArray[1] = 10; + Assert.assertEquals(templateArray, testArray); + } + + @Test(expectedExceptions = IndexOutOfBoundsException.class) + public void testForWrongIndexInInc(){ + testCounter.inc(40); + } + + @Test(expectedExceptions = IndexOutOfBoundsException.class) + public void testForWrongIndexInGet(){ + testCounter.get(40); + } +} diff --git a/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java b/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java new file mode 100644 index 000000000..884b09a23 --- /dev/null +++ b/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java @@ -0,0 +1,212 @@ +package picard.analysis; + + +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.AbstractLocusInfo; +import htsjdk.samtools.util.AbstractLocusIterator; +import htsjdk.samtools.util.EdgingRecordAndOffset; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static picard.analysis.CollectWgsMetricsTestUtils.*; + +public class FastWgsMetricsCollectorTest { + private byte[] highQualities = {30, 30, 30 ,30, 30, 30, 30, 30, 30, 30, 50, 50, 50 ,50, 50, 50, 50, 50, 50, 60, 60, 60, 70 ,70, 70, 80, 80, 90, 90, 90}; + private byte[] qualities = {2, 2, 3 ,3, 3, 4, 4, 4, 4, 1}; + private byte[] refBases = {'A', 'C', 'C', 'T', 'A', 'C', 'G', 'T', 'T', 'C', 'A', 'A', 'T', 'A', 'T', 'T', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'D' , 'G', 'T', 'C', 'D','A', 'G', 'T', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'T', 'T', 'C', 'G', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'D' , 'G', 'T', 'C', 'D','A', 'G', 'T', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'T', 'T', 'C', 'G', 'C', 'T', 'T', 'C', 'G', 'A', 'G', 'T', 'C', 'D' , 'G', 'T', 'C', 'D', 'G', 
'A', 'G', 'T', 'C', 'D' , 'G', 'T', 'C', 'D'}; + private SAMRecord record; + private SAMSequenceRecord sequence; + private SAMRecord secondRecord; + private SAMRecord thirdRecord; + private ReferenceSequence ref; + + @BeforeTest + public void setUp(){ + String referenceString = ">chrM\nACCTACGTTCAATATTCTTCACCTACGTTCAATATTCTTCACCTACGTTCAATATTCTTCACCTACGTTCAATATTCTTCACCTACGTTCAATATTCTTC"; + ref = new ReferenceSequence("chrM", 0, referenceString.getBytes()); + sequence = new SAMSequenceRecord("chrM", 100); + record = new SAMRecord(new SAMFileHeader()); + record.setReadName("test"); + record.setBaseQualities(qualities); + record.setReadBases(refBases); + secondRecord = generateRecord("test1"); + thirdRecord = generateRecord("test2"); + } + + private SAMRecord generateRecord(String name) { + SAMRecord record = new SAMRecord(new SAMFileHeader()); + record.setReadName(name); + record.setBaseQualities(highQualities); + record.setReadBases(refBases); + return record; + } + + @Test + public void testAddInfoForQuality() throws Exception { + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 10); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(record, 0, 10, 0); + firstInfo.add(record1); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(record, 0, 10, 0); + firstInfo.add(record2); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + assertEquals(20, collector.basesExcludedByBaseq); + } + + @Test + public void testAddInfoForOverlap(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + 
FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 5); + AbstractLocusInfo thirdInfo = new AbstractLocusInfo<>(sequence, 10); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 10, 0); + firstInfo.add(record1); + thirdInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 5, 5, 0); + secondInfo.add(record2); + thirdInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + collector.addInfo(thirdInfo, ref, false); + assertEquals(5, collector.basesExcludedByOverlap, "Excluded by overlap:"); + } + + @Test + public void testAddInfoForCapping(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 1, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 10); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 10, 0); + firstInfo.add(record1); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 10, 0); + firstInfo.add(record2); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + assertEquals(1, collector.basesExcludedByCapping, "Excluded by capping:"); + } + + @Test + public void testForExcludedForQualityHistogramArray(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + 
collectWgsMetrics.INCLUDE_BQ_HISTOGRAM = true; + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 30); + EdgingRecordAndOffset record = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 30, 0); + firstInfo.add(record); + firstInfo.add(EdgingRecordAndOffset.createEndRecord(record)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + long[] expectedResult = new long[127]; + expectedResult[30] = 10; expectedResult[50] = 9; expectedResult[60] = 3; + expectedResult[70] = 3; expectedResult[80] = 2; expectedResult[90] = 3; + assertEquals(expectedResult, collector.unfilteredBaseQHistogramArray); + } + + @Test + public void testForBaseQualityHetSens(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + collectWgsMetrics.INCLUDE_BQ_HISTOGRAM = true; + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 10, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo thirdInfo = new AbstractLocusInfo<>(sequence, 1); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 30, 0); + firstInfo.add(record1); + thirdInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(record, 0, 10, 0); + firstInfo.add(record2); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + collector.addInfo(thirdInfo, ref, false); + + long[] expectedResult = new long[127]; + expectedResult[30] = 10; expectedResult[50] = 9; expectedResult[60] = 3; + expectedResult[70] = 3; 
expectedResult[80] = 2; expectedResult[90] = 3; + expectedResult[1] = 1; expectedResult[2] = 2; expectedResult[3] = 3; + expectedResult[4] = 4; + } + + @Test + public void testForHistogramArray(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 10, createIntervalList()); + long[] templateHistogramArray = {0,1,3,0,0,0,0,0,0,0,0}; + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 10); + AbstractLocusInfo thirdInfo = new AbstractLocusInfo<>(sequence, 20); + AbstractLocusInfo fourthInfo = new AbstractLocusInfo<>(sequence, 30); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(record, 0, 10, 0); + firstInfo.add(record1); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 30, 0); + firstInfo.add(record2); + fourthInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + EdgingRecordAndOffset record3 = EdgingRecordAndOffset.createBeginRecord(thirdRecord, 0, 20, 0); + firstInfo.add(record3); + thirdInfo.add(EdgingRecordAndOffset.createEndRecord(record3)); + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + collector.addInfo(thirdInfo, ref, false); + collector.addInfo(fourthInfo, ref, false); + assertEquals(templateHistogramArray, collector.unfilteredDepthHistogramArray); + } + + @Test + public void testForCollectorWithoutData(){ + long[] templateQualHistogram = new long[127]; + long[] templateHistogramArray = new long[11]; + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 10, createIntervalList()); + assertEquals(templateHistogramArray, collector.unfilteredDepthHistogramArray); + assertEquals(templateQualHistogram, 
collector.unfilteredBaseQHistogramArray); + assertEquals(0, collector.basesExcludedByCapping); + assertEquals(0, collector.basesExcludedByOverlap); + assertEquals(0, collector.basesExcludedByBaseq); + } + + @Test + public void testAddInfoWithoutOverlap(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusInfo firstInfo = new AbstractLocusInfo<>(sequence, 1); + AbstractLocusInfo secondInfo = new AbstractLocusInfo<>(sequence, 5); + AbstractLocusInfo thirdInfo = new AbstractLocusInfo<>(sequence, 6); + AbstractLocusInfo fourthInfo = new AbstractLocusInfo<>(sequence, 10); + EdgingRecordAndOffset record1 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 0, 5, 0); + firstInfo.add(record1); + secondInfo.add(EdgingRecordAndOffset.createEndRecord(record1)); + EdgingRecordAndOffset record2 = EdgingRecordAndOffset.createBeginRecord(secondRecord, 6, 5, 6); + thirdInfo.add(record2); + fourthInfo.add(EdgingRecordAndOffset.createEndRecord(record2)); + + collector.addInfo(firstInfo, ref, false); + collector.addInfo(secondInfo, ref, false); + assertEquals(0, collector.basesExcludedByOverlap, "Excluded by overlap:"); + } + + @Test + public void testForComplicatedCigar(){ + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + AbstractLocusIterator sli = createReadEndsIterator(exampleSamTwoReads); + while(sli.hasNext()) { + AbstractLocusInfo info = sli.next(); + collector.addInfo(info, ref, false); + } + assertEquals(11, collector.basesExcludedByOverlap, "Excluded by overlap:"); + } +} diff --git a/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java b/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java new file mode 100644 index 000000000..bd5173e22 --- /dev/null +++ 
b/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java @@ -0,0 +1,83 @@ +/* + * The MIT License + * + * Copyright (c) 2016 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +package picard.analysis; + +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileWalker; +import htsjdk.samtools.util.AbstractLocusIterator; +import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.ProgressLogger; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +import static org.testng.Assert.assertEquals; +import static picard.analysis.CollectWgsMetricsTestUtils.*; + +public class WgsMetricsProcessorImplTest { + private ProgressLogger progress; + + @BeforeTest + public void setUp(){ + progress = new ProgressLogger(Log.getInstance(WgsMetricsProcessorImpl.class)); + } + + @Test + public void testForProcessFile(){ + AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead); + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + WgsMetricsProcessorImpl wgsMetricsProcessor = + new WgsMetricsProcessorImpl(iterator, getReferenceSequenceFileWalker(), collector, progress); + wgsMetricsProcessor.processFile(); + assertEquals(20, collector.counter); + } + + @Test + public void testForFilteredBases(){ + AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead); + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + FastWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, createIntervalList()); + String secondReferenceString = ">ref\nNNNNNNNNNNAATATTCTTC"; + ReferenceSequenceFile referenceSequenceFile = createReferenceSequenceFile(secondReferenceString); + ReferenceSequenceFileWalker refWalker = new ReferenceSequenceFileWalker(referenceSequenceFile); + WgsMetricsProcessorImpl wgsMetricsProcessor = + new WgsMetricsProcessorImpl(iterator, refWalker, collector, progress); + wgsMetricsProcessor.processFile(); + assertEquals(10, collector.counter); + } + + @Test + public void 
testForExitAfter(){ + AbstractLocusIterator iterator = createReadEndsIterator(exampleSamOneRead); + CollectWgsMetrics collectWgsMetrics = new CollectWgsMetrics(); + collectWgsMetrics.STOP_AFTER = 16; + AbstractWgsMetricsCollector collector = new FastWgsMetricsCollector(collectWgsMetrics, 100, null); + WgsMetricsProcessorImpl wgsMetricsProcessor = + new WgsMetricsProcessorImpl(iterator, getReferenceSequenceFileWalker(), collector, progress); + wgsMetricsProcessor.processFile(); + assertEquals(15, collector.counter); + } + +} From f16213d5d3e6583e718c9395db1d27f7900ebfda Mon Sep 17 00:00:00 2001 From: Mariia Zueva Date: Thu, 26 Jan 2017 11:44:58 +0300 Subject: [PATCH 2/3] Code style fixes according to the review --- src/main/java/picard/analysis/AbstractWgsMetricsCollector.java | 7 ++----- src/main/java/picard/analysis/CollectWgsMetrics.java | 2 +- .../java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java | 1 - 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java index 422db6346..595f503cd 100644 --- a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java +++ b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java @@ -118,7 +118,7 @@ /** * Accumulates the data from AbstractLocusInfo in inner structures - * @param info AbstractLocusInfo with aligned to reference position reads + * @param info AbstractLocusInfo with aligned to reference position reads * @param ref ReferenceSequence * @param referenceBaseN true if current the value of reference base represents a no call */ @@ -126,7 +126,6 @@ /** * Adds collected metrics and depth histogram to file - * * @param file MetricsFile for result of collector's work * @param dupeFilter counting filter for duplicate reads * @param mapqFilter counting filter for mapping quality @@ -154,7 +153,7 @@ protected void addBaseQHistogram(final MetricsFile 
getUnfilteredDepthHistogram(){ + protected Histogram getUnfilteredDepthHistogram() { return getHistogram(unfilteredDepthHistogramArray, "coverage", "unfiltered_coverage_count"); } @@ -244,7 +243,6 @@ public void setCollectionFactory(ReadNamesCollectionFactory collectionFactory) { } private static class ReadNamesCollectionFactoryImpl implements ReadNamesCollectionFactory { - @Override public ReadNamesCollection createCollection() { return new ReadNamesCollectionImpl(); @@ -252,7 +250,6 @@ public ReadNamesCollection createCollection() { } private static class ReadNamesCollectionImpl implements ReadNamesCollection { - private Map> readNames; private final Set dummySet = new HashSet<>(); diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index 6418c7a88..f464a15cc 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -120,7 +120,7 @@ @Option(doc="Sample Size used for Theoretical Het Sensitivity sampling. Default is 10000.", optional = true) public int SAMPLE_SIZE=10000; - @Option(doc = "If true, fast algorithm is used") + @Option(doc = "If true, fast algorithm is used.") public boolean USE_FAST_ALGORITHM = false; @Option(doc = "Average read length in the file. 
Default is 150.", optional = true) diff --git a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java index 857b65b51..afb3825fa 100644 --- a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java +++ b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java @@ -240,5 +240,4 @@ public boolean areHistogramsEmpty() { return (highQualityDepthHistogram.isEmpty() || highQualityDepthHistogramNonZero.isEmpty()); } } - } From 7f614f036faaba1aa78f725438d68594b3932569 Mon Sep 17 00:00:00 2001 From: Anna Osipova Date: Wed, 22 Mar 2017 12:29:12 +0300 Subject: [PATCH 3/3] changes according review --- .../analysis/AbstractWgsMetricsCollector.java | 75 +--------------------- .../java/picard/analysis/CollectWgsMetrics.java | 16 ++--- .../CollectWgsMetricsWithNonZeroCoverage.java | 6 +- src/main/java/picard/analysis/CounterManager.java | 37 ++++++----- .../picard/analysis/FastWgsMetricsCollector.java | 31 +++++---- .../picard/analysis/WgsMetricsProcessorImpl.java | 15 +++-- .../picard/analysis/CollectWgsMetricsTest.java | 52 ++++++++++++++- .../java/picard/analysis/CounterManagerTest.java | 14 ++-- .../analysis/FastWgsMetricsCollectorTest.java | 5 +- .../analysis/WgsMetricsProcessorImplTest.java | 7 +- 10 files changed, 126 insertions(+), 132 deletions(-) diff --git a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java index 595f503cd..f84f300a5 100644 --- a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java +++ b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java @@ -30,16 +30,9 @@ import htsjdk.samtools.util.AbstractRecordAndOffset; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.IntervalList; -import htsjdk.samtools.util.QualityUtil; import htsjdk.samtools.util.SequenceUtil; import picard.filter.CountingFilter; import 
picard.filter.CountingPairedFilter; -import picard.util.MathUtil; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; /** * Class for collecting data on reference coverage, base qualities and excluded bases from one AbstractLocusInfo object for @@ -96,7 +89,6 @@ */ protected long counter = 0; - ReadNamesCollectionFactory collectionFactory = new ReadNamesCollectionFactoryImpl(); /** * Creates a collector and initializes the inner data structures * @@ -118,8 +110,8 @@ /** * Accumulates the data from AbstractLocusInfo in inner structures - * @param info AbstractLocusInfo with aligned to reference position reads - * @param ref ReferenceSequence + * @param info {@link htsjdk.samtools.util.AbstractLocusInfo} with aligned to reference position reads + * @param ref {@link htsjdk.samtools.reference.ReferenceSequence} * @param referenceBaseN true if current the value of reference base represents a no call */ public abstract void addInfo(final AbstractLocusInfo info, final ReferenceSequence ref, boolean referenceBaseN); @@ -224,67 +216,4 @@ boolean isReferenceBaseN(final int position, final ReferenceSequence ref) { final byte base = ref.getBases()[position - 1]; return SequenceUtil.isNoCall(base); } - - public void setCollectionFactory(ReadNamesCollectionFactory collectionFactory) { - this.collectionFactory = collectionFactory; - } - - interface ReadNamesCollection { - boolean add(String readName); - Set put(String readName, Set recordsAndOffsetsForName); - Set get(String readName); - Set remove(String readName); - int size(); - void clear(); - } - - interface ReadNamesCollectionFactory { - ReadNamesCollection createCollection(); - } - - private static class ReadNamesCollectionFactoryImpl implements ReadNamesCollectionFactory { - @Override - public ReadNamesCollection createCollection() { - return new ReadNamesCollectionImpl(); - } - } - - private static class ReadNamesCollectionImpl implements ReadNamesCollection { - private 
Map> readNames; - private final Set dummySet = new HashSet<>(); - - ReadNamesCollectionImpl() { - this.readNames = new HashMap<>(); - } - - @Override - public boolean add(String readName) { - return readNames.put(readName, dummySet) == null; - } - - @Override - public Set put(String readName, Set recordsAndOffsetsForName) { - return readNames.put(readName, recordsAndOffsetsForName); - } - - @Override - public Set get(String readName) { - return readNames.get(readName); - } - - @Override - public Set remove(String readName) { - return readNames.remove(readName); - } - - @Override - public int size() { - return readNames.size(); - } - - @Override - public void clear() { - readNames.clear(); - } - } } diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index f464a15cc..645adb3d7 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -47,10 +47,8 @@ import java.io.File; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashSet; import java.util.List; -import java.util.stream.LongStream; import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; @@ -582,11 +580,11 @@ protected long getBasesExcludedBy(final CountingFilter filter) { } /** - * Creates AbstractLocusIterator implementation according to USE_FAST_ALGORITHM value. + * Creates {@link htsjdk.samtools.util.AbstractLocusIterator} implementation according to {@link this#USE_FAST_ALGORITHM} value. * - * @param in inner SAMReader - * @return if USE_FAST_ALGORITHM is enabled, returns EdgeReadIterator implementation, - * otherwise default algorithm is used and SamLocusIterator is returned. 
+ * @param in inner {@link htsjdk.samtools.SamReader} + * @return if {@link this#USE_FAST_ALGORITHM} is enabled, returns {@link htsjdk.samtools.util.EdgeReadIterator} implementation, + * otherwise default algorithm is used and {@link htsjdk.samtools.util.SamLocusIterator} is returned. */ protected AbstractLocusIterator getLocusIterator(final SamReader in) { if (USE_FAST_ALGORITHM) { @@ -600,12 +598,12 @@ protected AbstractLocusIterator getLocusIterator(final SamReader in) { } /** - * Creates AbstractWgsMetricsCollector implementation according to USE_FAST_ALGORITHM value. + * Creates {@link picard.analysis.AbstractWgsMetricsCollector} implementation according to {@link this#USE_FAST_ALGORITHM} value. * * @param coverageCap the maximum depth/coverage to consider. * @param intervals the intervals over which metrics are collected. - * @return if USE_FAST_ALGORITHM is enabled, returns FastWgsMetricsCollector implementation, - * otherwise default algorithm is used and WgsMetricsCollector is returned. + * @return if {@link this#USE_FAST_ALGORITHM} is enabled, returns {@link picard.analysis.FastWgsMetricsCollector} implementation, + * otherwise default algorithm is used and {@link picard.analysis.CollectWgsMetrics.WgsMetricsCollector} is returned. */ protected AbstractWgsMetricsCollector getCollector(final int coverageCap, final IntervalList intervals) { return USE_FAST_ALGORITHM ? 
new FastWgsMetricsCollector(this, coverageCap, intervals) : diff --git a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java index afb3825fa..17e013afd 100644 --- a/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java +++ b/src/main/java/picard/analysis/CollectWgsMetricsWithNonZeroCoverage.java @@ -27,7 +27,11 @@ import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.metrics.MetricsFile; -import htsjdk.samtools.util.*; +import htsjdk.samtools.util.Histogram; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.IntervalList; +import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; diff --git a/src/main/java/picard/analysis/CounterManager.java b/src/main/java/picard/analysis/CounterManager.java index 44835613e..112536b13 100644 --- a/src/main/java/picard/analysis/CounterManager.java +++ b/src/main/java/picard/analysis/CounterManager.java @@ -70,7 +70,7 @@ public CounterManager(final int arrayLength, int readLength) { /** * Method added for use in unit tests * - * @return offset from the reference sequence, that is represented by index 0 of Counter arrays. + * @return offset from the reference sequence, that is represented by index 0 of {@link picard.analysis.CounterManager.Counter} arrays. */ int getOffset() { return offset; @@ -79,7 +79,7 @@ int getOffset() { /** * Method added for use in unit tests * - * @param offset from the reference sequence, that will be represented by index 0 of Counter arrays. + * @param offset from the reference sequence, that will be represented by index 0 of {@link picard.analysis.CounterManager.Counter} arrays. 
*/ void setOffset(int offset) { this.offset = offset; @@ -87,13 +87,13 @@ void setOffset(int offset) { /** * Method checks that new locus position is not out of bounds of Counter arrays and there is enough - * space in them to hold information on at least one more read of length readLength. + * space in them to hold information on at least one more read of length {@link this#readLength}. * If there is no free space, but there is accumulated information in Counter arrays after new locus position * that we may need, the arrays are rebased, so that 0 index of arrays represents new locus position. * In other case, we just clear the arrays. * * @param locusPosition position in the reference sequence, - * that will be represented by index 0 of Counter arrays. + * that will be represented by index 0 of {@link picard.analysis.CounterManager.Counter} arrays. */ public void checkOutOfBounds(int locusPosition) { if (locusPosition - offset + readLength >= arrayLength) { @@ -110,13 +110,17 @@ public void checkOutOfBounds(int locusPosition) { * Rebases inner Counter arrays so that 0 index of array represents the new locus position * * @param locusPosition position in the reference sequence, - * that will be represented by index 0 of Counter arrays. + * that will be represented by index 0 of {@link picard.analysis.CounterManager.Counter} arrays. 
*/ private void rebase(int locusPosition) { + if (locusPosition < offset) { + throw new IllegalArgumentException("Position in the reference sequence is lesser than offset."); + } for (Counter counter : arrays) { - int[] array = counter.array; - System.arraycopy(array, locusPosition - offset, array, 0, arrayLength - locusPosition + offset); - Arrays.fill(array, arrayLength - locusPosition + offset, arrayLength, 0); + final int[] array = counter.array; + final int skipLength = locusPosition - offset; + System.arraycopy(array, skipLength, array, 0, arrayLength - skipLength); + Arrays.fill(array, arrayLength - skipLength, arrayLength, 0); } offset = locusPosition; } @@ -126,7 +130,7 @@ private void rebase(int locusPosition) { */ public void clear() { for (Counter counter : arrays) { - int[] array = counter.array; + final int[] array = counter.array; Arrays.fill(array, 0); } offset = 0; @@ -135,24 +139,24 @@ public void clear() { /** * Creates a new Counter object and adds it to the list of managed Counters. * - * @return Counter, that will be managed by current CounterManager + * @return {@link picard.analysis.CounterManager.Counter}, that will be managed by current {@link picard.analysis.CounterManager} */ public Counter newCounter() { - Counter counter = new Counter(arrayLength); + final Counter counter = new Counter(arrayLength); arrays.add(counter); return counter; } /** - * Class represents an integer array with methods to increment and get the values form it with respect - * to offset of outer CounterManager. + * Class represents an integer array with methods to increment and get the values from it with respect + * to offset of outer {@link picard.analysis.CounterManager}. 
*/ public class Counter { /** * Inner array to store accumulated values */ - private int[] array; + private final int[] array; /** * @param arrayLength length of the inner array @@ -166,7 +170,7 @@ private Counter(int arrayLength) { * * @param index in the reference sequence */ - public void inc(int index) { + public void increment(int index) { checkIndex(index); array[index - offset]++; } @@ -182,7 +186,8 @@ public int get(int index) { private void checkIndex(int index) { if ((index - offset) < 0 || (index - offset) >= array.length) { - throw new ArrayIndexOutOfBoundsException("The requested index is out of counter bounds."); + throw new ArrayIndexOutOfBoundsException("The requested index " + index + " is out of counter bounds. " + + "Possible cause of exception can be wrong READ_LENGTH parameter (much smaller than actual read length)"); } } } diff --git a/src/main/java/picard/analysis/FastWgsMetricsCollector.java b/src/main/java/picard/analysis/FastWgsMetricsCollector.java index 9142f6d7d..989474d66 100644 --- a/src/main/java/picard/analysis/FastWgsMetricsCollector.java +++ b/src/main/java/picard/analysis/FastWgsMetricsCollector.java @@ -26,22 +26,25 @@ import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.util.AbstractLocusInfo; +import htsjdk.samtools.util.AbstractRecordAndOffset; import htsjdk.samtools.util.EdgingRecordAndOffset; import htsjdk.samtools.util.IntervalList; import htsjdk.samtools.util.SequenceUtil; import java.util.HashSet; +import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.HashMap; /** * Class represents fast algorithm for collecting data from AbstractLocusInfo - * with a list of aligned EdgingRecordAndOffset objects. According to the algorithm - * we receive only two EdgingRecordAndOffset objects for each alignment block of a read: - * one for the start of block and one for the end. 
When meeting a EdgingRecordAndOffset - * with type BEGIN, all information from the alignment block is accumulated in the collector, + * with a list of aligned {@link htsjdk.samtools.util.EdgingRecordAndOffset} objects. According to the algorithm + * we receive only two {@link htsjdk.samtools.util.EdgingRecordAndOffset} objects for each alignment block of a read: + * one for the start of block and one for the end. When meeting a {@link htsjdk.samtools.util.EdgingRecordAndOffset} + * with type {@link htsjdk.samtools.util.EdgingRecordAndOffset.Type#BEGIN}, all information from the alignment block is accumulated in the collector, * read name is added to a map holding the names of processed reads for detecting overlapping positions. - * When meeting a EdgingRecordAndOffset with type END, + * When meeting a {@link htsjdk.samtools.util.EdgingRecordAndOffset} with type {@link htsjdk.samtools.util.EdgingRecordAndOffset.Type#END}, * the read name is removed from the map with names of processed reads. * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. */ @@ -54,7 +57,7 @@ private int previousSequenceIndex; /** - * Manager for pileupSize Counter. + * Manager for {@link this#pileupSize} Counter. */ private final CounterManager counterManager; /** @@ -70,11 +73,11 @@ /** * Map, holding information on currently processed reads, that possibly have overlapping regions. */ - private ReadNamesCollection readsNames; + private Map> readsNames; /** - * Determines the size of created Counter objects. The bigger Counter objects - * are created, the less rebasing in Counter will occur. + * Determines the size of created {@link picard.analysis.CounterManager.Counter} objects. The bigger {@link picard.analysis.CounterManager.Counter} objects + * are created, the less rebasing in {@link picard.analysis.CounterManager.Counter} will occur. 
*/ private final int ARRAY_SIZE_PER_READ_LENGTH = 2000; @@ -133,7 +136,7 @@ private void processRecord(int position, ReferenceSequence ref, EdgingRecordAndO } else { if (unfilteredDepthSize.get(index) < coverageCap) { unfilteredBaseQHistogramArray[quality]++; - unfilteredDepthSize.inc(index); + unfilteredDepthSize.increment(index); } if (quality < collectWgsMetrics.MINIMUM_BASE_QUALITY || SequenceUtil.isNoCall(bases[i + record.getOffset()])){ basesExcludedByBaseq++; @@ -142,7 +145,7 @@ private void processRecord(int position, ReferenceSequence ref, EdgingRecordAndO if (recordsAndOffsetsForName.size() - bsq > 0) { basesExcludedByOverlap++; } else { - pileupSize.inc(index); + pileupSize.increment(index); } } } @@ -162,14 +165,14 @@ private void removeRecordFromMap(EdgingRecordAndOffset record, SetAbstractLocusInfo. + * Prepares the accumulator objects to process a new {@link htsjdk.samtools.util.AbstractLocusInfo}. * If we switch to a new sequence, all accumulators are cleared. * - * @param info the next AbstractLocusInfo to process + * @param info the next {@link htsjdk.samtools.util.AbstractLocusInfo} to process */ private void prepareCollector(AbstractLocusInfo info) { if (readsNames == null) { - readsNames = collectionFactory.createCollection(); + readsNames = new HashMap<>(); } if (previousSequenceIndex != info.getSequenceIndex()) { readsNames.clear(); diff --git a/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java b/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java index 7c9b13733..d987f1685 100644 --- a/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java +++ b/src/main/java/picard/analysis/WgsMetricsProcessorImpl.java @@ -27,8 +27,11 @@ import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.reference.ReferenceSequenceFileWalker; -import htsjdk.samtools.util.*; -import picard.PicardException; +import htsjdk.samtools.util.AbstractLocusInfo; +import 
htsjdk.samtools.util.AbstractLocusIterator; +import htsjdk.samtools.util.AbstractRecordAndOffset; +import htsjdk.samtools.util.ProgressLogger; +import htsjdk.samtools.util.Log; import picard.filter.CountingFilter; import picard.filter.CountingPairedFilter; @@ -36,7 +39,7 @@ import java.util.stream.LongStream; /** - * Implementation of WgsMetricsProcessor that gets input data from a given iterator + * Implementation of {@link picard.analysis.WgsMetricsProcessor} that gets input data from a given iterator * and processes it with a help of collector * @author Mariia_Zueva@epam.com, EPAM Systems, Inc. */ @@ -61,9 +64,9 @@ private final Log log = Log.getInstance(WgsMetricsProcessorImpl.class); /** - * @param iterator input AbstractLocusIterator + * @param iterator input {@link htsjdk.samtools.util.AbstractLocusIterator} * @param refWalker over processed reference file - * @param collector input AbstractWgsMetricsCollector + * @param collector input {@link picard.analysis.AbstractWgsMetricsCollector} * @param progress logger */ public WgsMetricsProcessorImpl(AbstractLocusIterator> iterator, @@ -99,7 +102,7 @@ public void processFile() { collector.setCounter(counter); } // check that we added the same number of bases to the raw coverage histogram and the base quality histograms - final long sumBaseQ= Arrays.stream(collector.unfilteredBaseQHistogramArray).sum(); + final long sumBaseQ = Arrays.stream(collector.unfilteredBaseQHistogramArray).sum(); final long sumDepthHisto = LongStream.rangeClosed(0, collector.coverageCap).map(i -> (i * collector.unfilteredDepthHistogramArray[(int) i])).sum(); if (sumBaseQ != sumDepthHisto) { log.error("Coverage and baseQ distributions contain different amount of bases!"); diff --git a/src/test/java/picard/analysis/CollectWgsMetricsTest.java b/src/test/java/picard/analysis/CollectWgsMetricsTest.java index 071a1d33e..1a24eeae6 100644 --- a/src/test/java/picard/analysis/CollectWgsMetricsTest.java +++ 
b/src/test/java/picard/analysis/CollectWgsMetricsTest.java @@ -220,7 +220,8 @@ public void testLargeIntervals(final String useFastAlgorithm) throws IOException "OUTPUT=" + outfile.getAbsolutePath(), "REFERENCE_SEQUENCE=" + ref.getAbsolutePath(), "INTERVALS=" + intervals.getAbsolutePath(), - "SAMPLE_SIZE=" + sampleSize + "SAMPLE_SIZE=" + sampleSize, + "USE_FAST_ALGORITHM=" + useFastAlgorithm }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -283,7 +284,8 @@ public void testExclusions(final String useFastAlgorithm) throws IOException { "OUTPUT=" + outfile.getAbsolutePath(), "REFERENCE_SEQUENCE=" + reference.getAbsolutePath(), "INCLUDE_BQ_HISTOGRAM=true", - "COVERAGE_CAP=3" + "COVERAGE_CAP=3", + "USE_FAST_ALGORITHM=" + useFastAlgorithm }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -346,7 +348,8 @@ public void testPoorQualityBases(final String useFastAlgorithm) throws IOExcepti "OUTPUT=" + outfile.getAbsolutePath(), "REFERENCE_SEQUENCE=" + reference.getAbsolutePath(), "INCLUDE_BQ_HISTOGRAM=true", - "COVERAGE_CAP=3" + "COVERAGE_CAP=3", + "USE_FAST_ALGORITHM=" + useFastAlgorithm }; Assert.assertEquals(runPicardCommandLine(args), 0); @@ -359,4 +362,47 @@ public void testPoorQualityBases(final String useFastAlgorithm) throws IOExcepti Assert.assertEquals(metrics.PCT_EXC_BASEQ, 0.5); Assert.assertEquals(metrics.PCT_EXC_CAPPED, 0.0); } + + @Test(dataProvider = "wgsAlgorithm") + public void testGiantDeletion(final String useFastAlgorithm) throws IOException { + final File reference = new File("testdata/picard/quality/chrM.reference.fasta"); + final File testSamFile = File.createTempFile("CollectWgsMetrics", ".bam", TEST_DIR); + testSamFile.deleteOnExit(); + + final SAMRecordSetBuilder setBuilder = CollectWgsMetricsTestUtils.createTestSAMBuilder(reference, READ_GROUP_ID, SAMPLE, PLATFORM, LIBRARY); + setBuilder.setReadLength(10); + for (int i = 0; i < 3; i++){ + setBuilder.addPair("Read:" + i, 0, 1, 30, false, false, "5M10000D5M", "1000D10M", false, true, 
40); + } + + final SAMFileWriter writer = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(setBuilder.getHeader(), false, testSamFile); + + for (final SAMRecord record : setBuilder) { + writer.addAlignment(record); + } + + writer.close(); + + final File outfile = File.createTempFile("testGiantDeletion", ".txt"); + outfile.deleteOnExit(); + + final String[] args = new String[] { + "INPUT=" + testSamFile.getAbsolutePath(), + "OUTPUT=" + outfile.getAbsolutePath(), + "REFERENCE_SEQUENCE=" + reference.getAbsolutePath(), + "INCLUDE_BQ_HISTOGRAM=true", + "COVERAGE_CAP=3", + "USE_FAST_ALGORITHM=" + useFastAlgorithm + }; + + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile output = new MetricsFile<>(); + output.read(new FileReader(outfile)); + + final CollectWgsMetrics.WgsMetrics metrics = output.getMetrics().get(0); + + Assert.assertEquals(metrics.PCT_EXC_BASEQ, 0.0); + Assert.assertEquals(metrics.PCT_EXC_CAPPED, 0.0); + } } diff --git a/src/test/java/picard/analysis/CounterManagerTest.java b/src/test/java/picard/analysis/CounterManagerTest.java index c96ef0eeb..2077860ff 100644 --- a/src/test/java/picard/analysis/CounterManagerTest.java +++ b/src/test/java/picard/analysis/CounterManagerTest.java @@ -24,16 +24,16 @@ public void setUp() { testCounter = testCounterManager.newCounter(); for (int i = 0; i < arrayLength; i++) { - testCounter.inc(i + OFFSET); - secondCounter.inc(i + OFFSET); + testCounter.increment(i + OFFSET); + secondCounter.increment(i + OFFSET); } - testCounter.inc(OFFSET); - secondCounter.inc(OFFSET); + testCounter.increment(OFFSET); + secondCounter.increment(OFFSET); } @Test public void testCounterInc() { - Assert.assertEquals(2, testCounter.get(OFFSET), "Test method inc:"); + Assert.assertEquals(2, testCounter.get(OFFSET), "Test method increment:"); } @Test @@ -72,7 +72,7 @@ public void testForCheckIncrement(){ testCounterManager.setOffset(0); CounterManager.Counter counter = testCounterManager.newCounter(); for (int 
i=0; i<10; i++){ - counter.inc(1); + counter.increment(1); } int[] testArray = new int[arrayLength]; for (int i = 0; i< arrayLength; i++){ @@ -85,7 +85,7 @@ public void testForCheckIncrement(){ @Test(expectedExceptions = IndexOutOfBoundsException.class) public void testForWrongIndexInInc(){ - testCounter.inc(40); + testCounter.increment(40); } @Test(expectedExceptions = IndexOutOfBoundsException.class) diff --git a/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java b/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java index 884b09a23..84675170b 100644 --- a/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java +++ b/src/test/java/picard/analysis/FastWgsMetricsCollectorTest.java @@ -11,7 +11,10 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import static org.testng.Assert.assertEquals; -import static picard.analysis.CollectWgsMetricsTestUtils.*; +import static picard.analysis.CollectWgsMetricsTestUtils.createIntervalList; +import static picard.analysis.CollectWgsMetricsTestUtils.createReadEndsIterator; +import static picard.analysis.CollectWgsMetricsTestUtils.exampleSamTwoReads; + public class FastWgsMetricsCollectorTest { private byte[] highQualities = {30, 30, 30 ,30, 30, 30, 30, 30, 30, 30, 50, 50, 50 ,50, 50, 50, 50, 50, 50, 60, 60, 60, 70 ,70, 70, 80, 80, 90, 90, 90}; diff --git a/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java b/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java index bd5173e22..9bd9e20b8 100644 --- a/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java +++ b/src/test/java/picard/analysis/WgsMetricsProcessorImplTest.java @@ -33,7 +33,11 @@ import org.testng.annotations.Test; import static org.testng.Assert.assertEquals; -import static picard.analysis.CollectWgsMetricsTestUtils.*; +import static picard.analysis.CollectWgsMetricsTestUtils.createIntervalList; +import static picard.analysis.CollectWgsMetricsTestUtils.createReadEndsIterator; 
+import static picard.analysis.CollectWgsMetricsTestUtils.exampleSamOneRead; +import static picard.analysis.CollectWgsMetricsTestUtils.getReferenceSequenceFileWalker; +import static picard.analysis.CollectWgsMetricsTestUtils.createReferenceSequenceFile; public class WgsMetricsProcessorImplTest { private ProgressLogger progress; @@ -79,5 +83,4 @@ public void testForExitAfter(){ wgsMetricsProcessor.processFile(); assertEquals(15, collector.counter); } - }