From c978040dc92935d9a2fea2884bffbf9f600e4153 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Fri, 30 Jun 2017 14:02:50 -0400 Subject: [PATCH 1/7] Add option to CollectWgsMetrics to output theoretical sensitivity at various allele fractions. --- .../analysis/AbstractWgsMetricsCollector.java | 3 +- .../java/picard/analysis/CollectRawWgsMetrics.java | 2 + .../java/picard/analysis/CollectWgsMetrics.java | 30 +++- .../picard/analysis/TheoreticalSensitivity.java | 157 +++++++++++++++++++-- .../analysis/TheoreticalSensitivityMetrics.java | 33 +++++ .../picard/analysis/directed/CollectHsMetrics.java | 3 + .../analysis/TheoreticalSensitivityTest.java | 110 ++++++++++++++- 7 files changed, 320 insertions(+), 18 deletions(-) create mode 100644 src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java diff --git a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java index f84f300a5..51364cc38 100644 --- a/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java +++ b/src/main/java/picard/analysis/AbstractWgsMetricsCollector.java @@ -184,8 +184,7 @@ protected void addBaseQHistogram(final MetricsFile tsOut = getMetricsFile(); + tsOut.addMetric(theoreticalSensitivityMetrics); + tsOut.write(THEORETICAL_SENSITIVITY_OUTPUT); + } + return 0; } diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 9b93ab4ff..83c03507d 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -41,14 +41,15 @@ private static final Log log = Log.getInstance(TheoreticalSensitivity.class); private static final int SAMPLING_MAX = 600; //prevent 'infinite' loops - private static final int MAX_CONSIDERED_DEPTH = 1000; //no point in looking any deeper than this, otherwise GC overhead is too high. + private static final int MAX_CONSIDERED_DEPTH = 10000; //no point in looking any deeper than this, otherwise GC overhead is too high. + private static int randomSeed = 51; /** - * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 + * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 * @param qualityDistribution the probability of quality q is qualityDistribution[q] for q = 0, 1. . . Q - * @param sampleSize sample size is the number of random sums of quality scores for each m - * @param logOddsThreshold is the log_10 of the likelihood ratio required to call a SNP, - * for example 5 if the variant likelihood must be 10^5 times greater + * @param sampleSize sample size is the number of random sums of quality scores for each m + * @param logOddsThreshold is the log_10 of the likelihood ratio required to call a SNP, + * for example 5 if the variant likelihood must be 10^5 times greater */ public static double hetSNPSensitivity(final double[] depthDistribution, final double[] qualityDistribution, final int sampleSize, final double logOddsThreshold) { @@ -56,12 +57,12 @@ public static double hetSNPSensitivity(final double[] depthDistribution, final d } /** - * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 + * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 * @param qualityDistribution the probability of quality q is qualityDistribution[q] for q = 0, 1. . . Q - * @param sampleSize sample size is the number of random sums of quality scores for each m - * @param logOddsThreshold is the log_10 of the likelihood ratio required to call a SNP, - * for example 5 if the variant likelihood must be 10^5 times greater. - * @param withLogging true to output log messages, false otherwise. + * @param sampleSize sample size is the number of random sums of quality scores for each m + * @param logOddsThreshold is the log_10 of the likelihood ratio required to call a SNP, + * for example 5 if the variant likelihood must be 10^5 times greater. + * @param withLogging true to output log messages, false otherwise. */ public static double hetSNPSensitivity(final double[] depthDistribution, final double[] qualityDistribution, final int sampleSize, final double logOddsThreshold, final boolean withLogging) { @@ -143,7 +144,7 @@ We use an O(1) stochastic acceptance algorithm -- see Physica A, Volume 391, Pag private Random rng; RouletteWheel(final double[] weights) { - rng = new Random(51); + rng = new Random(randomSeed); N = weights.length; probabilities = new ArrayList<>(); @@ -204,4 +205,136 @@ public int draw() { } return normalizedHistogram; } -} + + /** + * Determines if a variant would be called under the particular conditions of a given total depth, alt depth, + * average base qualities, allele fraction of variant and log odds threshold necessary to exceed to call variant. + * @param totalDepth Depth at the site to be called, both alt and ref. + * @param altDepth Number of alt bases at this site. + * @param averageQuality Average Phred-scaled quality of bases + * @param alleleFraction Allele fraction we are attempting to detect + * @param logOddsThreshold Log odds threshold necessary to exceed for variant to be called + * @return + */ + public static boolean isCalled(int totalDepth, int altDepth, double averageQuality, double alleleFraction, double logOddsThreshold) { + double threshold; + double sumOfQualities = altDepth * averageQuality; + threshold = 10.0 * (altDepth * Math.log10(1.0 / alleleFraction) + (totalDepth - altDepth) * Math.log10(1.0 / (1.0 - alleleFraction)) + logOddsThreshold); + + return sumOfQualities > threshold; + } + + /** + * Draw from a binomial distribution. + * @param trials Number of trials to perform + * @param p Probability of individual success + * @param uniformRNG Random number generator to use for making draw + * @return Number of total successes + */ + public static int binomialDraw(final int trials, final double p, final Random uniformRNG) { + if (p > 1.0 || p < 0) { + throw new PicardException("Probabilities should be between 0 and 1, found value " + p + "."); + } + + int successes = 0; + for (int i = 0; i < trials; i++) { + if (uniformRNG.nextDouble() < p) { + successes++; + } + } + return successes; + } + + /** + * Calculates the theoretical sensitivity with a given Phred-scaled quality score distribution at a constant + * depth. + * @param depth Depth to compute sensitivity at + * @param qualityDistribution Phred-scaled quality score distribution + * @param logOddsThreshold Log odd threshold necessary to exceed for variant to be called + * @param sampleSize sampleSize is the total number of simulations to run + * @param alleleFraction the allele fraction to evaluate sensitivity at + * @param randomSeed random number seed to use for random number generator + * @return + */ + public static double sensitivityAtConstantDepth(final int depth, final double[] qualityDistribution, final double logOddsThreshold, final int sampleSize, final double alleleFraction, final int randomSeed) { + final RouletteWheel qualityRW = new RouletteWheel(trimDistribution(qualityDistribution)); + final Random uniformRNG = new Random(randomSeed); + + int altDepth = 0; + int calledVariants = 0; + for (int k = 0; k < sampleSize; k++) { + altDepth = binomialDraw(depth, alleleFraction, uniformRNG); + + int sumOfQualities = 0; + for (int i = 0; i < altDepth; i++) { + sumOfQualities += qualityRW.draw(); + } + if (isCalled(depth, altDepth, (double) sumOfQualities / (double) altDepth, alleleFraction, logOddsThreshold)) { + calledVariants++; + } + } + return (double) calledVariants / sampleSize; + } + + /** + * Calculates the theoretical sensitivity with a given Phred-scaled quality score distribution at a constant + * depth. + * @param depth Depth to compute sensitivity at + * @param qualityDistribution Phred-scaled quality score distribution + * @param logOddsThreshold Log odds threshold necessary to exceed for variant to be called + * @param sampleSize the total number of simulations to run + * @param alleleFraction the allele fraction to evaluate sensitivity at + * @return + */ + public static double sensitivityAtConstantDepth(final int depth, final double[] qualityDistribution, final double logOddsThreshold, final int sampleSize, final double alleleFraction) { + return sensitivityAtConstantDepth(depth, qualityDistribution, logOddsThreshold, sampleSize, alleleFraction, randomSeed); + } + + /** + * Calculates the theoretical sensitivity with a given Phred-scaled quality score distribution and depth + * distribution. + * @param depthDistribution Depth distribution to compute theoretical sensitivity over + * @param qualityDistribution Phred-scaled quality score distribution + * @param sampleSize the total number of simulations to run + * @param logOddsThreshold Log odds threshold necessary to exceed for variant to be called + * @param alleleFraction the allele fraction to evaluate sensitivity at + * @return + */ + public static double theoreticalSensitivity(final double[] depthDistribution, final double[] qualityDistribution, + final int sampleSize, final double logOddsThreshold, final double alleleFraction) { + double sensitivity = 0.0; + for (int k = 0; k < depthDistribution.length; k++) { + if(k % 10 == 0) { + log.info("Calculting sensitivity at depth " + k + " of " + depthDistribution.length); + } + sensitivity += sensitivityAtConstantDepth(k, qualityDistribution, logOddsThreshold, sampleSize, alleleFraction) * depthDistribution[k]; + } + return sensitivity; + } + + /** + * Removes trailing zeros in a distribution. The purpose of this function is to prevent other + * functions from evaluating in regions where the distribution has zero probability. + * @param distribution Distribution of base qualities + * @return Distribution of base qualities removing any trailing zeros + */ + public static double[] trimDistribution(final double[] distribution) { + int endOfDistribution = 0; + + // Locate the index of the distribution where all the values remaining at + // larger indices are zero. + for(endOfDistribution = distribution.length-1;endOfDistribution >= 0;endOfDistribution--) { + if(distribution[endOfDistribution] != 0) { + break; + } + } + + // Remove trailing zeros. + final double[] trimmedDistribution = new double[endOfDistribution+1]; + for(int i = 0;i <= endOfDistribution;i++) { + trimmedDistribution[i] = distribution[i]; + } + + return trimmedDistribution; + } +} \ No newline at end of file diff --git a/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java new file mode 100644 index 000000000..a2ea50a6a --- /dev/null +++ b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java @@ -0,0 +1,33 @@ +package picard.analysis; + +import htsjdk.samtools.metrics.MetricBase; + +/** + * Created by fleharty on 6/30/17. + */ +public class TheoreticalSensitivityMetrics extends MetricBase { + /** Theoretical sensitivity at 0.1% allele fraction */ + public double SENSITIVITY_AT_0_1 = 0.0; + /** Theoretical sensitivity at 0.1% allele fraction */ + + /** Theoretical sensitivity at 0.5% allele fraction */ + public double SENSITIVITY_AT_0_5 = 0.0; + + /** Theoretical sensitivity at 1% allele fraction */ + public double SENSITIVITY_AT_01 = 0.0; + + /** Theoretical sensitivity at 2% allele fraction */ + public double SENSITIVITY_AT_02 = 0.0; + + /** Theoretical sensitivity at 5% allele fraction */ + public double SENSITIVITY_AT_05 = 0.0; + + /** Theoretical sensitivity at 10% allele fraction */ + public double SENSITIVITY_AT_10 = 0.0; + + /** Theoretical sensitivity at 30% allele fraction */ + public double SENSITIVITY_AT_30 = 0.0; + + /** Theoretical sensitivity at 50% allele fraction */ + public double SENSITIVITY_AT_50 = 0.0; +} \ No newline at end of file diff --git a/src/main/java/picard/analysis/directed/CollectHsMetrics.java b/src/main/java/picard/analysis/directed/CollectHsMetrics.java index 716de1d6c..2c3705411 100644 --- a/src/main/java/picard/analysis/directed/CollectHsMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectHsMetrics.java @@ -105,6 +105,9 @@ @Option(doc = "True if we are to clip overlapping reads, false otherwise.", optional=true, overridable = true) public boolean CLIP_OVERLAPPING_READS = true; + @Option(doc = "Output theoretical sensitivities for variable allele fraction.", optional=true) + public File TheoreticalSensitivityOutput; + @Override protected IntervalList getProbeIntervals() { for (final File file : BAIT_INTERVALS) IOUtil.assertFileIsReadable(file); diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java index c36974088..b3566bbd3 100644 --- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java +++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java @@ -207,7 +207,7 @@ public void testHetSensDistributions() throws Exception { } @Test(dataProvider = "hetSensDataProvider") - public void testHetSensTargeted(final double expected, final File metricsFile) throws Exception{ + public void testHetSensTargeted(final double expected, final File metricsFile) throws Exception { final double tolerance = 0.000_000_01; final MetricsFile Metrics = new MetricsFile(); @@ -216,8 +216,8 @@ public void testHetSensTargeted(final double expected, final File metricsFile) t final Histogram depthHistogram = histograms.get(0); final Histogram qualityHistogram = histograms.get(1); - final double [] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); - final double [] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); + final double[] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); final int sampleSize = 1_000; final double logOddsThreshold = 3.0; @@ -225,4 +225,108 @@ public void testHetSensTargeted(final double expected, final File metricsFile) t final double result = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, logOddsThreshold); Assert.assertEquals(result, expected, tolerance); } + + @DataProvider(name = "TheoreticalSensitivityConstantDepthDataProvider") + public Object[][] fractionalAlleleSensDataProvider() { + final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); + final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics"); + + //These magic numbers come from a separate implementation of the code in R. + return new Object[][]{ + // expected sensitivity, metrics file, allele fraction, constant depth, sample size. + {1.00, wgsMetricsFile, .5, 30, 10000}, + {1.00, wgsMetricsFile, .5, 30, 10000}, + {0.78, targetedMetricsFile, .1, 30, 10000}, + {0.26, targetedMetricsFile, 0.1, 10, 10000} + }; + } + + @Test(dataProvider = "TheoreticalSensitivityConstantDepthDataProvider") + public void testSensitivityAtConstantDepth(final double expected, final File metricsFile, final double alleleFraction, final int depth, final int sampleSize) throws Exception { + // This tests Theoretical Sensitivity assuming a uniform depth with a distribution of base quality scores. + // Because this only tests sensitivity at a constant depth, we use this for testing the code at high depths. + final double tolerance = 0.01; + final MetricsFile Metrics = new MetricsFile(); + Metrics.read(new FileReader(metricsFile)); + final List histograms = Metrics.getAllHistograms(); + final Histogram qualityHistogram = histograms.get(1); + + final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); + + // We ensure that even using different random seeds we converge to roughly the same value. + for(int i = 0;i < 3;i++) { + double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityDistribution, 3, sampleSize, alleleFraction, i); + Assert.assertEquals(result, expected, tolerance); + } + } + + @DataProvider(name = "TheoreticalSensitivityDataProvider") + public Object[][] arbFracSensDataProvider() { + final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); + + // This test acts primarily as an intergration test. The sample size of 100 + // is not quite large enough to converge properly + return new Object[][] { + {0.90, wgsMetricsFile, .5, 100}, + {0.77, wgsMetricsFile, .3, 100}, + {0.29, wgsMetricsFile, .1, 100}, + {0.08, wgsMetricsFile, .05, 100}, + }; + } + + @Test(dataProvider = "TheoreticalSensitivityDataProvider") + public void testSensitivity(final double expected, final File metricsFile, final double alleleFraction, final int sampleSize) throws Exception { + // This tests Theoretical Sensitivity using distributions on both base quality scores + // and the depth histogram. + + // We use a pretty forgiving tolerance here because for these tests + // we are not using large enough sample sizes to converge. + final double tolerance = 0.02; + + final MetricsFile Metrics = new MetricsFile(); + Metrics.read(new FileReader(metricsFile)); + final List histograms = Metrics.getAllHistograms(); + final Histogram depthHistogram = histograms.get(0); + final Histogram qualityHistogram = histograms.get(1); + + final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); + final double[] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + + final double result = TheoreticalSensitivity.theoreticalSensitivity(depthDistribution, qualityDistribution, sampleSize, 3, alleleFraction); + + Assert.assertEquals(result, expected, tolerance); + } + + @DataProvider(name = "equivalanceHetVsArbitrary") + public Object[][] equivalenceHetVsFull() { + final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); + final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics"); + + return new Object[][] { + // The sample sizes chosen here for these tests are smaller than what would normally be used + // in order to keep the test time low. It should be noted that for larger sample sizes + // the values converge. + {wgsMetricsFile, 0.01, 300}, + {targetedMetricsFile, 0.01, 50} + }; + } + + @Test (dataProvider = "equivalanceHetVsArbitrary") + public void testHetVsArbitrary(final File metricsFile, final double tolerance, final int sampleSize) throws Exception { + // This test compares Theoretical Sensitivity for arbitrary allele fractions with the theoretical het sensitivity + // model. Since allele fraction of 0.5 is equivalent to a het, these should provide the same answer. + final MetricsFile Metrics = new MetricsFile(); + Metrics.read(new FileReader(metricsFile)); + final List histograms = Metrics.getAllHistograms(); + final Histogram depthHistogram = histograms.get(0); + final Histogram qualityHistogram = histograms.get(1); + + final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); + final double[] depthDistribution = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + + final double resultFromTS = TheoreticalSensitivity.theoreticalSensitivity(depthDistribution, qualityDistribution, sampleSize, 3, 0.5); + final double resultFromTHS = TheoreticalSensitivity.hetSNPSensitivity(depthDistribution, qualityDistribution, sampleSize, 3); + + Assert.assertEquals(resultFromTHS, resultFromTS, tolerance); + } } From 76c97bfb0f3dba0d7877c295c925a96a3e884746 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Thu, 6 Jul 2017 18:55:41 -0400 Subject: [PATCH 2/7] Comitting for the evening. This starts to add TS to CollectHsMetrics... test should fail. --- .../java/picard/analysis/CollectWgsMetrics.java | 24 ++-------------- .../picard/analysis/TheoreticalSensitivity.java | 32 +++++++++++++++++++--- .../analysis/TheoreticalSensitivityMetrics.java | 27 +++--------------- .../picard/analysis/directed/CollectHsMetrics.java | 2 +- .../analysis/directed/TargetMetricsCollector.java | 5 ++++ 5 files changed, 40 insertions(+), 50 deletions(-) diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index 62a36e9d7..f482daff1 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -47,6 +47,7 @@ import java.io.File; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashSet; import java.util.List; @@ -471,28 +472,7 @@ protected int doWork() { out.write(OUTPUT); if(THEORETICAL_SENSITIVITY_OUTPUT != null) { - final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(collector.getUnfilteredDepthHistogram()); - final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(collector.getUnfilteredBaseQHistogram()); - - collector.getUnfilteredBaseQHistogram(); - collector.getUnfilteredDepthHistogram(); - - TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics(); - int theoreticalHetSensitivitySampleSize = 10000; - - double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2. - theoreticalSensitivityMetrics.SENSITIVITY_AT_0_1 = TheoreticalSensitivity.theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, 0.001); - theoreticalSensitivityMetrics.SENSITIVITY_AT_0_5 = TheoreticalSensitivity.theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, 0.005); - theoreticalSensitivityMetrics.SENSITIVITY_AT_01 = TheoreticalSensitivity.theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, 0.01); - theoreticalSensitivityMetrics.SENSITIVITY_AT_02 = TheoreticalSensitivity.theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, 0.02); - theoreticalSensitivityMetrics.SENSITIVITY_AT_05 = TheoreticalSensitivity.theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, 0.05); - theoreticalSensitivityMetrics.SENSITIVITY_AT_10 = TheoreticalSensitivity.theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, 0.10); - theoreticalSensitivityMetrics.SENSITIVITY_AT_30 = TheoreticalSensitivity.theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, 0.30); - theoreticalSensitivityMetrics.SENSITIVITY_AT_50 = TheoreticalSensitivity.theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, 0.50); - - final MetricsFile tsOut = getMetricsFile(); - tsOut.addMetric(theoreticalSensitivityMetrics); - tsOut.write(THEORETICAL_SENSITIVITY_OUTPUT); + TheoreticalSensitivity.writeOutput(THEORETICAL_SENSITIVITY_OUTPUT, getMetricsFile(), collector.getUnfilteredDepthHistogram(),collector.getUnfilteredBaseQHistogram()); } return 0; diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 83c03507d..70c305051 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -24,15 +24,14 @@ package picard.analysis; +import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.Log; import picard.PicardException; import picard.util.MathUtil; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Random; +import java.io.File; +import java.util.*; /** * Created by David Benjamin on 5/13/15. @@ -337,4 +336,29 @@ public static double theoreticalSensitivity(final double[] depthDistribution, fi return trimmedDistribution; } + + public static void writeOutput(File theoreticalSensitivityOutput, MetricsFile tsOut, Histogram depthHistogram, Histogram baseQHistogram) { + if (theoreticalSensitivityOutput != null) { + final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram); + final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(baseQHistogram); + final Double alleleFractionValues[] = new Double[]{0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5}; + final List alleleFractions = new ArrayList<>(Arrays.asList(alleleFractionValues)); + + final TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics(); + final int theoreticalHetSensitivitySampleSize = 10000; + + theoreticalSensitivityMetrics.HET_SENSITIVITY_LOD3_0 = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, 3.0); + theoreticalSensitivityMetrics.HET_SENSITIVITY_LOD6_2 = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, 6.2); + double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2. + + Histogram sensitivityHistogram = new Histogram(); + for (Double alleleFraction : alleleFractions) { + sensitivityHistogram.increment(alleleFraction, TheoreticalSensitivity + .theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, alleleFraction)); + } + tsOut.addMetric(theoreticalSensitivityMetrics); + tsOut.addHistogram(sensitivityHistogram); + tsOut.write(theoreticalSensitivityOutput); + } + } } \ No newline at end of file diff --git a/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java index a2ea50a6a..464ad03ed 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java @@ -6,28 +6,9 @@ * Created by fleharty on 6/30/17. */ public class TheoreticalSensitivityMetrics extends MetricBase { - /** Theoretical sensitivity at 0.1% allele fraction */ - public double SENSITIVITY_AT_0_1 = 0.0; - /** Theoretical sensitivity at 0.1% allele fraction */ + /** Theoretical Het Sensitivity for log-odss of 6.2 **/ + public double HET_SENSITIVITY_LOD6_2; - /** Theoretical sensitivity at 0.5% allele fraction */ - public double SENSITIVITY_AT_0_5 = 0.0; - - /** Theoretical sensitivity at 1% allele fraction */ - public double SENSITIVITY_AT_01 = 0.0; - - /** Theoretical sensitivity at 2% allele fraction */ - public double SENSITIVITY_AT_02 = 0.0; - - /** Theoretical sensitivity at 5% allele fraction */ - public double SENSITIVITY_AT_05 = 0.0; - - /** Theoretical sensitivity at 10% allele fraction */ - public double SENSITIVITY_AT_10 = 0.0; - - /** Theoretical sensitivity at 30% allele fraction */ - public double SENSITIVITY_AT_30 = 0.0; - - /** Theoretical sensitivity at 50% allele fraction */ - public double SENSITIVITY_AT_50 = 0.0; + /** Theoretical Het Sensitivity for log-odss of 3.0 **/ + public double HET_SENSITIVITY_LOD3_0; } \ No newline at end of file diff --git a/src/main/java/picard/analysis/directed/CollectHsMetrics.java b/src/main/java/picard/analysis/directed/CollectHsMetrics.java index 2c3705411..00dc0e0df 100644 --- a/src/main/java/picard/analysis/directed/CollectHsMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectHsMetrics.java @@ -106,7 +106,7 @@ public boolean CLIP_OVERLAPPING_READS = true; @Option(doc = "Output theoretical sensitivities for variable allele fraction.", optional=true) - public File TheoreticalSensitivityOutput; + public File THEORETICAL_SENSITIVITY_OUTPUT; @Override protected IntervalList getProbeIntervals() { diff --git a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java index 1094cefdb..5d148b64f 100644 --- a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java +++ b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java @@ -124,6 +124,8 @@ private static final double LOG_ODDS_THRESHOLD = 3.0; + private final File theoreticalSensitivityOutput = null; + private final int minimumMappingQuality; private final int minimumBaseQuality; private final boolean clipOverlappingReads; @@ -625,6 +627,9 @@ public void finish() { calculateTargetCoverageMetrics(); calculateTheoreticalHetSensitivity(); + if(theoreticalSensitivityOutput != null) { + TheoreticalSensitivity.writeOutput(theoreticalSensitivityOutput, getMetricsFile(), unfilteredDepthHistogram, unfilteredBaseQHistogram); + } calculateGcMetrics(); emitPerBaseCoverageIfRequested(); } From 51075cd4ed403bf405de03426e291419aaeb86f6 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Sun, 9 Jul 2017 17:58:16 -0400 Subject: [PATCH 3/7] Adds ALLELE_FRACTION option and removes weird test that fails because of SAMRecordSetBuilder having random behavior. --- .../java/picard/analysis/CollectWgsMetrics.java | 10 +++++----- .../picard/analysis/TheoreticalSensitivity.java | 23 +++++++++++++--------- .../picard/analysis/directed/CollectHsMetrics.java | 3 --- .../analysis/directed/CollectTargetedMetrics.java | 14 +++++++++++++ .../analysis/directed/TargetMetricsCollector.java | 11 ++++++++--- .../directed/CollectTargetedMetricsTest.java | 1 + 6 files changed, 42 insertions(+), 20 deletions(-) diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index f482daff1..b3be8deee 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -46,10 +46,7 @@ import picard.util.MathUtil; import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.List; +import java.util.*; import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; @@ -122,6 +119,9 @@ @Option(doc="Output for Theoretical Sensitivity metrics. Default is null.", optional = true) public File THEORETICAL_SENSITIVITY_OUTPUT; + @Option(doc="Allele fraction to run theoretical sensitivity on.", optional = true) + public List ALLELE_FRACTION = new LinkedList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); + @Option(doc = "If true, fast algorithm is used.") public boolean USE_FAST_ALGORITHM = false; @@ -472,7 +472,7 @@ protected int doWork() { out.write(OUTPUT); if(THEORETICAL_SENSITIVITY_OUTPUT != null) { - TheoreticalSensitivity.writeOutput(THEORETICAL_SENSITIVITY_OUTPUT, getMetricsFile(), collector.getUnfilteredDepthHistogram(),collector.getUnfilteredBaseQHistogram()); + TheoreticalSensitivity.writeOutput(THEORETICAL_SENSITIVITY_OUTPUT, getMetricsFile(), SAMPLE_SIZE, collector.getUnfilteredDepthHistogram(),collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION); } return 0; diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 70c305051..e8089eab4 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -24,6 +24,7 @@ package picard.analysis; +import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.Log; @@ -41,7 +42,7 @@ private static final Log log = Log.getInstance(TheoreticalSensitivity.class); private static final int SAMPLING_MAX = 600; //prevent 'infinite' loops private static final int MAX_CONSIDERED_DEPTH = 10000; //no point in looking any deeper than this, otherwise GC overhead is too high. - private static int randomSeed = 51; + private static final int randomSeed = 51; /** * @param depthDistribution the probability of depth n is depthDistribution[n] for n = 0, 1. . . N - 1 @@ -305,6 +306,7 @@ public static double theoreticalSensitivity(final double[] depthDistribution, fi for (int k = 0; k < depthDistribution.length; k++) { if(k % 10 == 0) { log.info("Calculting sensitivity at depth " + k + " of " + depthDistribution.length); + log.info("Sample Size " + sampleSize); } sensitivity += sensitivityAtConstantDepth(k, qualityDistribution, logOddsThreshold, sampleSize, alleleFraction) * depthDistribution[k]; } @@ -337,24 +339,27 @@ public static double theoreticalSensitivity(final double[] depthDistribution, fi return trimmedDistribution; } - public static void writeOutput(File theoreticalSensitivityOutput, MetricsFile tsOut, Histogram depthHistogram, Histogram baseQHistogram) { + public static void writeOutput(File theoreticalSensitivityOutput, MetricsFile tsOut, int sampleSize, Histogram depthHistogram, Histogram baseQHistogram, List alleleFractions) { + System.out.println(depthHistogram); + System.out.println(baseQHistogram); + System.out.println(sampleSize); + if (theoreticalSensitivityOutput != null) { final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram); final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(baseQHistogram); - final Double alleleFractionValues[] = new Double[]{0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5}; - final List alleleFractions = new ArrayList<>(Arrays.asList(alleleFractionValues)); +// final Double alleleFractionValues[] = new Double[]{0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5}; +// final List alleleFractions = new ArrayList<>(Arrays.asList(alleleFractionValues)); final TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics(); - final int theoreticalHetSensitivitySampleSize = 10000; - theoreticalSensitivityMetrics.HET_SENSITIVITY_LOD3_0 = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, 3.0); - theoreticalSensitivityMetrics.HET_SENSITIVITY_LOD6_2 = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, 6.2); + theoreticalSensitivityMetrics.HET_SENSITIVITY_LOD3_0 = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, sampleSize, 3.0); + theoreticalSensitivityMetrics.HET_SENSITIVITY_LOD6_2 = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, sampleSize, 6.2); double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2. - Histogram sensitivityHistogram = new Histogram(); + Histogram sensitivityHistogram = new Histogram<>(); for (Double alleleFraction : alleleFractions) { sensitivityHistogram.increment(alleleFraction, TheoreticalSensitivity - .theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, theoreticalHetSensitivitySampleSize, logOddsThreshold, alleleFraction)); + .theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, sampleSize, logOddsThreshold, alleleFraction)); } tsOut.addMetric(theoreticalSensitivityMetrics); tsOut.addHistogram(sensitivityHistogram); diff --git a/src/main/java/picard/analysis/directed/CollectHsMetrics.java b/src/main/java/picard/analysis/directed/CollectHsMetrics.java index 00dc0e0df..716de1d6c 100644 --- a/src/main/java/picard/analysis/directed/CollectHsMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectHsMetrics.java @@ -105,9 +105,6 @@ @Option(doc = "True if we are to clip overlapping reads, false otherwise.", optional=true, overridable = true) public boolean CLIP_OVERLAPPING_READS = true; - @Option(doc = "Output theoretical sensitivities for variable allele fraction.", optional=true) - public File THEORETICAL_SENSITIVITY_OUTPUT; - @Override protected IntervalList getProbeIntervals() { for (final File file : BAIT_INTERVALS) IOUtil.assertFileIsReadable(file); diff --git a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java index 2ec89d370..7d5666a24 100644 --- a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java @@ -15,12 +15,16 @@ import htsjdk.samtools.util.ProgressLogger; import htsjdk.samtools.util.SequenceUtil; import picard.analysis.MetricAccumulationLevel; +import picard.analysis.TheoreticalSensitivity; +import picard.analysis.TheoreticalSensitivityMetrics; import picard.cmdline.CommandLineProgram; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; import picard.metrics.MultilevelMetrics; import java.io.File; +import java.util.Arrays; +import java.util.LinkedList; import java.util.List; import java.util.Set; @@ -98,6 +102,11 @@ protected abstract COLLECTOR makeCollector(final Set ac @Option(doc="Sample Size used for Theoretical Het Sensitivity sampling. Default is 10000.", optional = true) public int SAMPLE_SIZE=10000; + @Option(doc="Output for Theoretical Sensitivity metrics. Default is null.", optional = true) + public File THEORETICAL_SENSITIVITY_OUTPUT; + + @Option(doc="Allele fraction to run theoretical sensitivity on.", optional = true) + public List ALLELE_FRACTION = new LinkedList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); /** * Asserts that files are readable and writable and then fires off an * HsMetricsCalculator instance to do the real work. @@ -156,6 +165,11 @@ protected int doWork() { metrics.write(OUTPUT); + if(THEORETICAL_SENSITIVITY_OUTPUT != null) { + final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); + TheoreticalSensitivity.writeOutput(THEORETICAL_SENSITIVITY_OUTPUT, theoreticalSensitivityMetrics, SAMPLE_SIZE, collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION); + } + CloserUtil.close(reader); return 0; } diff --git a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java index 5d148b64f..dd522a24e 100644 --- a/src/main/java/picard/analysis/directed/TargetMetricsCollector.java +++ b/src/main/java/picard/analysis/directed/TargetMetricsCollector.java @@ -627,9 +627,6 @@ public void finish() { calculateTargetCoverageMetrics(); calculateTheoreticalHetSensitivity(); - if(theoreticalSensitivityOutput != null) { - TheoreticalSensitivity.writeOutput(theoreticalSensitivityOutput, getMetricsFile(), unfilteredDepthHistogram, unfilteredBaseQHistogram); - } calculateGcMetrics(); emitPerBaseCoverageIfRequested(); } @@ -926,6 +923,14 @@ public String toString() { return "TargetedMetricCollector(interval=" + interval + ", depths = [" + StringUtil.intValuesToString(this.depths) + "])"; } } + + public Histogram getBaseQualityHistogram() { + return unfilteredBaseQHistogram; + } + + public Histogram getDepthHistogram() { + return unfilteredDepthHistogram; + } } /** diff --git a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java index 7cd9b99e0..b1c7b9d64 100644 --- a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java +++ b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java @@ -15,6 +15,7 @@ import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import picard.analysis.TheoreticalSensitivityMetrics; import picard.cmdline.CommandLineProgramTest; import picard.sam.SortSam; import picard.util.TestNGUtil; From b9a44a80541091147bae620347bf6490e02df175 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Sun, 9 Jul 2017 19:47:47 -0400 Subject: [PATCH 4/7] Getting ready to send back to Yossi, this should pass tests, includes ALLELE_FRACTION option, and fixes random behavior from SamReacordSetBuilder --- .../picard/analysis/TheoreticalSensitivity.java | 18 +++---- .../directed/CollectTargetedMetricsTest.java | 60 ++++++++++++++++++++++ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index e8089eab4..87378ff4a 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -24,7 +24,6 @@ package picard.analysis; -import htsjdk.samtools.metrics.MetricBase; import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.Log; @@ -302,11 +301,13 @@ public static double sensitivityAtConstantDepth(final int depth, final double[] */ public static double theoreticalSensitivity(final double[] depthDistribution, final double[] qualityDistribution, final int sampleSize, final double logOddsThreshold, final double alleleFraction) { + if(alleleFraction > 1.0 || alleleFraction < 0.0) { + throw new PicardException("Allele fractions must be between 0 and 1."); + } double sensitivity = 0.0; for (int k = 0; k < depthDistribution.length; k++) { - if(k % 10 == 0) { - log.info("Calculting sensitivity at depth " + k + " of " + depthDistribution.length); - log.info("Sample Size " + sampleSize); + if(k % 100 == 0) { + log.info("Calculting sensitivity for allele fraction " + alleleFraction + " at depth " + k + " of " + depthDistribution.length); } sensitivity += sensitivityAtConstantDepth(k, qualityDistribution, logOddsThreshold, sampleSize, alleleFraction) * depthDistribution[k]; } @@ -339,16 +340,11 @@ public static double theoreticalSensitivity(final double[] depthDistribution, fi return trimmedDistribution; } - public static void writeOutput(File theoreticalSensitivityOutput, MetricsFile tsOut, int sampleSize, Histogram depthHistogram, Histogram baseQHistogram, List alleleFractions) { - System.out.println(depthHistogram); - System.out.println(baseQHistogram); - System.out.println(sampleSize); - + public static void writeOutput(final File theoreticalSensitivityOutput, final MetricsFile tsOut, final int sampleSize, + final Histogram depthHistogram, final Histogram baseQHistogram, final List alleleFractions) { if (theoreticalSensitivityOutput != null) { final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram); final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(baseQHistogram); -// final Double alleleFractionValues[] = new Double[]{0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5}; -// final List alleleFractions = new ArrayList<>(Arrays.asList(alleleFractionValues)); final TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics(); diff --git a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java index b1c7b9d64..c211b77ea 100644 --- a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java +++ b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java @@ -23,6 +23,9 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; import java.util.Random; public class CollectTargetedMetricsTest extends CommandLineProgramTest { @@ -34,6 +37,7 @@ private File outfile; private File perTargetOutfile; private final static int LENGTH = 99; + private final static int RANDOM_SEED = 51; final String referenceFile = "testdata/picard/quality/chrM.reference.fasta"; final String emptyIntervals = "testdata/picard/quality/chrM.empty.interval_list"; @@ -83,6 +87,7 @@ void setupBuilder() throws IOException { //Add to setBuilder final SAMRecordSetBuilder setBuilder = new SAMRecordSetBuilder(true, SAMFileHeader.SortOrder.coordinate); + setBuilder.setRandomSeed(RANDOM_SEED); setBuilder.setReadGroup(readGroupRecord); setBuilder.setUseNmFlag(true); setBuilder.setHeader(header); @@ -161,6 +166,61 @@ public void runCollectTargetedMetricsTest(final File input, final File outfile, } } + @DataProvider(name = "theoreticalSensitivityDataProvider") + public Object[][] theoreticalSensitivityDataProvider() { + + return new Object[][] { + // This test is primarily used as an integration test since theoretical sensitivity doesn't converge + // well with a sample size of 10. The sample size is set so low as to prevent the tests from taking + // too long to run. + {tempSamFile, outfile, perTargetOutfile, referenceFile, singleIntervals, 10, + Arrays.asList(0.01, 0.05, 0.10, 0.30, 0.50), // Allele fraction + Arrays.asList(0.01, 0.59, 0.87, 0.99, 0.99), // Expected sensitivity + 0.12, 0.94 + } + }; + } + + @Test(dataProvider = "theoreticalSensitivityDataProvider") + public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input, final File outfile, final File perTargetOutfile, final String referenceFile, + final String targetIntervals, final int sampleSize, final List alleleFractions, final List expectedSensitivities, + final double additionalAlleleFraction, final double additionalExpectedSensitivity) throws IOException { + + final String[] args = new String[] { + "TARGET_INTERVALS=" + targetIntervals, + "INPUT=" + input.getAbsolutePath(), + "OUTPUT=" + outfile.getAbsolutePath(), + "REFERENCE_SEQUENCE=" + referenceFile, + "PER_TARGET_COVERAGE=" + perTargetOutfile.getAbsolutePath(), + "LEVEL=ALL_READS", + "AMPLICON_INTERVALS=" + targetIntervals, + "ALLELE_FRACTION=" + additionalAlleleFraction, + "THEORETICAL_SENSITIVITY_OUTPUT=tso.metrics", + "SAMPLE_SIZE=" + sampleSize + }; + + Assert.assertEquals(runPicardCommandLine(args), 0); + + final MetricsFile output = new MetricsFile<>(); + output.read(new FileReader("tso.metrics")); + + Histogram h = output.getHistogram(); + + // Ensure that the sensitivity calculated for the option ALLELE_FRACTION is correct + Assert.assertEquals(h.get(additionalAlleleFraction).getValue(), additionalExpectedSensitivity, 0.01); + + // Ensure that the sensitivities calculated for the default allele fractions are correct. + Iterator alleleFraction = alleleFractions.iterator(); + Iterator expectedSensitivity = expectedSensitivities.iterator(); + Assert.assertEquals(alleleFractions.size(), expectedSensitivities.size(), "List of allele fractions to test is not the same size as the list of expected sensitivities."); + while(alleleFraction.hasNext() && expectedSensitivity.hasNext()) { + double af = h.get(alleleFraction.next()).getValue(); + double es = expectedSensitivity.next(); + + Assert.assertEquals(af, es, 0.01); + } + } + @Test() public void testRawBqDistributionWithSoftClips() throws IOException { From 6f04133edd21bb87910a58da286ad7afb13cf188 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 10 Jul 2017 11:26:00 -0400 Subject: [PATCH 5/7] Addressing Yossi's comments --- .../java/picard/analysis/CollectRawWgsMetrics.java | 2 -- .../java/picard/analysis/CollectWgsMetrics.java | 7 +++-- .../picard/analysis/TheoreticalSensitivity.java | 32 +++++++++++++++------- .../analysis/TheoreticalSensitivityMetrics.java | 4 --- .../analysis/directed/CollectTargetedMetrics.java | 11 +++----- .../analysis/TheoreticalSensitivityTest.java | 11 ++++---- .../directed/CollectTargetedMetricsTest.java | 8 +++--- 7 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/main/java/picard/analysis/CollectRawWgsMetrics.java b/src/main/java/picard/analysis/CollectRawWgsMetrics.java index bedc340cb..c647e112f 100644 --- a/src/main/java/picard/analysis/CollectRawWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectRawWgsMetrics.java @@ -30,8 +30,6 @@ import picard.cmdline.Option; import picard.cmdline.programgroups.Metrics; -import java.io.File; - import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; /** diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index b3be8deee..839943b47 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -137,7 +137,7 @@ private SAMFileHeader header = null; private final Log log = Log.getInstance(CollectWgsMetrics.class); - private static final double LOG_ODDS_THRESHOLD = 3; + private static final double LOG_ODDS_THRESHOLD = 3.0; /** Metrics for evaluating the performance of whole genome sequencing experiments. */ public static class WgsMetrics extends MergeableMetricBase { @@ -435,6 +435,9 @@ protected int doWork() { if (INTERVALS != null) { IOUtil.assertFileIsReadable(INTERVALS); } + if (THEORETICAL_SENSITIVITY_OUTPUT != null) { + IOUtil.assertFileIsWritable(THEORETICAL_SENSITIVITY_OUTPUT); + } // it doesn't make sense for the locus accumulation cap to be lower than the coverage cap if (LOCUS_ACCUMULATION_CAP < COVERAGE_CAP) { @@ -471,7 +474,7 @@ protected int doWork() { processor.addToMetricsFile(out, INCLUDE_BQ_HISTOGRAM, dupeFilter, mapqFilter, pairFilter); out.write(OUTPUT); - if(THEORETICAL_SENSITIVITY_OUTPUT != null) { + if (THEORETICAL_SENSITIVITY_OUTPUT != null) { TheoreticalSensitivity.writeOutput(THEORETICAL_SENSITIVITY_OUTPUT, getMetricsFile(), SAMPLE_SIZE, collector.getUnfilteredDepthHistogram(),collector.getUnfilteredBaseQHistogram(), ALLELE_FRACTION); } diff --git a/src/main/java/picard/analysis/TheoreticalSensitivity.java b/src/main/java/picard/analysis/TheoreticalSensitivity.java index 87378ff4a..aca64c6cf 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivity.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivity.java @@ -301,12 +301,12 @@ public static double sensitivityAtConstantDepth(final int depth, final double[] */ public static double theoreticalSensitivity(final double[] depthDistribution, final double[] qualityDistribution, final int sampleSize, final double logOddsThreshold, final double alleleFraction) { - if(alleleFraction > 1.0 || alleleFraction < 0.0) { + if (alleleFraction > 1.0 || alleleFraction < 0.0) { throw new PicardException("Allele fractions must be between 0 and 1."); } double sensitivity = 0.0; for (int k = 0; k < depthDistribution.length; k++) { - if(k % 100 == 0) { + if (k % 100 == 0) { log.info("Calculting sensitivity for allele fraction " + alleleFraction + " at depth " + k + " of " + depthDistribution.length); } sensitivity += sensitivityAtConstantDepth(k, qualityDistribution, logOddsThreshold, sampleSize, alleleFraction) * depthDistribution[k]; @@ -325,38 +325,50 @@ public static double theoreticalSensitivity(final double[] depthDistribution, fi // Locate the index of the distribution where all the values remaining at // larger indices are zero. - for(endOfDistribution = distribution.length-1;endOfDistribution >= 0;endOfDistribution--) { - if(distribution[endOfDistribution] != 0) { + for (endOfDistribution = distribution.length-1;endOfDistribution >= 0;endOfDistribution--) { + if (distribution[endOfDistribution] != 0) { break; } } // Remove trailing zeros. final double[] trimmedDistribution = new double[endOfDistribution+1]; - for(int i = 0;i <= endOfDistribution;i++) { + for (int i = 0;i <= endOfDistribution;i++) { trimmedDistribution[i] = distribution[i]; } return trimmedDistribution; } + /** + * This is a utility function + * @param theoreticalSensitivityOutput File to save to results ot theoretical sensitivity. + * @param tsOut MetricsFile object to save results of theoretical sensitivity to. + * @param sampleSize Number of samples to take for each depth. + * @param depthHistogram Histogram of depth distribution for sample. + * @param baseQHistogram Histogram of Phred-scaled quality scores. + * @param alleleFractions Allele fractions + */ public static void writeOutput(final File theoreticalSensitivityOutput, final MetricsFile tsOut, final int sampleSize, final Histogram depthHistogram, final Histogram baseQHistogram, final List alleleFractions) { if (theoreticalSensitivityOutput != null) { + final double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2. final double[] depthDoubleArray = TheoreticalSensitivity.normalizeHistogram(depthHistogram); final double[] baseQDoubleArray = TheoreticalSensitivity.normalizeHistogram(baseQHistogram); final TheoreticalSensitivityMetrics theoreticalSensitivityMetrics = new TheoreticalSensitivityMetrics(); - theoreticalSensitivityMetrics.HET_SENSITIVITY_LOD3_0 = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, sampleSize, 3.0); - theoreticalSensitivityMetrics.HET_SENSITIVITY_LOD6_2 = TheoreticalSensitivity.hetSNPSensitivity(depthDoubleArray, baseQDoubleArray, sampleSize, 6.2); - double logOddsThreshold = 6.2; // This threshold is used because it is the value used for MuTect2. - - Histogram sensitivityHistogram = new Histogram<>(); + // For each allele fraction in alleleFractions calculate theoretical sensitivity and add the results + // to the histogram sensitivityHistogram. + final Histogram sensitivityHistogram = new Histogram<>(); + sensitivityHistogram.setBinLabel("allele_fraction"); + sensitivityHistogram.setValueLabel("theoretical_sensitivity"); for (Double alleleFraction : alleleFractions) { sensitivityHistogram.increment(alleleFraction, TheoreticalSensitivity .theoreticalSensitivity(depthDoubleArray, baseQDoubleArray, sampleSize, logOddsThreshold, alleleFraction)); } + + // Write out results to file. tsOut.addMetric(theoreticalSensitivityMetrics); tsOut.addHistogram(sensitivityHistogram); tsOut.write(theoreticalSensitivityOutput); diff --git a/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java index 464ad03ed..d5ac907bd 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java @@ -6,9 +6,5 @@ * Created by fleharty on 6/30/17. */ public class TheoreticalSensitivityMetrics extends MetricBase { - /** Theoretical Het Sensitivity for log-odss of 6.2 **/ - public double HET_SENSITIVITY_LOD6_2; - /** Theoretical Het Sensitivity for log-odss of 3.0 **/ - public double HET_SENSITIVITY_LOD3_0; } \ No newline at end of file diff --git a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java index 7d5666a24..43b58489c 100644 --- a/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java +++ b/src/main/java/picard/analysis/directed/CollectTargetedMetrics.java @@ -23,10 +23,7 @@ import picard.metrics.MultilevelMetrics; import java.io.File; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; +import java.util.*; import static picard.cmdline.StandardOptionDefinitions.MINIMUM_MAPPING_QUALITY_SHORT_NAME; @@ -102,11 +99,11 @@ protected abstract COLLECTOR makeCollector(final Set ac @Option(doc="Sample Size used for Theoretical Het Sensitivity sampling. Default is 10000.", optional = true) public int SAMPLE_SIZE=10000; - @Option(doc="Output for Theoretical Sensitivity metrics. Default is null.", optional = true) + @Option(doc="Output for Theoretical Sensitivity metrics where the allele fractions are provided by the ALLELE_FRACTION argument. Default is null.", optional = true) public File THEORETICAL_SENSITIVITY_OUTPUT; @Option(doc="Allele fraction to run theoretical sensitivity on.", optional = true) - public List ALLELE_FRACTION = new LinkedList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); + public List ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); /** * Asserts that files are readable and writable and then fires off an * HsMetricsCalculator instance to do the real work. @@ -165,7 +162,7 @@ protected int doWork() { metrics.write(OUTPUT); - if(THEORETICAL_SENSITIVITY_OUTPUT != null) { + if (THEORETICAL_SENSITIVITY_OUTPUT != null) { final MetricsFile theoreticalSensitivityMetrics = getMetricsFile(); TheoreticalSensitivity.writeOutput(THEORETICAL_SENSITIVITY_OUTPUT, theoreticalSensitivityMetrics, SAMPLE_SIZE, collector.getDepthHistogram(), collector.getBaseQualityHistogram(), ALLELE_FRACTION); } diff --git a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java index b3566bbd3..8ce0a217e 100644 --- a/src/test/java/picard/analysis/TheoreticalSensitivityTest.java +++ b/src/test/java/picard/analysis/TheoreticalSensitivityTest.java @@ -231,9 +231,9 @@ public void testHetSensTargeted(final double expected, final File metricsFile) t final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); final File targetedMetricsFile = new File(TEST_DIR, "test_25103070136.targeted_pcr_metrics"); - //These magic numbers come from a separate implementation of the code in R. + // These magic numbers come from a separate implementation of the code in R. return new Object[][]{ - // expected sensitivity, metrics file, allele fraction, constant depth, sample size. + // Expected sensitivity, metrics file, allele fraction, constant depth, sample size. {1.00, wgsMetricsFile, .5, 30, 10000}, {1.00, wgsMetricsFile, .5, 30, 10000}, {0.78, targetedMetricsFile, .1, 30, 10000}, @@ -254,7 +254,7 @@ public void testSensitivityAtConstantDepth(final double expected, final File met final double[] qualityDistribution = TheoreticalSensitivity.normalizeHistogram(qualityHistogram); // We ensure that even using different random seeds we converge to roughly the same value. - for(int i = 0;i < 3;i++) { + for (int i = 0;i < 3;i++) { double result = TheoreticalSensitivity.sensitivityAtConstantDepth(depth, qualityDistribution, 3, sampleSize, alleleFraction, i); Assert.assertEquals(result, expected, tolerance); } @@ -265,7 +265,8 @@ public void testSensitivityAtConstantDepth(final double expected, final File met final File wgsMetricsFile = new File(TEST_DIR, "test_Solexa-332667.wgs_metrics"); // This test acts primarily as an intergration test. The sample size of 100 - // is not quite large enough to converge properly + // is not quite large enough to converge properly, but is used for the purpose of + // keeping the compute time of the tests short. return new Object[][] { {0.90, wgsMetricsFile, .5, 100}, {0.77, wgsMetricsFile, .3, 100}, @@ -311,7 +312,7 @@ public void testSensitivity(final double expected, final File metricsFile, final }; } - @Test (dataProvider = "equivalanceHetVsArbitrary") + @Test(dataProvider = "equivalanceHetVsArbitrary") public void testHetVsArbitrary(final File metricsFile, final double tolerance, final int sampleSize) throws Exception { // This test compares Theoretical Sensitivity for arbitrary allele fractions with the theoretical het sensitivity // model. Since allele fraction of 0.5 is equivalent to a het, these should provide the same answer. diff --git a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java index c211b77ea..bbf161134 100644 --- a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java +++ b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java @@ -204,7 +204,7 @@ public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input final MetricsFile output = new MetricsFile<>(); output.read(new FileReader("tso.metrics")); - Histogram h = output.getHistogram(); + final Histogram h = output.getHistogram(); // Ensure that the sensitivity calculated for the option ALLELE_FRACTION is correct Assert.assertEquals(h.get(additionalAlleleFraction).getValue(), additionalExpectedSensitivity, 0.01); @@ -213,9 +213,9 @@ public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input Iterator alleleFraction = alleleFractions.iterator(); Iterator expectedSensitivity = expectedSensitivities.iterator(); Assert.assertEquals(alleleFractions.size(), expectedSensitivities.size(), "List of allele fractions to test is not the same size as the list of expected sensitivities."); - while(alleleFraction.hasNext() && expectedSensitivity.hasNext()) { - double af = h.get(alleleFraction.next()).getValue(); - double es = expectedSensitivity.next(); + while (alleleFraction.hasNext() && expectedSensitivity.hasNext()) { + final double af = h.get(alleleFraction.next()).getValue(); + final double es = expectedSensitivity.next(); Assert.assertEquals(af, es, 0.01); } From cc085ff82b3ba4ddf7a7164d5ac7a203d1bb10fb Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 10 Jul 2017 11:31:13 -0400 Subject: [PATCH 6/7] Forgot to use ArrayList in both Targeted and WGS metrics. Fixed. --- src/main/java/picard/analysis/CollectWgsMetrics.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/picard/analysis/CollectWgsMetrics.java b/src/main/java/picard/analysis/CollectWgsMetrics.java index 839943b47..3f766a82c 100644 --- a/src/main/java/picard/analysis/CollectWgsMetrics.java +++ b/src/main/java/picard/analysis/CollectWgsMetrics.java @@ -120,7 +120,7 @@ public File THEORETICAL_SENSITIVITY_OUTPUT; @Option(doc="Allele fraction to run theoretical sensitivity on.", optional = true) - public List ALLELE_FRACTION = new LinkedList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); + public List ALLELE_FRACTION = new ArrayList<>(Arrays.asList(0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.5)); @Option(doc = "If true, fast algorithm is used.") public boolean USE_FAST_ALGORITHM = false; From 42da9af327d1768ccf7131b1779a08c6c95ba891 Mon Sep 17 00:00:00 2001 From: Mark Fleharty Date: Mon, 10 Jul 2017 15:17:49 -0400 Subject: [PATCH 7/7] Fixed MetricFile issue with dummy field. Also added delete on exit for a metric file so that it gets cleaned up. --- .../java/picard/analysis/TheoreticalSensitivityMetrics.java | 3 ++- .../picard/analysis/directed/CollectTargetedMetricsTest.java | 11 +++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java index d5ac907bd..1b6cc1003 100644 --- a/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java +++ b/src/main/java/picard/analysis/TheoreticalSensitivityMetrics.java @@ -6,5 +6,6 @@ * Created by fleharty on 6/30/17. */ public class TheoreticalSensitivityMetrics extends MetricBase { - + /** This is a dummy value that allows for this to be read properly from a file. */ + public int DUMMY_FIELD = 0; } \ No newline at end of file diff --git a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java index bbf161134..d4bcf4a1d 100644 --- a/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java +++ b/src/test/java/picard/analysis/directed/CollectTargetedMetricsTest.java @@ -35,6 +35,7 @@ private File tempSamFile; private File tempSamFileIndex; private File outfile; + private File tsOutfile; // Theoretical sensitivity file private File perTargetOutfile; private final static int LENGTH = 99; private final static int RANDOM_SEED = 51; @@ -127,8 +128,10 @@ void setupBuilder() throws IOException { //create output files for tests outfile = File.createTempFile("test", ".TargetedMetrics_Coverage"); perTargetOutfile = File.createTempFile("perTarget", ".perTargetCoverage"); + tsOutfile = File.createTempFile("test", ".TheoreticalSensitivityMetrics"); outfile.deleteOnExit(); perTargetOutfile.deleteOnExit(); + tsOutfile.deleteOnExit(); } @DataProvider(name = "targetedIntervalDataProvider") @@ -173,7 +176,7 @@ public void runCollectTargetedMetricsTest(final File input, final File outfile, // This test is primarily used as an integration test since theoretical sensitivity doesn't converge // well with a sample size of 10. The sample size is set so low as to prevent the tests from taking // too long to run. - {tempSamFile, outfile, perTargetOutfile, referenceFile, singleIntervals, 10, + {tempSamFile, outfile, tsOutfile, perTargetOutfile, referenceFile, singleIntervals, 10, Arrays.asList(0.01, 0.05, 0.10, 0.30, 0.50), // Allele fraction Arrays.asList(0.01, 0.59, 0.87, 0.99, 0.99), // Expected sensitivity 0.12, 0.94 @@ -182,7 +185,7 @@ public void runCollectTargetedMetricsTest(final File input, final File outfile, } @Test(dataProvider = "theoreticalSensitivityDataProvider") - public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input, final File outfile, final File perTargetOutfile, final String referenceFile, + public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input, final File outfile, final File tsOutfile, final File perTargetOutfile, final String referenceFile, final String targetIntervals, final int sampleSize, final List alleleFractions, final List expectedSensitivities, final double additionalAlleleFraction, final double additionalExpectedSensitivity) throws IOException { @@ -195,14 +198,14 @@ public void runCollectTargetedMetricsTheoreticalSensitivityTest(final File input "LEVEL=ALL_READS", "AMPLICON_INTERVALS=" + targetIntervals, "ALLELE_FRACTION=" + additionalAlleleFraction, - "THEORETICAL_SENSITIVITY_OUTPUT=tso.metrics", + "THEORETICAL_SENSITIVITY_OUTPUT=" + tsOutfile.getAbsolutePath(), "SAMPLE_SIZE=" + sampleSize }; Assert.assertEquals(runPicardCommandLine(args), 0); final MetricsFile output = new MetricsFile<>(); - output.read(new FileReader("tso.metrics")); + output.read(new FileReader(tsOutfile.getAbsolutePath())); final Histogram h = output.getHistogram();