diff --git a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java index adef082f7..9a2c338f4 100644 --- a/src/main/java/picard/fingerprint/CrosscheckFingerprints.java +++ b/src/main/java/picard/fingerprint/CrosscheckFingerprints.java @@ -252,13 +252,9 @@ private void writeMatrix() { }; } - /** - * Method that combines the fingerprint evidence across all the read groups for the same sample - * and then produces a matrix of LOD scores for comparing every sample with every other sample. - */ - private int crossCheckGrouped(final Map fingerprints, final List metrics, - final Function by, - CrosscheckMetric.DataType type) { + static public Map mergeFingerprintsBy( + final Map fingerprints, + final Function by){ // collect the various entries according to the grouping "by" @@ -267,7 +263,7 @@ private int crossCheckGrouped(final Map finge .stream() .collect(Collectors.groupingBy(entry -> by.apply(entry.getKey()))); - final Map fingerprintsByGroup = collection.entrySet().stream() + return collection.entrySet().stream() .collect(Collectors.toMap( entry -> { // merge the keys (unequal values are eliminated by merge). @@ -287,6 +283,18 @@ private int crossCheckGrouped(final Map finge return sampleFp; })); + } + + + /** + * Method that combines the fingerprint evidence across all the read groups for the same sample + * and then produces a matrix of LOD scores for comparing every sample with every other sample. + */ + private int crossCheckGrouped(final Map fingerprints, final List metrics, + final Function by, + final CrosscheckMetric.DataType type) { + + final Map fingerprintsByGroup = mergeFingerprintsBy(fingerprints, by); if (MATRIX_OUTPUT != null) { crosscheckMatrix = new double[fingerprintsByGroup.size()][]; diff --git a/src/main/java/picard/fingerprint/FingerprintChecker.java b/src/main/java/picard/fingerprint/FingerprintChecker.java index c6efbaafd..d36e41a97 100644 --- a/src/main/java/picard/fingerprint/FingerprintChecker.java +++ b/src/main/java/picard/fingerprint/FingerprintChecker.java @@ -66,7 +66,7 @@ public ValidationStringency getValidationStringency() { return validationStringency; } - public void setValidationStringency(ValidationStringency validationStringency) { + public void setValidationStringency(final ValidationStringency validationStringency) { this.validationStringency = validationStringency; } @@ -116,7 +116,7 @@ public SAMFileHeader getHeader() { * Sets whether duplicate reads should be allowed when calling genotypes from SAM files. This is * useful when comparing read groups within a SAM file and individual read groups show artifactually * high duplication (e.g. a single-ended read group mixed in with paired-end read groups). - * @param allowDuplicateReads + * @param allowDuplicateReads should fingerprinting use duplicate reads? */ public void setAllowDuplicateReads(final boolean allowDuplicateReads) { this.allowDuplicateReads = allowDuplicateReads; @@ -185,7 +185,7 @@ public void setpLossofHet(final double pLossofHet) { } try { getFingerprintFromVc(fingerprints, ctx); - } catch (IllegalArgumentException e) { + } catch (final IllegalArgumentException e) { log.warn("There was a genotyping error in File: " + fingerprintFile + "\n" + e.getMessage()); } } @@ -208,7 +208,7 @@ public void setpLossofHet(final double pLossofHet) { SequenceUtil.assertSequenceDictionariesEqual(this.haplotypes.getHeader().getSequenceDictionary(), VCFFileReader.getSequenceDictionary(fingerprintFile)); - SortedSet snps = new TreeSet<>(haplotypes.getAllSnps()); + final SortedSet snps = new TreeSet<>(haplotypes.getAllSnps()); final Map fingerprints = new HashMap<>(); Set samples = null; @@ -233,7 +233,7 @@ public void setpLossofHet(final double pLossofHet) { } try { getFingerprintFromVc(fingerprints, ctx); - } catch (IllegalArgumentException e) { + } catch (final IllegalArgumentException e) { log.warn("There was a genotyping error in File: " + fingerprintFile + "\n" + e.getMessage()); } } @@ -247,7 +247,7 @@ public void setpLossofHet(final double pLossofHet) { * @param fingerprints a map from Sample to fingerprint * @param ctx the VC from which to extract (part of ) a fingerprint */ - private void getFingerprintFromVc(Map fingerprints, VariantContext ctx) throws IllegalArgumentException { + private void getFingerprintFromVc(final Map fingerprints, final VariantContext ctx) throws IllegalArgumentException { if (!isUsableSnp(ctx)) { return; } @@ -360,12 +360,12 @@ public IntervalList getLociToGenotype(final Collection fingerprints } public Map fingerprintVcf(final File vcfFile) { - Map fpIdMap = new HashMap<>(); + final Map fpIdMap = new HashMap<>(); final Map< String, Fingerprint> sampleFpMap = loadFingerprints(vcfFile, null); sampleFpMap.forEach((key, value) -> { - FingerprintIdDetails fpId = new FingerprintIdDetails(); + final FingerprintIdDetails fpId = new FingerprintIdDetails(); fpId.sample = key; fpId.file = vcfFile.getAbsolutePath(); @@ -380,11 +380,11 @@ public IntervalList getLociToGenotype(final Collection fingerprints */ public Map fingerprintSamFile(final File samFile, final IntervalList loci) { final SamReader in = SamReaderFactory.makeDefault() - .enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES) - .open(samFile); + .enable(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES) + .open(samFile); SequenceUtil.assertSequenceDictionariesEqual(this.haplotypes.getHeader().getSequenceDictionary(), - in.getFileHeader().getSequenceDictionary()); + in.getFileHeader().getSequenceDictionary()); final SamLocusIterator iterator = new SamLocusIterator(in, loci, in.hasIndex()); iterator.setEmitUncoveredLoci(true); @@ -400,18 +400,19 @@ public IntervalList getLociToGenotype(final Collection fingerprints iterator.setSamFilters(filters); } + final Map fingerprintIdDetailsMap = new HashMap<>(); final Map fingerprintsByReadGroup = new HashMap<>(); - final Collection rgs = in.getFileHeader().getReadGroups().stream().map(rg-> - {FingerprintIdDetails id = new FingerprintIdDetails(rg.getPlatformUnit(), samFile.getAbsolutePath()); - id.library = rg.getLibrary(); - id.sample = rg.getSample(); - return id;}).collect(Collectors.toSet()); - - for (final FingerprintIdDetails rg : rgs) { - final Fingerprint fingerprint = new Fingerprint(rg.sample, - samFile, - rg.platformUnit); - fingerprintsByReadGroup.put(rg, fingerprint); + + for (final SAMReadGroupRecord rg : in.getFileHeader().getReadGroups()) { + final FingerprintIdDetails id = new FingerprintIdDetails(rg.getPlatformUnit(), samFile.getAbsolutePath()); + id.library = rg.getLibrary(); + id.sample = rg.getSample(); + fingerprintIdDetailsMap.put(rg, id); + + final Fingerprint fingerprint = new Fingerprint(id.sample, + samFile, + id.platformUnit); + fingerprintsByReadGroup.put(id, fingerprint); for (final HaplotypeBlock h : this.haplotypes.getHaplotypes()) { fingerprint.add(new HaplotypeProbabilitiesFromSequence(h)); @@ -425,7 +426,7 @@ public IntervalList getLociToGenotype(final Collection fingerprints // Now go through the data at each locus and figure stuff out! for (final SamLocusIterator.LocusInfo info : iterator) { - boolean loggedMissingRGTag=false; + // TODO: Filter out the locus if the allele balance doesn't make sense for either a // TODO: 50/50 het or a hom with some errors; in HS data with deep coverage any base // TODO: with major strand bias could cause errors @@ -437,43 +438,21 @@ public IntervalList getLociToGenotype(final Collection fingerprints for (final SamLocusIterator.RecordAndOffset rec : info.getRecordAndOffsets()) { final SAMReadGroupRecord rg = rec.getRecord().getReadGroup(); final FingerprintIdDetails details; - if (rg == null) { - final PicardException e = new PicardException("Found read with no readgroup: " + rec.getRecord().getReadName() + " in file: " + samFile); - if (validationStringency != ValidationStringency.STRICT) { - final SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(":::" + samFile.getAbsolutePath()); - readGroupRecord.setLibrary(""); - readGroupRecord.setSample(""); - readGroupRecord.setPlatformUnit(".0.ZZZ"); - - details = new FingerprintIdDetails(readGroupRecord, samFile.getAbsolutePath()); - if (!fingerprintsByReadGroup.containsKey(details)) { - Fingerprint fp = new Fingerprint(readGroupRecord.getSample(), samFile, readGroupRecord.getPlatformUnit()); - fingerprintsByReadGroup.put(details, fp); - - for (final HaplotypeBlock h : this.haplotypes.getHaplotypes()) { - fp.add(new HaplotypeProbabilitiesFromSequence(h)); - } - } - } else { - log.error(e); - throw e; - } + if (rg == null && !fingerprintIdDetailsMap.containsKey(rg)) { + final FingerprintIdDetails unknownFPDetails = createUnknownFP(samFile, rec.getRecord()); + fingerprintIdDetailsMap.put(null, unknownFPDetails); - if (validationStringency == ValidationStringency.LENIENT && !loggedMissingRGTag) { - log.warn(e); - log.warn("further messages from this file will be suppressed"); - loggedMissingRGTag = true; - } + final Fingerprint fp = new Fingerprint(unknownFPDetails.sample, new File(unknownFPDetails.file), unknownFPDetails.platformUnit); + fingerprintsByReadGroup.put(unknownFPDetails, fp); - } else { // rg is not null - details = new FingerprintIdDetails(rg, samFile.getAbsolutePath()); + for (final HaplotypeBlock h : this.haplotypes.getHaplotypes()) { + fp.add(new HaplotypeProbabilitiesFromSequence(h)); + } } - if (!fingerprintsByReadGroup.containsKey(details)) { - final PicardException e = new PicardException("Unknown read group: " + rg + " in file: " + samFile); - log.error(e); - throw e; - } else { + if (fingerprintIdDetailsMap.containsKey(rg)) { + details = fingerprintIdDetailsMap.get(rg); + final String readName = rec.getRecord().getReadName(); if (!usedReadNames.contains(readName)) { final HaplotypeProbabilitiesFromSequence probs = (HaplotypeProbabilitiesFromSequence) fingerprintsByReadGroup.get(details).get(haplotypeBlock); @@ -483,13 +462,38 @@ public IntervalList getLociToGenotype(final Collection fingerprints probs.addToProbs(snp, base, qual); usedReadNames.add(readName); } + } else { + final PicardException e = new PicardException("Unknown read group: " + rg + " in file: " + samFile); + log.error(e); + throw e; } + } } return fingerprintsByReadGroup; } + private FingerprintIdDetails createUnknownFP(final File samFile, final SAMRecord rec) { + final PicardException e = new PicardException("Found read with no readgroup: " + rec.getReadName() + " in file: " + samFile); + if (validationStringency != ValidationStringency.STRICT) { + final SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(":::" + samFile.getAbsolutePath()); + readGroupRecord.setLibrary(""); + readGroupRecord.setSample(""); + readGroupRecord.setPlatformUnit(".0.ZZZ"); + + if (validationStringency == ValidationStringency.LENIENT) { + log.warn(e); + log.warn("further messages from this file will be suppressed"); + } + + return new FingerprintIdDetails(readGroupRecord, samFile.getAbsolutePath()); + } else { + log.error(e); + throw e; + } + } + /** * Generates a per-sample Fingerprint for the contaminant in the supplied SAM file. * Data is aggregated by sample, not read-group. @@ -620,7 +624,7 @@ public IntervalList getLociToGenotype(final Collection fingerprints if (filesRead.incrementAndGet() % 100 == 0) { log.info("Processed " + filesRead.get() + " out of " + files.size()); } - } catch(Exception e){ + } catch(final Exception e){ log.warn("Exception thrown in thread:" + e.getMessage()); throw e; } @@ -644,26 +648,6 @@ public IntervalList getLociToGenotype(final Collection fingerprints } /** - * Takes a collection of fingerprints and, assuming that they are independent, merged the fingerprints - * by samples and totals up the probabilities. - */ - static public SortedMap mergeFingerprintsBySample(final Collection inputs) { - final SortedMap sampleFps = new TreeMap<>(); - for (final Fingerprint fp : inputs) { - Fingerprint sampleFp = sampleFps.get(fp.getSample()); - if (sampleFp == null) { - sampleFp = new Fingerprint(fp.getSample(), null, fp.getSample()); - sampleFps.put(fp.getSample(), sampleFp); - } - - sampleFp.merge(fp); - } - - return sampleFps; - } - - - /** * Top level method to take a set of one or more SAM files and one or more Genotype files and compare * each read group in each SAM file to each set of fingerprint genotypes. * @@ -755,7 +739,7 @@ public IntervalList getLociToGenotype(final Collection fingerprints for (final String sample : observedFingerprintsBySample.keySet()) { final FingerprintResults results = new FingerprintResults(f, null, sample); - for (Fingerprint expectedFp : expectedFingerprints) { + for (final Fingerprint expectedFp : expectedFingerprints) { final MatchResults result = calculateMatchResults(observedFingerprintsBySample.get(sample), expectedFp, 0, pLossofHet); results.addResults(result); } @@ -849,11 +833,11 @@ public static MatchResults calculateMatchResults(final Fingerprint observedFp, f return calculateMatchResults(observedFp, expectedFp, 0, 0); } - private static boolean isQueryable(SAMSequenceDictionary dict, VCFFileReader reader){ + private static boolean isQueryable(final SAMSequenceDictionary dict, final VCFFileReader reader){ try { reader.query(dict.getSequence(0).getSequenceName(),0,0); - } catch (RuntimeException e) { + } catch (final RuntimeException e) { return false; } return true; diff --git a/src/main/java/picard/fingerprint/FingerprintIdDetails.java b/src/main/java/picard/fingerprint/FingerprintIdDetails.java index 1d105236a..72d6b5c5d 100644 --- a/src/main/java/picard/fingerprint/FingerprintIdDetails.java +++ b/src/main/java/picard/fingerprint/FingerprintIdDetails.java @@ -47,24 +47,24 @@ public FingerprintIdDetails() {} - public FingerprintIdDetails(String platformUnit, String file) { + public FingerprintIdDetails(final String platformUnit, final String file) { getPlatformUnitDetails(platformUnit); this.platformUnit = platformUnit; this.file = file; } - public FingerprintIdDetails(final SAMReadGroupRecord rg, String file) { + public FingerprintIdDetails(final SAMReadGroupRecord rg, final String file) { this(rg.getPlatformUnit(), file); this.sample = rg.getSample(); this.library = rg.getLibrary(); } @Override - public boolean equals(Object o) { + public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; - FingerprintIdDetails that = (FingerprintIdDetails) o; + final FingerprintIdDetails that = (FingerprintIdDetails) o; if (platformUnit != null ? !platformUnit.equals(that.platformUnit) : that.platformUnit != null) return false; if (runBarcode != null ? !runBarcode.equals(that.runBarcode) : that.runBarcode != null) return false; @@ -89,7 +89,7 @@ public int hashCode() { return result; } - public FingerprintIdDetails merge(FingerprintIdDetails other) { + public FingerprintIdDetails merge(final FingerprintIdDetails other) { platformUnit = equalValueOrElse(platformUnit, other.platformUnit, multipleValuesString); runBarcode = equalValueOrElse(runBarcode, other.runBarcode, multipleValuesString); @@ -102,7 +102,7 @@ public FingerprintIdDetails merge(FingerprintIdDetails other) { return this; } - static private T equalValueOrElse(T lhs, T rhs, T orElse) { + static private T equalValueOrElse(final T lhs, final T rhs, final T orElse) { if (rhs == null) return lhs; if (lhs == null) return rhs; @@ -128,7 +128,7 @@ private void getPlatformUnitDetails(final String puString) { } else { throw new IllegalArgumentException("Unexpected format " + puString + " for PU attribute"); } - } catch (NumberFormatException e) { + } catch (final NumberFormatException e) { throw new IllegalArgumentException("Unexpected format " + puString + " for PU attribute"); } } diff --git a/src/test/java/picard/fingerprint/FingerprintCheckerTest.java b/src/test/java/picard/fingerprint/FingerprintCheckerTest.java index f64dad9fe..c78cc15f0 100644 --- a/src/test/java/picard/fingerprint/FingerprintCheckerTest.java +++ b/src/test/java/picard/fingerprint/FingerprintCheckerTest.java @@ -1,6 +1,5 @@ package picard.fingerprint; -import htsjdk.samtools.ValidationStringency; import org.testng.Assert; import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; @@ -85,7 +84,7 @@ public void testMatchResults(final double pLoH) { @DataProvider(name = "checkFingerprintsVcfDataProvider") public Object[][] testCheckFingerprintsVcfDataProvider() { - return new Object[][] { + return new Object[][]{ {new File(TEST_DATA_DIR, "NA12891.vcf"), new File(TEST_DATA_DIR, "NA12891.fp.vcf"), "NA12891", "NA12891", -0.02128, -1.026742, 1.005462}, {new File(TEST_DATA_DIR, "NA12892.vcf"), new File(TEST_DATA_DIR, "NA12892.fp.vcf"), "NA12892", "NA12892", -0.021945, -1.08308, 1.061135}, {new File(TEST_DATA_DIR, "NA12891.vcf"), new File(TEST_DATA_DIR, "NA12892.fp.vcf"), "NA12891", "NA12892", -5.941691, -1.026742, -4.914948}, @@ -94,8 +93,8 @@ public void testMatchResults(final double pLoH) { } @Test(dataProvider = "checkFingerprintsVcfDataProvider") - public void testCheckFingerprints(File vcfFile, File genotypesFile, String observedSampleAlias, String expectedSampleAlias, - double llExpectedSample, double llRandomSample, double lodExpectedSample) throws IOException { + public void testCheckFingerprints(final File vcfFile, final File genotypesFile, final String observedSampleAlias, final String expectedSampleAlias, + final double llExpectedSample, final double llRandomSample, final double lodExpectedSample) throws IOException { final File indexedInputVcf = VcfTestUtils.createTemporaryIndexedVcfFromInput(vcfFile, "fingerprintcheckertest.tmp."); final File indexedGenotypesVcf = VcfTestUtils.createTemporaryIndexedVcfFromInput(genotypesFile, "fingerprintcheckertest.tmp."); @@ -115,21 +114,19 @@ public void testCheckFingerprints(File vcfFile, File genotypesFile, String obs Assert.assertEquals(mr.getLOD(), lodExpectedSample, DELTA); } - @Test(dataProvider = "checkFingerprintsVcfDataProvider") - public void testFingerprintVcf(File vcfFile, File genotypesFile, String observedSampleAlias, String expectedSampleAlias, - double llExpectedSample, double llRandomSample, double lodExpectedSample) throws IOException { + public void testFingerprintVcf(final File vcfFile, final File genotypesFile, final String observedSampleAlias, final String expectedSampleAlias, + final double llExpectedSample, final double llRandomSample, final double lodExpectedSample) throws IOException { final FingerprintChecker fpChecker = new FingerprintChecker(SUBSETTED_HAPLOTYPE_DATABASE_FOR_TESTING); - Map fp1=fpChecker.fingerprintVcf(vcfFile); + final Map fp1 = fpChecker.fingerprintVcf(vcfFile); Assert.assertFalse(fp1.isEmpty()); } - @Test(expectedExceptions = RuntimeException.class) - public void testTerminateOnBadFile(){ + public void testTerminateOnBadFile() { final FingerprintChecker fpChecker = new FingerprintChecker(SUBSETTED_HAPLOTYPE_DATABASE_FOR_TESTING); - final File badSam= new File(TEST_DATA_DIR,"aligned_queryname_sorted.sam"); + final File badSam = new File(TEST_DATA_DIR, "aligned_queryname_sorted.sam"); fpChecker.fingerprintFiles(Collections.singletonList(badSam), 1, 1, TimeUnit.DAYS); } @@ -143,16 +140,22 @@ public void testTerminateOnBadFile(){ final File na12891_noRg = new File(TEST_DATA_DIR, "NA12891.over.fingerprints.noRgTag.sam"); return new Object[][]{ - {na12891_r1, na12891_r2, true}, - {na12892_r1, na12892_r2, true}, - {na12892_r1, na12891_r2, false}, - {na12892_r1, na12891_noRg, false}, - {na12891_r1, na12891_noRg, true}, + {na12891_r1, na12891_r2, true, true}, + {na12892_r1, na12892_r2, true, true}, + {na12892_r1, na12891_r2, false, true}, + {na12892_r1, na12891_noRg, false, true}, + {na12891_r1, na12891_noRg, true, true}, + + {na12891_r1, na12891_r2, true, false}, + {na12892_r1, na12892_r2, true, false}, + {na12892_r1, na12891_r2, false, false}, + {na12892_r1, na12891_noRg, false, false}, + {na12891_r1, na12891_noRg, true, false} }; } @Test(dataProvider = "checkFingerprintsSamDataProvider") - public void testCheckFingerprints(File samFile1, File samFile2, boolean expectedMatch) { + public void testCheckFingerprints(final File samFile1, final File samFile2, final boolean expectedMatch, final boolean silent) { final String[] args = { "EXPECT_ALL_GROUPS_TO_MATCH=true", @@ -160,10 +163,38 @@ public void testCheckFingerprints(File samFile1, File samFile2, boolean expected "H=" + SUBSETTED_HAPLOTYPE_DATABASE_FOR_TESTING.getAbsolutePath(), "I=" + samFile1.getAbsolutePath(), "I=" + samFile2.getAbsolutePath(), - "VALIDATION_STRINGENCY=LENIENT", + "VALIDATION_STRINGENCY=" + (silent ? "SILENT" : "LENIENT"), "CROSSCHECK_BY=FILE", }; Assert.assertEquals(new CrosscheckFingerprints().instanceMain(args), expectedMatch ? 0 : 1); } + + @DataProvider(name = "checkFingerprintsSamDataProviderFail") + public Object[][] testCheckFingerprintsSamDataProviderFail() { + final File na12891_r1 = new File(TEST_DATA_DIR, "NA12891.over.fingerprints.r1.sam"); + final File na12892_r1 = new File(TEST_DATA_DIR, "NA12892.over.fingerprints.r1.sam"); + final File na12891_noRg = new File(TEST_DATA_DIR, "NA12891.over.fingerprints.noRgTag.sam"); + + return new Object[][]{ + {na12892_r1, na12891_noRg, false}, + {na12891_r1, na12891_noRg, true}, + }; + } + + @Test(dataProvider = "checkFingerprintsSamDataProviderFail", expectedExceptions = RuntimeException.class) + public void testCheckFingerprintsFail(final File samFile1, final File samFile2, final boolean expectedMatch) { + + final String[] args = { + "EXPECT_ALL_GROUPS_TO_MATCH=true", + "LOD_THRESHOLD=-1", + "H=" + SUBSETTED_HAPLOTYPE_DATABASE_FOR_TESTING.getAbsolutePath(), + "I=" + samFile1.getAbsolutePath(), + "I=" + samFile2.getAbsolutePath(), + "VALIDATION_STRINGENCY=STRICT", + "CROSSCHECK_BY=FILE", + }; + + new CrosscheckFingerprints().instanceMain(args); + } } \ No newline at end of file