From a1e9a9d7758d62ec808fe359506e9af6438a2ffe Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Tue, 11 Jul 2017 16:39:47 -0400 Subject: [PATCH 1/2] - fixed bug in Liftover pertaining to indels that straddle two "links" in a chain --- src/main/java/picard/vcf/LiftoverVcf.java | 80 +++++++---- src/test/java/picard/vcf/LiftoverVcfTest.java | 156 ++++++++++++++++++++- .../picard/vcf/dummy.two.block.reference.fasta | 20 +++ testdata/picard/vcf/test.two.block.over.chain | 4 + 4 files changed, 232 insertions(+), 28 deletions(-) create mode 100644 testdata/picard/vcf/dummy.two.block.reference.fasta create mode 100644 testdata/picard/vcf/test.two.block.over.chain diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index 4c9bb03c6..80cd3edb3 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -101,17 +101,24 @@ public static final String FILTER_NO_TARGET = "NoTarget"; /** - * Filter name to use when a target is lifted over, but the reference allele doens't match the new reference. + * Filter name to use when a target is lifted over, but the reference allele doesn't match the new reference. */ public static final String FILTER_MISMATCHING_REF_ALLELE = "MismatchedRefAllele"; /** + * Filter name to use when an indel cannot be lifted over since it straddles two links which means + * that it is unclear what are the right alleles to be used. + */ + public static final String FILTER_INDEL_STRADDLES_TWO_LINKS = "IndelStraddlesMultipleLinks"; + + /** * Filters to be added to the REJECT file. */ private static final List FILTERS = CollectionUtil.makeList( new VCFFilterHeaderLine(FILTER_CANNOT_LIFTOVER_INDEL, "Indel falls into a reverse complemented region in the target genome."), new VCFFilterHeaderLine(FILTER_NO_TARGET, "Variant could not be lifted between genome builds."), - new VCFFilterHeaderLine(FILTER_MISMATCHING_REF_ALLELE, "Reference allele does not match reference genome sequence after liftover.") + new VCFFilterHeaderLine(FILTER_MISMATCHING_REF_ALLELE, "Reference allele does not match reference genome sequence after liftover."), + new VCFFilterHeaderLine(FILTER_INDEL_STRADDLES_TWO_LINKS, "Indel is straddling multiple links in the chain, and so the results are not well defined.") ); /** @@ -141,6 +148,8 @@ private final Log log = Log.getInstance(LiftoverVcf.class); private SortingCollection sorter; + private long failedLiftover = 0, failedAlleleCheck = 0; + @Override protected int doWork() { IOUtil.assertFileIsReadable(INPUT); @@ -193,7 +202,7 @@ protected int doWork() { // Read the input VCF, lift the records over and write to the sorting // collection. //////////////////////////////////////////////////////////////////////// - long failedLiftover = 0, failedAlleleCheck = 0, total = 0; + long total = 0; log.info("Lifting variants over and sorting."); sorter = SortingCollection.newInstance(VariantContext.class, @@ -210,16 +219,24 @@ protected int doWork() { ++total; final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd()); final Interval target = liftOver.liftOver(source, LIFTOVER_MIN_MATCH); + + if (target == null) { + rejectVariant(ctx, FILTER_NO_TARGET); + continue; + } + if (ctx.getReference().length() != target.length()){ + rejectVariant(ctx, FILTER_INDEL_STRADDLES_TWO_LINKS); + continue; + } + final ReferenceSequence refSeq; // if the target is null OR (the target is reverse complemented AND the variant is a non-biallelic indel or mixed), then we cannot lift it over - if (target == null || (target.isNegativeStrand() && (ctx.isMixed() || (ctx.isIndel() && !ctx.isBiallelic())))) { - final String reason = (target == null) ? FILTER_NO_TARGET : FILTER_CANNOT_LIFTOVER_INDEL; - rejects.add(new VariantContextBuilder(ctx).filter(reason).make()); - failedLiftover++; + if (target.isNegativeStrand() && (ctx.isMixed() || ctx.isIndel() && !ctx.isBiallelic())) { + rejectVariant(ctx, FILTER_CANNOT_LIFTOVER_INDEL); + } else if (!refSeqs.containsKey(target.getContig())) { - rejects.add(new VariantContextBuilder(ctx).filter(FILTER_NO_TARGET).make()); - failedLiftover++; + rejectVariant(ctx, FILTER_NO_TARGET); final String missingContigMessage = "Encountered a contig, " + target.getContig() + " that is not part of the target reference."; if (WARN_ON_MISSING_CONTIG) { @@ -236,17 +253,13 @@ protected int doWork() { if (flippedIndel == null) { throw new IllegalArgumentException("Unexpectedly found null VC. This should have not happened."); } else { - if (!tryToAddVariant(flippedIndel, refSeq, reverseComplementAlleleMap, ctx)){ - failedAlleleCheck++; - } + tryToAddVariant(flippedIndel, refSeq, reverseComplementAlleleMap, ctx); } } else { refSeq = refSeqs.get(target.getContig()); - final VariantContext liftedVariant = liftSnp(ctx, target, refSeq); + final VariantContext liftedVariant = liftSimpleVariant(ctx, target); - if (!tryToAddVariant(liftedVariant, refSeq, reverseComplementAlleleMap, ctx)) { - failedAlleleCheck++; - } + tryToAddVariant(liftedVariant, refSeq, reverseComplementAlleleMap, ctx); } progress.record(ctx.getContig(), ctx.getStart()); } @@ -278,6 +291,11 @@ protected int doWork() { return 0; } + private void rejectVariant(final VariantContext ctx, final String reason) { + rejects.add(new VariantContextBuilder(ctx).filter(reason).make()); + failedLiftover++; + } + /** * utility function to attempt to add a variant. Checks that the reference allele still matches the reference (which may have changed) * @@ -287,11 +305,10 @@ protected int doWork() { * @param source the original {@link VariantContext} to use for putting the original location information into vc * @return true if successful, false if failed due to mismatching reference allele. */ - private boolean tryToAddVariant(final VariantContext vc, final ReferenceSequence refSeq, final Map alleleMap, final VariantContext source) { + private void tryToAddVariant(final VariantContext vc, final ReferenceSequence refSeq, final Map alleleMap, final VariantContext source) { final VariantContextBuilder builder = new VariantContextBuilder(vc); - builder.genotypes(fixGenotypes(source.getGenotypes(), alleleMap)); builder.filters(source.getFilters()); builder.log10PError(source.getLog10PError()); @@ -308,7 +325,7 @@ private boolean tryToAddVariant(final VariantContext vc, final ReferenceSequence for (final Allele allele : builder.getAlleles()) { if (allele.isReference()) { final byte[] ref = refSeq.getBases(); - final String refString = StringUtil.bytesToString(ref, vc.getStart() - 1, vc.getEnd()-vc.getStart()+1); + final String refString = StringUtil.bytesToString(ref, vc.getStart() - 1, vc.getEnd() - vc.getStart() + 1); if (!refString.equalsIgnoreCase(allele.getBaseString())) { mismatchesReference = true; @@ -322,18 +339,26 @@ private boolean tryToAddVariant(final VariantContext vc, final ReferenceSequence .filter(FILTER_MISMATCHING_REF_ALLELE) .attribute(ATTEMPTED_LOCUS, String.format("%s:%d-%d",vc.getContig(),vc.getStart(),vc.getEnd())) .make()); - return false; + failedAlleleCheck++; } else { sorter.add(builder.make()); - return true; } } - - protected static VariantContext liftSnp(final VariantContext source, final Interval target , final ReferenceSequence referenceSequence){ + /** liftsOver snps on either positive or negative strand and biallelic indels on positive strand only + * + * @param source + * @param target + * @return + */ + protected static VariantContext liftSimpleVariant(final VariantContext source, final Interval target){ // Fix the alleles if we went from positive to negative strand - final Map reverseComplementAlleleMap = new HashMap<>(2); + if (target == null) return null; + + if (source.getReference().length() != target.length()) { + return null; + } final List alleles = new ArrayList<>(); for (final Allele oldAllele : source.getAlleles()) { @@ -342,7 +367,6 @@ protected static VariantContext liftSnp(final VariantContext source, final Inter } else { final Allele fixedAllele = Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()); alleles.add(fixedAllele); - reverseComplementAlleleMap.put(oldAllele, fixedAllele); } } @@ -369,9 +393,13 @@ protected static VariantContext flipIndel(final VariantContext source, final Lif if (!source.isBiallelic()) return null; //only supporting biallelic indels, for now. final Interval originalLocus = new Interval(source.getContig(), source.getStart(), source.getEnd()); - final Locatable target = liftOver.liftOver(originalLocus); + final Interval target = liftOver.liftOver(originalLocus); if (target == null) return null; + if (!target.isNegativeStrand()) { + throw new IllegalArgumentException("Expecting a variant the is lifted over with an inversion. Got " + + source + " maps to " + target.toString()); + } // a boolean to protect against trying to access the -1 position in the reference array final boolean addToStart = target.getStart() > 1; diff --git a/src/test/java/picard/vcf/LiftoverVcfTest.java b/src/test/java/picard/vcf/LiftoverVcfTest.java index acabe826a..99feb0f67 100644 --- a/src/test/java/picard/vcf/LiftoverVcfTest.java +++ b/src/test/java/picard/vcf/LiftoverVcfTest.java @@ -1,9 +1,11 @@ package picard.vcf; import htsjdk.samtools.liftover.LiftOver; +import htsjdk.samtools.reference.FastaSequenceFile; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.util.CollectionUtil; import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Interval; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFFileReader; import org.testng.Assert; @@ -26,8 +28,12 @@ private static final File TEST_DATA_PATH = new File("testdata/picard/vcf/"); private static final File CHAIN_FILE = new File(TEST_DATA_PATH, "test.over.chain"); + private static final File TWO_LINK_CHAIN_FILE = new File(TEST_DATA_PATH, "test.two.block.over.chain"); + private static final File CHAIN_FILE_WITH_BAD_CONTIG = new File(TEST_DATA_PATH, "test.over.badContig.chain"); private static final File REFERENCE_FILE = new File(TEST_DATA_PATH, "dummy.reference.fasta"); + private static final File TWO_LINK_REFERENCE_FILE = new File(TEST_DATA_PATH, "dummy.two.block.reference.fasta"); + private static final File OUTPUT_DATA_PATH = IOUtil.createTempDir("LiftoverVcfsTest", null); public String getCommandLineProgramName() { @@ -376,11 +382,13 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r // G*/A -> G/A start = 13; stop = start; + builder.source("test1_5"); builder.start(start).stop(stop).alleles(CollectionUtil.makeList(RefG, A)); result_builder.start(stop).stop(start).alleles(CollectionUtil.makeList(RefG, A)); tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + builder.source("test2"); for (start = 1; start <= reference.getBases().length; start++) { builder.start(start).stop(start); builder.alleles(CollectionUtil.makeList( @@ -392,6 +400,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r // AA/A in initial polyA repeat -> CA/C at the beginning result_builder.start(1).stop(2).alleles(CollectionUtil.makeList(RefCA, C)); + builder.source("test3"); for (start = 2; start <= 11; start++) { builder.start(start).stop(start + 1); builder.alleles(CollectionUtil.makeList( @@ -402,6 +411,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r // A/AA in initial polyA repeat -> C/CA at the beginning result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(RefC, CA)); + builder.source("test4"); for (start = 2; start <= 11; start++) { builder.start(start).stop(start); builder.alleles(CollectionUtil.makeList( @@ -410,10 +420,9 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r tests.add(new Object[]{builder.make(), reference, result_builder.make()}); } - - //CT/CTCT -> A/ACT in CT repeat region result_builder.start(19).stop(19).alleles(CollectionUtil.makeList(RefA, ACT)); + builder.source("test5"); for (start = 20; start <= 27; start += 2) { builder.start(start).stop(start + 1); builder.alleles(CollectionUtil.makeList( @@ -422,6 +431,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r tests.add(new Object[]{builder.make(), reference, result_builder.make()}); } + builder.source("test6"); //TC/TCTC -> A/ACT in CT repeat region for (start = 21; start <= 29; start += 2) { builder.start(start).stop(start + 1); @@ -432,6 +442,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r } //CTCT/CT -> ACT/A in CT repeat region + builder.source("test7"); result_builder.start(19).stop(21).alleles(CollectionUtil.makeList(RefACT, A)); for (start = 20; start <= 27; start += 2) { builder.start(start).stop(start + 3); @@ -441,6 +452,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r tests.add(new Object[]{builder.make(), reference, result_builder.make()}); } + builder.source("test8"); //TCTC/TC-> ACT/A in CT repeat region for (start = 21; start <= 29; start += 2) { builder.start(start).stop(start + 3); @@ -454,6 +466,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r // "CAAAAAAAAAACGTACGTACTCTCTCTCTACGT" // 123456789 123456789 123456789 123 + builder.source("test9"); result_builder.alleles("AACGT", "A").start(10).stop(14); for (start = 10; start < 17; start++) { for (stop = start + 4; stop < 20; stop++) { @@ -467,6 +480,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r // test vc with genotypes: + builder.source("test10"); result_builder.start(10).stop(14).alleles("AACGT", "A", "AACG", "AACGTACGT"); builder.start(9).stop(18).alleles("AAACGTACGT", "AAACGT", "AAACGACGT", "AAACGTACGTACGT"); final Collection genotypes = new ArrayList<>(); @@ -490,6 +504,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r builder.genotypes(genotypes); result_builder.genotypes(results_genotypes); + builder.source("test12"); tests.add(new Object[]{builder.make(), reference, result_builder.make()}); return tests.iterator(); @@ -501,6 +516,143 @@ public void testLeftAlignVariants(final VariantContext source, final ReferenceSe assertVcAreEqual(leftAlignVariant(source, reference), result); } + @DataProvider(name = "indelNoFlipData") + public Iterator indelNoFlipData() { + + final VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1"); + final VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); + final List tests = new ArrayList<>(); + + // some more tests with a more complicated chain File. this one has 2 links in the relevant block + // the cigar string would be 540M5I500M5D40M (if chains were written using cigar strings....) + + final Allele AAAARef = Allele.create("AAAA", true); + final Allele AAA = Allele.create("AAA", false); + final Allele CAAARef = Allele.create("CAAA", true); + final Allele CAA = Allele.create("CAA", false); + final Allele ARef = Allele.create("A", true); + final Allele A = Allele.create("A", false); + final Allele CRef = Allele.create("C", true); + final Allele T = Allele.create("T", false); + + int start = 1; + int stop = 4; + int offset; + + final ReferenceSequence twoLinkReference = new FastaSequenceFile(TWO_LINK_REFERENCE_FILE, false).nextSequence(); + + final LiftOver liftOver = new LiftOver(TWO_LINK_CHAIN_FILE); + + // trivial snp + builder.source("test1"); + builder.start(1).stop(1).alleles(CollectionUtil.makeList(CRef, A)); + result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(CRef, A)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + // trivial case indel + builder.source("test2"); + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(CAAARef, CAA)); + result_builder.start(1).stop(4).alleles(CollectionUtil.makeList(CAAARef, CAA)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + // near end of link indel + builder.source("test3"); + builder.start(537).stop(540).alleles(CollectionUtil.makeList(AAAARef, AAA)); + result_builder.start(537).stop(540).alleles(CollectionUtil.makeList(AAAARef, AAA)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + // near end of link snp + builder.source("test4"); + builder.start(540).stop(540).alleles(CollectionUtil.makeList(ARef, T)); + result_builder.start(540).stop(540).alleles(CollectionUtil.makeList(ARef, T)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + // straddling chains indel + builder.source("test5"); + builder.start(538).stop(541).alleles(CollectionUtil.makeList(AAAARef, AAA)); + result_builder.start(537).stop(540).alleles(CollectionUtil.makeList(AAAARef, AAA)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), null}); + + // near start of second link snp + builder.source("test6"); + start = 541; + offset = 5; + builder.start(start).stop(start).alleles(CollectionUtil.makeList(ARef, T)); + result_builder.start(start + offset).stop(start + offset).alleles(CollectionUtil.makeList(ARef, T)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + + // near start of second link indel + builder.source("test7"); + builder.start(start).stop(start).alleles(CollectionUtil.makeList(ARef, T)); + result_builder.start(start + offset).stop(start + offset).alleles(CollectionUtil.makeList(ARef, T)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + // near end of second link snp + builder.source("test8"); + start = 1040; + offset = 5; + builder.start(start).stop(start).alleles(CollectionUtil.makeList(ARef, T)); + result_builder.start(start + offset).stop(start + offset).alleles(CollectionUtil.makeList(ARef, T)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + // near end of second link indel + builder.source("test9"); + start = 1037; + stop = 1040; + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(AAAARef, AAA)); + result_builder.start(start + offset).stop(stop + offset).alleles(CollectionUtil.makeList(AAAARef, AAA)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + + // straddling link indel + builder.source("test9"); + start = 1038; + stop = 1041; + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(AAAARef, AAA)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), null}); + + // straddling link indel + builder.source("test10"); + start = 1045; + stop = 1048; + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(AAAARef, AAA)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), null}); + + // vanishing snp + builder.source("test11"); + start = 1045; + stop = 1045; + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(ARef, T)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), null}); + + // after second link indel + builder.source("test12"); + start = 1046; + stop = 1049; + offset = 0; + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(AAAARef, AAA)); + result_builder.start(start + offset).stop(stop + offset).alleles(CollectionUtil.makeList(AAAARef, AAA)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + // near start of second link snp + builder.source("test13"); + start = 1046; + builder.start(start).stop(start).alleles(CollectionUtil.makeList(ARef, T)); + result_builder.start(start + offset).stop(start + offset).alleles(CollectionUtil.makeList(ARef, T)); + tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + + return tests.iterator(); + } + + @Test(dataProvider = "indelNoFlipData") + public void testLiftOverSimpleIndels( final LiftOver liftOver, final ReferenceSequence reference, final VariantContext source, final VariantContext result) { + + final Interval target = liftOver.liftOver(new Interval(source.getContig(), source.getStart(), source.getEnd()), .95); + + assertVcAreEqual(LiftoverVcf.liftSimpleVariant(source, target), result); + } + private void assertVcAreEqual(final VariantContext actual, final VariantContext expected) { if (expected == null) { diff --git a/testdata/picard/vcf/dummy.two.block.reference.fasta b/testdata/picard/vcf/dummy.two.block.reference.fasta new file mode 100644 index 000000000..8f5749301 --- /dev/null +++ b/testdata/picard/vcf/dummy.two.block.reference.fasta @@ -0,0 +1,20 @@ +>chr1 dna:chromosome chromosome:GRCh37:1:1:1085:1 +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +GGGGG +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA +CAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA \ No newline at end of file diff --git a/testdata/picard/vcf/test.two.block.over.chain b/testdata/picard/vcf/test.two.block.over.chain new file mode 100644 index 000000000..13343d977 --- /dev/null +++ b/testdata/picard/vcf/test.two.block.over.chain @@ -0,0 +1,4 @@ +chain 1085 chr1 1085 + 0 1085 chr1 1085 + 0 1085 2 +540 0 5 +500 5 0 +40 From 6a684942e4034760060f4a0f3cba3e4adabf9a86 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Tue, 11 Jul 2017 21:25:03 -0400 Subject: [PATCH 2/2] - reivew comments: -- renamed "link" to "interval", -- added comments and -- a space --- src/main/java/picard/vcf/LiftoverVcf.java | 22 +++++++--- src/test/java/picard/vcf/LiftoverVcfTest.java | 58 +++++++++++++-------------- 2 files changed, 45 insertions(+), 35 deletions(-) diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index 80cd3edb3..5f7eb3587 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -106,10 +106,10 @@ public static final String FILTER_MISMATCHING_REF_ALLELE = "MismatchedRefAllele"; /** - * Filter name to use when an indel cannot be lifted over since it straddles two links which means + * Filter name to use when an indel cannot be lifted over since it straddles two intervals in a chain which means * that it is unclear what are the right alleles to be used. */ - public static final String FILTER_INDEL_STRADDLES_TWO_LINKS = "IndelStraddlesMultipleLinks"; + public static final String FILTER_INDEL_STRADDLES_TWO_INTERVALS = "IndelStraddlesMultipleIntevals"; /** * Filters to be added to the REJECT file. @@ -118,7 +118,7 @@ new VCFFilterHeaderLine(FILTER_CANNOT_LIFTOVER_INDEL, "Indel falls into a reverse complemented region in the target genome."), new VCFFilterHeaderLine(FILTER_NO_TARGET, "Variant could not be lifted between genome builds."), new VCFFilterHeaderLine(FILTER_MISMATCHING_REF_ALLELE, "Reference allele does not match reference genome sequence after liftover."), - new VCFFilterHeaderLine(FILTER_INDEL_STRADDLES_TWO_LINKS, "Indel is straddling multiple links in the chain, and so the results are not well defined.") + new VCFFilterHeaderLine(FILTER_INDEL_STRADDLES_TWO_INTERVALS, "Indel is straddling multiple intervalss in the chain, and so the results are not well defined.") ); /** @@ -220,12 +220,20 @@ protected int doWork() { final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd()); final Interval target = liftOver.liftOver(source, LIFTOVER_MIN_MATCH); + // target is null when there is no good liftover for the context. This happens either when it fall in a gap + // where there isn't a chain, or if a large enough proportion of it is diminished by the "deletion" at the + // end of each interval in a chain. if (target == null) { rejectVariant(ctx, FILTER_NO_TARGET); continue; } + + // the target is the lifted-over interval comprised of the start/stop of the variant context, + // if the sizes of target and ctx do not match, it means that the interval grew or shrank during + // liftover which must be due to straddling multiple intervals in the liftover chain. + // This would invalidate the indel as it isn't clear what the resulting alleles should be. if (ctx.getReference().length() != target.length()){ - rejectVariant(ctx, FILTER_INDEL_STRADDLES_TWO_LINKS); + rejectVariant(ctx, FILTER_INDEL_STRADDLES_TWO_INTERVALS); continue; } @@ -351,10 +359,12 @@ private void tryToAddVariant(final VariantContext vc, final ReferenceSequence re * @param target * @return */ - protected static VariantContext liftSimpleVariant(final VariantContext source, final Interval target){ + protected static VariantContext liftSimpleVariant(final VariantContext source, final Interval target) { // Fix the alleles if we went from positive to negative strand - if (target == null) return null; + if (target == null) { + return null; + } if (source.getReference().length() != target.length()) { return null; diff --git a/src/test/java/picard/vcf/LiftoverVcfTest.java b/src/test/java/picard/vcf/LiftoverVcfTest.java index 99feb0f67..ca29c686d 100644 --- a/src/test/java/picard/vcf/LiftoverVcfTest.java +++ b/src/test/java/picard/vcf/LiftoverVcfTest.java @@ -28,11 +28,11 @@ private static final File TEST_DATA_PATH = new File("testdata/picard/vcf/"); private static final File CHAIN_FILE = new File(TEST_DATA_PATH, "test.over.chain"); - private static final File TWO_LINK_CHAIN_FILE = new File(TEST_DATA_PATH, "test.two.block.over.chain"); + private static final File TWO_INTERVAL_CHAIN_FILE = new File(TEST_DATA_PATH, "test.two.block.over.chain"); private static final File CHAIN_FILE_WITH_BAD_CONTIG = new File(TEST_DATA_PATH, "test.over.badContig.chain"); private static final File REFERENCE_FILE = new File(TEST_DATA_PATH, "dummy.reference.fasta"); - private static final File TWO_LINK_REFERENCE_FILE = new File(TEST_DATA_PATH, "dummy.two.block.reference.fasta"); + private static final File TWO_INTERVALS_REFERENCE_FILE = new File(TEST_DATA_PATH, "dummy.two.block.reference.fasta"); private static final File OUTPUT_DATA_PATH = IOUtil.createTempDir("LiftoverVcfsTest", null); @@ -523,7 +523,7 @@ public void testLeftAlignVariants(final VariantContext source, final ReferenceSe final VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); final List tests = new ArrayList<>(); - // some more tests with a more complicated chain File. this one has 2 links in the relevant block + // some more tests with a more complicated chain File. this one has 2 intervals in the relevant block // the cigar string would be 540M5I500M5D40M (if chains were written using cigar strings....) final Allele AAAARef = Allele.create("AAAA", true); @@ -539,108 +539,108 @@ public void testLeftAlignVariants(final VariantContext source, final ReferenceSe int stop = 4; int offset; - final ReferenceSequence twoLinkReference = new FastaSequenceFile(TWO_LINK_REFERENCE_FILE, false).nextSequence(); + final ReferenceSequence twoIntervalChainReference = new FastaSequenceFile(TWO_INTERVALS_REFERENCE_FILE, false).nextSequence(); - final LiftOver liftOver = new LiftOver(TWO_LINK_CHAIN_FILE); + final LiftOver liftOver = new LiftOver(TWO_INTERVAL_CHAIN_FILE); // trivial snp builder.source("test1"); builder.start(1).stop(1).alleles(CollectionUtil.makeList(CRef, A)); result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(CRef, A)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); // trivial case indel builder.source("test2"); builder.start(start).stop(stop).alleles(CollectionUtil.makeList(CAAARef, CAA)); result_builder.start(1).stop(4).alleles(CollectionUtil.makeList(CAAARef, CAA)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); - // near end of link indel + // near end of interval indel builder.source("test3"); builder.start(537).stop(540).alleles(CollectionUtil.makeList(AAAARef, AAA)); result_builder.start(537).stop(540).alleles(CollectionUtil.makeList(AAAARef, AAA)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); - // near end of link snp + // near end of interval snp builder.source("test4"); builder.start(540).stop(540).alleles(CollectionUtil.makeList(ARef, T)); result_builder.start(540).stop(540).alleles(CollectionUtil.makeList(ARef, T)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); // straddling chains indel builder.source("test5"); builder.start(538).stop(541).alleles(CollectionUtil.makeList(AAAARef, AAA)); result_builder.start(537).stop(540).alleles(CollectionUtil.makeList(AAAARef, AAA)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), null}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), null}); - // near start of second link snp + // near start of second interval snp builder.source("test6"); start = 541; offset = 5; builder.start(start).stop(start).alleles(CollectionUtil.makeList(ARef, T)); result_builder.start(start + offset).stop(start + offset).alleles(CollectionUtil.makeList(ARef, T)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); - // near start of second link indel + // near start of second interval indel builder.source("test7"); builder.start(start).stop(start).alleles(CollectionUtil.makeList(ARef, T)); result_builder.start(start + offset).stop(start + offset).alleles(CollectionUtil.makeList(ARef, T)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); - // near end of second link snp + // near end of second interval snp builder.source("test8"); start = 1040; offset = 5; builder.start(start).stop(start).alleles(CollectionUtil.makeList(ARef, T)); result_builder.start(start + offset).stop(start + offset).alleles(CollectionUtil.makeList(ARef, T)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); - // near end of second link indel + // near end of second interval indel builder.source("test9"); start = 1037; stop = 1040; builder.start(start).stop(stop).alleles(CollectionUtil.makeList(AAAARef, AAA)); result_builder.start(start + offset).stop(stop + offset).alleles(CollectionUtil.makeList(AAAARef, AAA)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); - // straddling link indel + // straddling interval indel builder.source("test9"); start = 1038; stop = 1041; builder.start(start).stop(stop).alleles(CollectionUtil.makeList(AAAARef, AAA)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), null}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), null}); - // straddling link indel + // straddling interval indel builder.source("test10"); start = 1045; stop = 1048; builder.start(start).stop(stop).alleles(CollectionUtil.makeList(AAAARef, AAA)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), null}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), null}); // vanishing snp builder.source("test11"); start = 1045; stop = 1045; builder.start(start).stop(stop).alleles(CollectionUtil.makeList(ARef, T)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), null}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), null}); - // after second link indel + // after second interval indel builder.source("test12"); start = 1046; stop = 1049; offset = 0; builder.start(start).stop(stop).alleles(CollectionUtil.makeList(AAAARef, AAA)); result_builder.start(start + offset).stop(stop + offset).alleles(CollectionUtil.makeList(AAAARef, AAA)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); - // near start of second link snp + // near start of second interval snp builder.source("test13"); start = 1046; builder.start(start).stop(start).alleles(CollectionUtil.makeList(ARef, T)); result_builder.start(start + offset).stop(start + offset).alleles(CollectionUtil.makeList(ARef, T)); - tests.add(new Object[]{liftOver, twoLinkReference, builder.make(), result_builder.make()}); + tests.add(new Object[]{liftOver, twoIntervalChainReference, builder.make(), result_builder.make()}); return tests.iterator(); }