From 3df1f8dc3701685b7c55039ffb9fc62552761494 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 2 Jun 2017 19:42:22 -0400 Subject: [PATCH 1/7] - added biallelic indel liftover for negative strand - added many tests - connected to LiftOverVcf now - does left-aligning of variants --- src/main/java/picard/vcf/LiftoverVcf.java | 258 ++++++++++--- src/test/java/picard/vcf/LiftoverVcfTest.java | 402 ++++++++++++++++++--- ...iftover.vcf => testLiftoverBiallelicIndels.vcf} | 1 + .../picard/vcf/testLiftoverMultiallelicIndels.vcf | 4 + 4 files changed, 563 insertions(+), 102 deletions(-) rename testdata/picard/vcf/{testLiftover.vcf => testLiftoverBiallelicIndels.vcf} (83%) create mode 100644 testdata/picard/vcf/testLiftoverMultiallelicIndels.vcf diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index 4be6ab754..8a2161485 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -4,16 +4,9 @@ import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.ValidationStringency; import htsjdk.samtools.liftover.LiftOver; +import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.reference.ReferenceSequenceFileWalker; -import htsjdk.samtools.util.CloserUtil; -import htsjdk.samtools.util.CollectionUtil; -import htsjdk.samtools.util.IOUtil; -import htsjdk.samtools.util.Interval; -import htsjdk.samtools.util.Log; -import htsjdk.samtools.util.ProgressLogger; -import htsjdk.samtools.util.SequenceUtil; -import htsjdk.samtools.util.SortingCollection; -import htsjdk.samtools.util.StringUtil; +import htsjdk.samtools.util.*; import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; @@ -33,10 +26,10 @@ import java.io.File; import java.text.DecimalFormat; import java.text.NumberFormat; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; +import java.util.function.Function; +import java.util.stream.Collector; +import java.util.stream.Collectors; /** * Tool for lifting over a VCF to another genome build and producing a properly header'd, @@ -69,20 +62,20 @@ "" + "For additional information, please see: http://genome.ucsc.edu/cgi-bin/hgLiftOver" + "
"; - @Option(shortName=StandardOptionDefinitions.INPUT_SHORT_NAME, doc="The input VCF/BCF file to be lifted over.") + @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input VCF/BCF file to be lifted over.") public File INPUT; - @Option(shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc="The output location to write the lifted over VCF/BCF to.") + @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output location to write the lifted over VCF/BCF to.") public File OUTPUT; - @Option(shortName="C", doc="The liftover chain file. See https://genome.ucsc.edu/goldenPath/help/chain.html for a description" + + @Option(shortName = "C", doc = "The liftover chain file. See https://genome.ucsc.edu/goldenPath/help/chain.html for a description" + " of chain files. See http://hgdownload.soe.ucsc.edu/downloads.html#terms for where to download chain files.") public File CHAIN; - @Option(doc="File to which to write rejected records.") + @Option(doc = "File to which to write rejected records.") public File REJECT; - @Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, common=false, + @Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, common = false, doc = "The reference sequence (fasta) for the TARGET genome build. The fasta file must have an " + "accompanying sequence dictionary (.dict file).") public File REFERENCE_SEQUENCE = Defaults.REFERENCE_FASTA; @@ -104,29 +97,43 @@ // When a contig used in the chain is not in the reference, exit with this value instead of 0. protected static int EXIT_CODE_WHEN_CONTIG_NOT_IN_REFERENCE = 1; - /** Filter name to use when a target cannot be lifted over. */ + /** + * Filter name to use when a target cannot be lifted over. + */ public static final String FILTER_CANNOT_LIFTOVER_INDEL = "ReverseComplementedIndel"; - /** Filter name to use when a target cannot be lifted over. */ + /** + * Filter name to use when a target cannot be lifted over. + */ public static final String FILTER_NO_TARGET = "NoTarget"; - /** Filter name to use when a target is lifted over, but the reference allele doens't match the new reference. */ + /** + * Filter name to use when a target is lifted over, but the reference allele doens't match the new reference. + */ public static final String FILTER_MISMATCHING_REF_ALLELE = "MismatchedRefAllele"; - /** Filters to be added to the REJECT file. */ + /** + * Filters to be added to the REJECT file. + */ private static final List FILTERS = CollectionUtil.makeList( new VCFFilterHeaderLine(FILTER_CANNOT_LIFTOVER_INDEL, "Indel falls into a reverse complemented region in the target genome."), new VCFFilterHeaderLine(FILTER_NO_TARGET, "Variant could not be lifted between genome builds."), new VCFFilterHeaderLine(FILTER_MISMATCHING_REF_ALLELE, "Reference allele does not match reference genome sequence after liftover.") ); - /** Attribute used to store the name of the source contig/chromosome prior to liftover. */ + /** + * Attribute used to store the name of the source contig/chromosome prior to liftover. + */ public static final String ORIGINAL_CONTIG = "OriginalContig"; - /** Attribute used to store the position of the variant on the source contig prior to liftover. */ + /** + * Attribute used to store the position of the variant on the source contig prior to liftover. + */ public static final String ORIGINAL_START = "OriginalStart"; - /** Metadata to be added to the Passing file. */ + /** + * Metadata to be added to the Passing file. + */ private static final List ATTRS = CollectionUtil.makeList( new VCFInfoHeaderLine(ORIGINAL_CONTIG, 1, VCFHeaderLineType.String, "The name of the source contig/chromosome prior to liftover."), new VCFInfoHeaderLine(ORIGINAL_START, 1, VCFHeaderLineType.String, "The position of the variant on the source contig prior to liftover.") @@ -134,12 +141,8 @@ private final Log log = Log.getInstance(LiftoverVcf.class); - // Stock main method - public static void main(final String[] args) { - new LiftoverVcf().instanceMainWithExit(args); - } - - @Override protected int doWork() { + @Override + protected int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); IOUtil.assertFileIsReadable(CHAIN); @@ -154,13 +157,12 @@ public static void main(final String[] args) { log.info("Loading up the target reference genome."); final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE); - final Map refSeqs = new HashMap(); - for (final SAMSequenceRecord rec: walker.getSequenceDictionary().getSequences()) { - refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases()); + final Map refSeqs = new HashMap<>(); + for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) { + refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex())); } CloserUtil.close(walker); - //////////////////////////////////////////////////////////////////////// // Setup the outputs //////////////////////////////////////////////////////////////////////// @@ -170,12 +172,14 @@ public static void main(final String[] args) { if (WRITE_ORIGINAL_POSITION) { for (final VCFInfoHeaderLine line : ATTRS) outHeader.addMetaDataLine(line); } - final VariantContextWriter out = new VariantContextWriterBuilder().setOption(Options.INDEX_ON_THE_FLY) + final VariantContextWriter out = new VariantContextWriterBuilder() + .setOption(Options.INDEX_ON_THE_FLY) .modifyOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER, ALLOW_MISSING_FIELDS_IN_HEADER) .setOutputFile(OUTPUT).setReferenceDictionary(walker.getSequenceDictionary()).build(); out.writeHeader(outHeader); - final VariantContextWriter rejects = new VariantContextWriterBuilder().setOutputFile(REJECT).unsetOption(Options.INDEX_ON_THE_FLY) + final VariantContextWriter rejects = new VariantContextWriterBuilder().setOutputFile(REJECT) + .unsetOption(Options.INDEX_ON_THE_FLY) .modifyOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER, ALLOW_MISSING_FIELDS_IN_HEADER) .build(); final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader()); @@ -185,7 +189,6 @@ public static void main(final String[] args) { } rejects.writeHeader(rejectHeader); - //////////////////////////////////////////////////////////////////////// // Read the input VCF, lift the records over and write to the sorting // collection. @@ -201,15 +204,16 @@ public static void main(final String[] args) { ProgressLogger progress = new ProgressLogger(log, 1000000, "read"); // a mapping from original allele to reverse complemented allele - final Map reverseComplementAlleleMap = new HashMap(10); + final Map reverseComplementAlleleMap = new HashMap<>(10); for (final VariantContext ctx : in) { ++total; final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd()); final Interval target = liftOver.liftOver(source, LIFTOVER_MIN_MATCH); - // if the target is null OR (the target is reverse complemented AND the variant is an indel or mixed), then we cannot lift it over - if (target == null || (target.isNegativeStrand() && (ctx.isMixed() || ctx.isIndel()))) { + // if the target is null OR (the target is reverse complemented AND the variant is a non-biallelic indel or mixed), then we cannot lift it over + + if (target == null || (target.isNegativeStrand() && (ctx.isMixed() || ctx.isIndel() && !ctx.isBiallelic()))) { final String reason = (target == null) ? FILTER_NO_TARGET : FILTER_CANNOT_LIFTOVER_INDEL; rejects.add(new VariantContextBuilder(ctx).filter(reason).make()); failedLiftover++; @@ -218,22 +222,30 @@ public static void main(final String[] args) { failedLiftover++; String missingContigMessage = "Encountered a contig, " + target.getContig() + " that is not part of the target reference."; - if(WARN_ON_MISSING_CONTIG) { + if (WARN_ON_MISSING_CONTIG) { log.warn(missingContigMessage); } else { log.error(missingContigMessage); return EXIT_CODE_WHEN_CONTIG_NOT_IN_REFERENCE; } + } else if ( target.isNegativeStrand() && ctx.isIndel() && ctx.isBiallelic()) { + final VariantContextBuilder flippedIndel=new VariantContextBuilder(flipIndel(ctx,liftOver,refSeqs.get(target.getContig()))); + + if (WRITE_ORIGINAL_POSITION) { + flippedIndel.attribute(ORIGINAL_CONTIG, source.getContig()); + flippedIndel.attribute(ORIGINAL_START, source.getStart()); + } + sorter.add(flippedIndel.make()); + } else { // Fix the alleles if we went from positive to negative strand reverseComplementAlleleMap.clear(); - final List alleles = new ArrayList(); + final List alleles = new ArrayList<>(); for (final Allele oldAllele : ctx.getAlleles()) { if (target.isPositiveStrand() || oldAllele.isSymbolic()) { alleles.add(oldAllele); - } - else { + } else { final Allele fixedAllele = Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()); alleles.add(fixedAllele); reverseComplementAlleleMap.put(oldAllele, fixedAllele); @@ -263,13 +275,12 @@ public static void main(final String[] args) { boolean mismatchesReference = false; for (final Allele allele : builder.getAlleles()) { if (allele.isReference()) { - final byte[] ref = refSeqs.get(target.getContig()); - final String refString = StringUtil.bytesToString(ref, target.getStart()-1, target.length()); + final byte[] ref = refSeqs.get(target.getContig()).getBases(); + final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length()); if (!refString.equalsIgnoreCase(allele.getBaseString())) { mismatchesReference = true; } - break; } } @@ -277,8 +288,7 @@ public static void main(final String[] args) { if (mismatchesReference) { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make()); failedAlleleCheck++; - } - else { + } else { sorter.add(builder.make()); } } @@ -313,21 +323,155 @@ public static void main(final String[] args) { return 0; } - protected static GenotypesContext fixGenotypes(final GenotypesContext originals, final Map reverseComplementAlleleMap) { + protected static Locatable getFlippedLocation(final VariantContext source, final LiftOver liftOver) { + Interval originalLocus = new Interval(source.getContig(), source.getStart() + 1, source.getEnd() + 1); + return liftOver.liftOver(originalLocus); + } + + /** + * @param source original variant context + * @param liftOver the LiftOver object to use for flipping + * @param referenceSequence the reference sequence of the target + * @return a flipped variant-context. + */ + protected static VariantContext flipIndel(final VariantContext source, final LiftOver liftOver, final ReferenceSequence referenceSequence) { + if (!source.isBiallelic()) return null; //only supporting biallelic indels, for now. + + final Locatable target = getFlippedLocation(source, liftOver); + if (target == null) return null; + + final Map reverseComplementAlleleMap = new HashMap<>(10); + + reverseComplementAlleleMap.clear(); + final List alleles = new ArrayList<>(); + + for (final Allele oldAllele : source.getAlleles()) { + // target.getStart is 1-based, reference bases are 0-based + final String alleleWithoutHead = oldAllele.getBaseString().substring(1, oldAllele.length()); + final char refBase = (char) referenceSequence.getBases()[target.getStart() - 1]; + final Allele fixedAllele = Allele.create(refBase + SequenceUtil.reverseComplement(alleleWithoutHead), oldAllele.isReference()); + alleles.add(fixedAllele); + reverseComplementAlleleMap.put(oldAllele, fixedAllele); + } + + VariantContextBuilder builder = new VariantContextBuilder(source.getSource(), + target.getContig(), + target.getStart(), + target.getEnd(), + alleles); + + builder.id(source.getID()); + builder.attributes(source.getAttributes()); + + builder.genotypes(fixGenotypes(source.getGenotypes(), reverseComplementAlleleMap)); + builder.filters(source.getFilters()); + builder.log10PError(source.getLog10PError()); + + return leftAlignVariant(builder.make(), referenceSequence); + } + + protected static GenotypesContext fixGenotypes(final GenotypesContext originals, final Map alleleMap) { // optimization: if nothing needs to be fixed then don't bother - if ( reverseComplementAlleleMap.isEmpty() ) { + if (alleleMap.isEmpty()) { return originals; } final GenotypesContext fixedGenotypes = GenotypesContext.create(originals.size()); - for ( final Genotype genotype : originals ) { - final List fixedAlleles = new ArrayList(); - for ( final Allele allele : genotype.getAlleles() ) { - final Allele fixedAllele = reverseComplementAlleleMap.containsKey(allele) ? reverseComplementAlleleMap.get(allele) : allele; + for (final Genotype genotype : originals) { + final List fixedAlleles = new ArrayList<>(); + for (final Allele allele : genotype.getAlleles()) { + final Allele fixedAllele = alleleMap.getOrDefault(allele, allele); fixedAlleles.add(fixedAllele); } fixedGenotypes.add(new GenotypeBuilder(genotype).alleles(fixedAlleles).make()); } return fixedGenotypes; } -} + + protected static VariantContext leftAlignVariant(final VariantContext vc, final ReferenceSequence referenceSequence) { + // from Adrian Tan, Gonçalo R. Abecasis and Hyun Min Kang. (2015) Unified Representation of Genetic Variants. Bioinformatics. + + boolean changesInAlleles = true; + + int start = vc.getStart(); + int end = vc.getEnd(); + + Map alleleBasesMap = new HashMap<>(); + vc.getAlleles().forEach(a -> alleleBasesMap.put(a, a.getBases())); + + // 1. while changes in alleles do + while (changesInAlleles) { + + changesInAlleles = false; + // 2. if alleles end with the same nucleotide then + if (alleleBasesMap.values().stream() + .collect(Collectors.groupingBy(a -> a[a.length - 1], Collectors.toSet())) + .size() == 1) { + // 3. truncate rightmost nucleotide of each allele + for (Allele allele : alleleBasesMap.keySet()) { + alleleBasesMap.put(allele, truncateBase(alleleBasesMap.get(allele), true)); + } + changesInAlleles = true; + end--; + // 4. end if + } + + // 5. if there exists an empty allele then + if (alleleBasesMap.values().stream() + .map(a -> a.length) + .anyMatch(l -> l == 0)) { + // 6. extend alleles 1 nucleotide to the left + for (Allele allele : alleleBasesMap.keySet()) { + // the first -1 for zero-base (getBases) versus 1-based (variant position) + // another -1 to get the base prior to the location of the start of the allele + alleleBasesMap.put(allele, extendOneBaseLeft(alleleBasesMap.get(allele), referenceSequence.getBases()[start - 2])); + } + changesInAlleles = true; + start--; + + // 7. end if + } + } + + // 8. while leftmost nucleotide of each allele are the same and all alleles have length 2 or more do + while (alleleBasesMap.values().stream() + .allMatch(a -> a.length >= 2) && + + alleleBasesMap.values().stream() + .collect(Collectors.groupingBy(a -> a[0], Collectors.toSet())) + .size() == 1 + ) { + + //9. truncate the leftmost base of the alleles + for (Allele allele : alleleBasesMap.keySet()) { + alleleBasesMap.put(allele, truncateBase(alleleBasesMap.get(allele), false)); + } + start++; + } + + VariantContextBuilder builder = new VariantContextBuilder(vc); + builder.start(start); + builder.stop(end); + + Map fixedAlleleMap = alleleBasesMap.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, me -> Allele.create(me.getValue(), me.getKey().isReference()))); + builder.alleles(fixedAlleleMap.values()); + builder.genotypes(fixGenotypes(vc.getGenotypes(), fixedAlleleMap)); + + return builder.make(); + } + + private static byte[] truncateBase(byte[] allele, boolean truncateRightmost) { + return Arrays.copyOfRange(allele, truncateRightmost ? 0 : 1, truncateRightmost ? allele.length - 1 : allele.length); + } + + private static byte[] extendOneBaseLeft(final byte[] bases, final byte base) { + + byte[] newBases = new byte[bases.length + 1]; + + System.arraycopy(bases, 0, newBases, 1, bases.length); + newBases[0] = base; + + return newBases; + } +} \ No newline at end of file diff --git a/src/test/java/picard/vcf/LiftoverVcfTest.java b/src/test/java/picard/vcf/LiftoverVcfTest.java index 758967e4d..01cb53b9a 100644 --- a/src/test/java/picard/vcf/LiftoverVcfTest.java +++ b/src/test/java/picard/vcf/LiftoverVcfTest.java @@ -1,5 +1,8 @@ package picard.vcf; +import htsjdk.samtools.liftover.LiftOver; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.util.CollectionUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.variant.variantcontext.*; import htsjdk.variant.vcf.VCFFileReader; @@ -12,9 +15,11 @@ import java.io.File; import java.util.*; +import static picard.vcf.LiftoverVcf.leftAlignVariant; + /** * Test class for LiftoverVcf. - * + *

* Created by ebanks on 8/11/15. */ public class LiftoverVcfTest extends CommandLineProgramTest { @@ -34,11 +39,20 @@ public void teardown() { IOUtil.deleteDirectoryTree(OUTPUT_DATA_PATH); } - @Test - public void testDoNotFixReverseComplementedIndels() { + @DataProvider(name="liftoverReverseStrand") + public Object[][] liftoverReverseStrand(){ + return new Object[][]{ + {"testLiftoverBiallelicIndels.vcf",3,0}, + {"testLiftoverMultiallelicIndels.vcf",0,2}, + }; + } + + + @Test(dataProvider = "liftoverReverseStrand" ) + public void testReverseComplementedIndels(String filename, int expectedPassing, int expectedFailing) { final File liftOutputFile = new File(OUTPUT_DATA_PATH, "lift-delete-me.vcf"); final File rejectOutputFile = new File(OUTPUT_DATA_PATH, "reject-delete-me.vcf"); - final File input = new File(TEST_DATA_PATH, "testLiftover.vcf"); + final File input = new File(TEST_DATA_PATH, filename); liftOutputFile.deleteOnExit(); rejectOutputFile.deleteOnExit(); @@ -54,45 +68,10 @@ public void testDoNotFixReverseComplementedIndels() { Assert.assertEquals(runPicardCommandLine(args), 0); final VCFFileReader liftReader = new VCFFileReader(liftOutputFile, false); - for (final VariantContext inputContext : liftReader) { - Assert.fail("there should be no passing indels in the liftover"); - } - final VCFFileReader rejectReader = new VCFFileReader(rejectOutputFile, false); - int counter = 0; - for (final VariantContext inputContext : rejectReader) { - counter++; - } - Assert.assertEquals(counter, 2, "the wrong number of rejected indels failed the liftover"); - } + Assert.assertEquals(liftReader.iterator().stream().count(), expectedPassing, "The wrong number of variants was lifted over"); - @Test - public void testFixReverseComplementedGenotypes() { - - final Allele refA = Allele.create("A", true); - final Allele altC = Allele.create("C", false); - final GenotypesContext originalGenotypes = GenotypesContext.create(3); - originalGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refA, refA)).make()); - originalGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refA, altC)).make()); - originalGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altC, altC)).make()); - - final Allele refT = Allele.create("T", true); - final Allele altG = Allele.create("G", false); - final GenotypesContext expectedGenotypes = GenotypesContext.create(3); - expectedGenotypes.add(new GenotypeBuilder("homref").alleles(Arrays.asList(refT, refT)).make()); - expectedGenotypes.add(new GenotypeBuilder("het").alleles(Arrays.asList(refT, altG)).make()); - expectedGenotypes.add(new GenotypeBuilder("homvar").alleles(Arrays.asList(altG, altG)).make()); - - final Map reverseComplementAlleleMap = new HashMap(2); - reverseComplementAlleleMap.put(refA, refT); - reverseComplementAlleleMap.put(altC, altG); - final GenotypesContext actualGenotypes = LiftoverVcf.fixGenotypes(originalGenotypes, reverseComplementAlleleMap); - - for ( final String sample : Arrays.asList("homref", "het", "homvar") ) { - final List expected = expectedGenotypes.get(sample).getAlleles(); - final List actual = actualGenotypes.get(sample).getAlleles(); - Assert.assertEquals(expected.get(0), actual.get(0)); - Assert.assertEquals(expected.get(1), actual.get(1)); - } + final VCFFileReader rejectReader = new VCFFileReader(rejectOutputFile, false); + Assert.assertEquals(rejectReader.iterator().stream().count(), expectedFailing, "The wrong number of variants was lifted over"); } @DataProvider(name = "dataTestMissingContigInReference") @@ -137,7 +116,7 @@ public void testMissingContigInReference(boolean warnOnMissingContext, int expec public void testWriteOriginalPosition(boolean shouldWriteOriginalPosition) { final File liftOutputFile = new File(OUTPUT_DATA_PATH, "lift-delete-me.vcf"); final File rejectOutputFile = new File(OUTPUT_DATA_PATH, "reject-delete-me.vcf"); - final File input = new File(TEST_DATA_PATH, "testLiftover.vcf"); + final File input = new File(TEST_DATA_PATH, "testLiftoverBiallelicIndels.vcf"); liftOutputFile.deleteOnExit(); rejectOutputFile.deleteOnExit(); @@ -159,12 +138,345 @@ public void testWriteOriginalPosition(boolean shouldWriteOriginalPosition) { if (shouldWriteOriginalPosition) { Assert.assertNotNull(vc.getAttribute(LiftoverVcf.ORIGINAL_CONTIG)); Assert.assertNotNull(vc.getAttribute(LiftoverVcf.ORIGINAL_START)); - } - else { + } else { Assert.assertFalse(vc.hasAttribute(LiftoverVcf.ORIGINAL_CONTIG)); Assert.assertFalse(vc.hasAttribute(LiftoverVcf.ORIGINAL_START)); } } } } + + @DataProvider(name = "indelFlipData") + public Iterator indelFlipData() { + + ReferenceSequence reference = new ReferenceSequence("chr1", 0, + "CAAAAAAAAAACGTACGTACTCTCTCTCTACGT".getBytes()); + // 123456789 123456789 123456789 123 + + Allele RefAAA = Allele.create("AAA", true); + Allele RefCAA = Allele.create("CAA", true); + Allele RefGTT = Allele.create("GTT", true); + Allele RefACGT = Allele.create("ACGT", true); + Allele RefAACG = Allele.create("AACG", true); + Allele RefTTT = Allele.create("TTT", true); + Allele RefCG = Allele.create("CG", true); + Allele RefT = Allele.create("T", true); + Allele RefA = Allele.create("A", true); + Allele RefC = Allele.create("C", true); + + Allele A = Allele.create("A", false); + Allele T = Allele.create("T", false); + Allele C = Allele.create("C", false); + Allele CG = Allele.create("CG", false); + Allele CAA = Allele.create("CAA", false); + Allele ACT = Allele.create("ACT", false); + Allele TTT = Allele.create("TTT", false); + Allele ATT = Allele.create("ATT", false); + Allele TAG = Allele.create("TAG", false); + Allele ACGT = Allele.create("ACGT", false); + Allele AACG = Allele.create("AACG", false); + + final List tests = new ArrayList<>(); + + final int CHAIN_SIZE = 540; // the length of the single chain in CHAIN_FILE + + VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1"); + VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); + + // simple deletion + // TTT*/T -> AAA*/A turns into left-aligned CAA*/C at position 1 + int start = 537; + int stop = start + 2; + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(RefTTT, T)); + result_builder.start(1).stop(3).alleles(CollectionUtil.makeList(RefCAA, C)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + //simple insertion + // T*/TTT -> A*/AAA -> turns into left-aligned C*/CAA at position 1 + stop = start; + builder.source("test2").alleles(CollectionUtil.makeList(RefT, TTT)).stop(stop); + result_builder.alleles(CollectionUtil.makeList(RefC, CAA)).start(1).stop(1); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + // non-simple deletion + // ACGT(T)*/A(T) -> AACG(T)*/A(T) + // position of 13 ^ in result + start = CHAIN_SIZE - 13; + stop = start + 3; + + builder.source("test3").start(start).stop(stop).alleles(CollectionUtil.makeList(RefACGT, A)); + result_builder.start(CHAIN_SIZE - stop).stop(CHAIN_SIZE - start).alleles(CollectionUtil.makeList(RefAACG, A)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + // "CAAAAAAAAAACG---CGTACTCTCTCTCTACGT".getBytes()); + // "CAAAAAAAAAACGacgCGTACTCTCTCTCTACGT".getBytes()); + + // equivalent to + + // "CAAAAAAAAA---ACGCGTACTCTCTCTCTACGT".getBytes()); + // "CAAAAAAAAAACGacgCGTACTCTCTCTCTACGT".getBytes()); + + //non-simple insertion + // A(C)*/ACGT(C) -> G(T)*/GACG(T) -> gets moves 3 bases back due to left-aligning... + // position of 13 ^ in result + // ...and becomes A->AACG at position 10 + + start = CHAIN_SIZE - 13; + stop = start; + builder.source("test4").stop(stop).start(start).alleles(CollectionUtil.makeList(RefA, ACGT)); + result_builder.start(10).stop(10).alleles(CollectionUtil.makeList(RefA, AACG)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + +// // outside of chain (due to shifting of anchor point) + start = stop = CHAIN_SIZE; + builder.source("test5").stop(stop).start(start).alleles(CollectionUtil.makeList(RefA, ACGT)); + tests.add(new Object[]{builder.make(), reference, null}); + +// // outside of chain + start = stop = CHAIN_SIZE + 1; + builder.source("test6").stop(stop).start(start).alleles(CollectionUtil.makeList(RefA, ACGT)); + tests.add(new Object[]{builder.make(), reference, null}); + +// // MNP + // GTT*(T)/ACGT(T) -> AAA(C)*/AACG(T) -> which is then normalized to A*/CG at position 11 + // pos 11 ^ in the result + start = CHAIN_SIZE - 11; + stop = start + 2; + builder.source("test7").stop(stop).start(start).alleles(CollectionUtil.makeList(RefGTT, ACGT)); + result_builder.start(11).stop(11).alleles(CollectionUtil.makeList(RefA, CG)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + +// // MNP + // ACGT*(T)/ATT*(T) -> AACG(T)*/AAA(T) -> by normalization CG(T)*/A(T) + // pos 13 ^ in the result + start = CHAIN_SIZE - 13; + stop = start + 3; + builder.source("test8").stop(stop).start(start).alleles(CollectionUtil.makeList(RefACGT, ATT)); + result_builder.start(CHAIN_SIZE - stop + 2).stop(CHAIN_SIZE - stop + 3).alleles(CollectionUtil.makeList(RefCG, A)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + // needs left-aligning + // T->TAG --> T(A)/TCT(A) -> by normalization A/ACT @ 19 + // position 29 ^ + start = CHAIN_SIZE - 29; + stop = start; + builder.source("test9").stop(stop).start(start).alleles(CollectionUtil.makeList(RefT, TAG)); + result_builder.start(19).stop(19).alleles(CollectionUtil.makeList(RefA, ACT)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + return tests.iterator(); + } + + @Test(dataProvider = "indelFlipData") + public void testFlipIndel(final VariantContext source, final ReferenceSequence reference, final VariantContext result) { + + final LiftOver liftOver = new LiftOver(CHAIN_FILE); + + final VariantContext flipped = LiftoverVcf.flipIndel(source, liftOver, reference); + + assertVcAreEqual(flipped, result); + } + + @DataProvider(name = "leftAlignAllelesData") + public Iterator leftAlignAllelesData() { + + ReferenceSequence reference = new ReferenceSequence("chr1", 0, + "CAAAAAAAAAACGTACGTACTCTCTCTCTACGT".getBytes()); + // 123456789 123456789 123456789 123 + + Allele RefG = Allele.create("G", true); + Allele A = Allele.create("A", false); + + Allele RefA = Allele.create("A", true); + Allele RefC = Allele.create("C", true); + Allele RefAA = Allele.create("AA", true); + Allele RefCA = Allele.create("CA", true); + Allele RefCT = Allele.create("CT", true); + Allele RefTC = Allele.create("TC", true); + Allele RefACT = Allele.create("ACT", true); + Allele RefCTCT = Allele.create("CTCT", true); + Allele RefTCTC = Allele.create("TCTC", true); + + Allele AA = Allele.create("AA", false); + Allele C = Allele.create("C", false); + Allele CA = Allele.create("CA", false); + Allele ACT = Allele.create("ACT", false); + Allele CTCT = Allele.create("CTCT", false); + Allele TCTC = Allele.create("TCTC", false); + Allele CT = Allele.create("CT", false); + Allele TC = Allele.create("TC", false); + + Allele T = Allele.create("T", false); + + final List tests = new ArrayList<>(); + + VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1"); + VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); + + // simple SNP + // G*/A -> G/A + int start = 13; + int stop = start; + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(RefG, A)); + result_builder.start(stop).stop(start).alleles(CollectionUtil.makeList(RefG, A)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + for (start = 1; start <= reference.getBases().length; start++) { + builder.start(start).stop(start); + builder.alleles(CollectionUtil.makeList( + Allele.create(reference.getBaseString().substring(start - 1, start), true), + reference.getBaseString().charAt(start - 1) == 'A' ? T : A)); + + tests.add(new Object[]{builder.make(), reference, builder.make()}); + } + + // AA/A in initial polyA repeat -> CA/C at the beginning + result_builder.start(1).stop(2).alleles(CollectionUtil.makeList(RefCA, C)); + for (start = 2; start <= 11; start++) { + builder.start(start).stop(start + 1); + builder.alleles(CollectionUtil.makeList( + RefAA, A)); + + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + } + + // A/AA in initial polyA repeat -> C/CA at the beginning + result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(RefC, CA)); + for (start = 2; start <= 11; start++) { + builder.start(start).stop(start); + builder.alleles(CollectionUtil.makeList( + RefA, AA)); + + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + } + + //CT/CTCT -> A/ACT in CT repeat region + result_builder.start(19).stop(19).alleles(CollectionUtil.makeList(RefA, ACT)); + for (start = 20; start <= 27; start += 2) { + builder.start(start).stop(start + 1); + builder.alleles(CollectionUtil.makeList( + RefCT, CTCT)); + + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + } + + //TC/TCTC -> A/ACT in CT repeat region + for (start = 21; start <= 29; start += 2) { + builder.start(start).stop(start + 1); + builder.alleles(CollectionUtil.makeList( + RefTC, TCTC)); + + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + } + + //CTCT/CT -> ACT/A in CT repeat region + result_builder.start(19).stop(21).alleles(CollectionUtil.makeList(RefACT, A)); + for (start = 20; start <= 27; start += 2) { + builder.start(start).stop(start + 3); + builder.alleles(CollectionUtil.makeList( + RefCTCT, CT)); + + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + } + + //TCTC/TC-> ACT/A in CT repeat region + for (start = 21; start <= 29; start += 2) { + builder.start(start).stop(start + 3); + builder.alleles(CollectionUtil.makeList( + RefTCTC, TC)); + + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + } + + // for ease of reading, here's the reference sequence + // "CAAAAAAAAAACGTACGTACTCTCTCTCTACGT" + // 123456789 123456789 123456789 123 + + result_builder.alleles("AACGT", "A").start(10).stop(14); + for (start = 10; start < 17; start++) { + for (stop = start + 4; stop < 20; stop++) { + builder.alleles( + // -1 here due to reference string being 0-based. + reference.getBaseString().substring(start - 1, stop + 1 - 1), + reference.getBaseString().substring(start - 1, stop - 3 - 1)).start(start).stop(stop); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + } + } + + // test vc with genotypes: + + result_builder.start(10).stop(14).alleles("AACGT", "A", "AACG", "AACGTACGT"); + builder.start(9).stop(18).alleles("AAACGTACGT", "AAACGT", "AAACGACGT", "AAACGTACGTACGT"); + Collection genotypes = new ArrayList<>(); + Collection results_genotypes = new ArrayList<>(); + GenotypeBuilder gBuilder = new GenotypeBuilder(); + + for (int sample = 1; sample < 4; sample++) { + gBuilder.name("sample" + sample) + .alleles(CollectionUtil.makeList(builder.getAlleles().get(0), builder.getAlleles().get(sample))); + genotypes.add(gBuilder.make()); + gBuilder.alleles(CollectionUtil.makeList(result_builder.getAlleles().get(0), result_builder.getAlleles().get(sample))); + results_genotypes.add(gBuilder.make()); + } + gBuilder.name("sample_non_ref_het") + .alleles(CollectionUtil.makeList(builder.getAlleles().get(1), builder.getAlleles().get(2))); + genotypes.add(gBuilder.make()); + gBuilder.alleles(CollectionUtil.makeList(result_builder.getAlleles().get(1), result_builder.getAlleles().get(2))); + results_genotypes.add(gBuilder.make()); + + builder.genotypes(genotypes); + result_builder.genotypes(results_genotypes); + + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + return tests.iterator(); + } + + @Test(dataProvider = "leftAlignAllelesData") + public void testLeftAlignVariants(final VariantContext source, final ReferenceSequence reference, final VariantContext result) { + + assertVcAreEqual(leftAlignVariant(source, reference), result); + } + + private void assertVcAreEqual(final VariantContext actual, final VariantContext expected) { + + if (expected == null) { + Assert.assertNull(actual); + return; + } + + Assert.assertNotNull(actual); + Assert.assertEquals(actual.getContig(), expected.getContig()); + Assert.assertEquals(actual.getStart(), expected.getStart()); + Assert.assertEquals(actual.getEnd(), expected.getEnd()); + + expected.getAlleles().sort(new AlleleComparator()); + actual.getAlleles().sort(new AlleleComparator()); + + Assert.assertEquals(expected.getAlleles(), actual.getAlleles()); + assertGenotypeContextsAreEquals(actual.getGenotypes(), expected.getGenotypes()); + } + + static class AlleleComparator implements Comparator { + + @Override + public int compare(Allele o1, Allele o2) { + int ret = (o1.isReference() ? 1 : 0) - (o2.isReference() ? 1 : 0); + if (ret != 0) return ret; + + return o1.getBaseString().compareTo(o2.getBaseString()); + } + } + + private void assertGenotypeContextsAreEquals(final GenotypesContext actual, final GenotypesContext expected) { + if (expected == null) { + Assert.assertNull(actual); + return; + } + Assert.assertEquals(expected.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName()); + + for (String name : expected.getSampleNamesOrderedByName()) { + Assert.assertEquals(expected.get(name).getAlleles(), actual.get(name).getAlleles()); + } + } } diff --git a/testdata/picard/vcf/testLiftover.vcf b/testdata/picard/vcf/testLiftoverBiallelicIndels.vcf similarity index 83% rename from testdata/picard/vcf/testLiftover.vcf rename to testdata/picard/vcf/testLiftoverBiallelicIndels.vcf index e6161ed82..f738efd08 100644 --- a/testdata/picard/vcf/testLiftover.vcf +++ b/testdata/picard/vcf/testLiftoverBiallelicIndels.vcf @@ -2,3 +2,4 @@ #CHROM POS ID REF ALT QUAL FILTER INFO chr1 1 . C CCCCT 15676.17 PASS . chr1 61 . CA C 724.43 PASS . +chr1 72 . T A 100 PASS . \ No newline at end of file diff --git a/testdata/picard/vcf/testLiftoverMultiallelicIndels.vcf b/testdata/picard/vcf/testLiftoverMultiallelicIndels.vcf new file mode 100644 index 000000000..b37615299 --- /dev/null +++ b/testdata/picard/vcf/testLiftoverMultiallelicIndels.vcf @@ -0,0 +1,4 @@ +##fileformat=VCFv4.1 +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 1 . C CCCCT,CCCt 15676.17 PASS . +chr1 61 . CA C,CAA 724.43 PASS . From 217fe26bd28c6f8703d2009c1f0f767695c1c8f1 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 23 Jun 2017 21:07:08 -0400 Subject: [PATCH 2/7] - responding to reviewer's comments --- src/main/java/picard/vcf/LiftoverVcf.java | 259 +++++++++++++-------- src/test/java/picard/vcf/LiftoverVcfTest.java | 187 +++++++-------- .../picard/vcf/testLiftoverBiallelicIndels.vcf | 2 +- 3 files changed, 250 insertions(+), 198 deletions(-) diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index 8a2161485..0b6b70b60 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -11,12 +11,7 @@ import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder; -import htsjdk.variant.vcf.VCFFileReader; -import htsjdk.variant.vcf.VCFFilterHeaderLine; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFRecordCodec; +import htsjdk.variant.vcf.*; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; @@ -27,8 +22,6 @@ import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.*; -import java.util.function.Function; -import java.util.stream.Collector; import java.util.stream.Collectors; /** @@ -132,6 +125,11 @@ public static final String ORIGINAL_START = "OriginalStart"; /** + * Attribute used to store the position of the failed variant on the target contig prior to finding out that alleles do not match. + */ + public static final String ATTEMPTED_LOCUS = "AttemptedLocus"; + + /** * Metadata to be added to the Passing file. */ private static final List ATTRS = CollectionUtil.makeList( @@ -139,7 +137,9 @@ new VCFInfoHeaderLine(ORIGINAL_START, 1, VCFHeaderLineType.String, "The position of the variant on the source contig prior to liftover.") ); + private VariantContextWriter rejects; private final Log log = Log.getInstance(LiftoverVcf.class); + private SortingCollection sorter; @Override protected int doWork() { @@ -178,15 +178,15 @@ protected int doWork() { .setOutputFile(OUTPUT).setReferenceDictionary(walker.getSequenceDictionary()).build(); out.writeHeader(outHeader); - final VariantContextWriter rejects = new VariantContextWriterBuilder().setOutputFile(REJECT) + rejects = new VariantContextWriterBuilder().setOutputFile(REJECT) .unsetOption(Options.INDEX_ON_THE_FLY) .modifyOption(Options.ALLOW_MISSING_FIELDS_IN_HEADER, ALLOW_MISSING_FIELDS_IN_HEADER) .build(); final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader()); for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line); - if (WRITE_ORIGINAL_POSITION) { - for (final VCFInfoHeaderLine line : ATTRS) rejectHeader.addMetaDataLine(line); - } + + rejectHeader.addMetaDataLine(new VCFInfoHeaderLine(ATTEMPTED_LOCUS,1, VCFHeaderLineType.String, "The locus of the variant in the TARGET prior to failing due to mismatching alleles.")); + rejects.writeHeader(rejectHeader); //////////////////////////////////////////////////////////////////////// @@ -196,7 +196,7 @@ protected int doWork() { long failedLiftover = 0, failedAlleleCheck = 0, total = 0; log.info("Lifting variants over and sorting."); - final SortingCollection sorter = SortingCollection.newInstance(VariantContext.class, + sorter = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(outHeader, ALLOW_MISSING_FIELDS_IN_HEADER || VALIDATION_STRINGENCY != ValidationStringency.STRICT), outHeader.getVCFRecordComparator(), MAX_RECORDS_IN_RAM, @@ -210,10 +210,10 @@ protected int doWork() { ++total; final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd()); final Interval target = liftOver.liftOver(source, LIFTOVER_MIN_MATCH); + final ReferenceSequence refSeq; // if the target is null OR (the target is reverse complemented AND the variant is a non-biallelic indel or mixed), then we cannot lift it over - - if (target == null || (target.isNegativeStrand() && (ctx.isMixed() || ctx.isIndel() && !ctx.isBiallelic()))) { + if (target == null || (target.isNegativeStrand() && (ctx.isMixed() || (ctx.isIndel() && !ctx.isBiallelic())))) { final String reason = (target == null) ? FILTER_NO_TARGET : FILTER_CANNOT_LIFTOVER_INDEL; rejects.add(new VariantContextBuilder(ctx).filter(reason).make()); failedLiftover++; @@ -221,78 +221,33 @@ protected int doWork() { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_NO_TARGET).make()); failedLiftover++; - String missingContigMessage = "Encountered a contig, " + target.getContig() + " that is not part of the target reference."; + final String missingContigMessage = "Encountered a contig, " + target.getContig() + " that is not part of the target reference."; if (WARN_ON_MISSING_CONTIG) { log.warn(missingContigMessage); } else { log.error(missingContigMessage); return EXIT_CODE_WHEN_CONTIG_NOT_IN_REFERENCE; } - } else if ( target.isNegativeStrand() && ctx.isIndel() && ctx.isBiallelic()) { - final VariantContextBuilder flippedIndel=new VariantContextBuilder(flipIndel(ctx,liftOver,refSeqs.get(target.getContig()))); - - if (WRITE_ORIGINAL_POSITION) { - flippedIndel.attribute(ORIGINAL_CONTIG, source.getContig()); - flippedIndel.attribute(ORIGINAL_START, source.getStart()); - } - sorter.add(flippedIndel.make()); - - } else { - // Fix the alleles if we went from positive to negative strand - reverseComplementAlleleMap.clear(); - final List alleles = new ArrayList<>(); - - for (final Allele oldAllele : ctx.getAlleles()) { - if (target.isPositiveStrand() || oldAllele.isSymbolic()) { - alleles.add(oldAllele); - } else { - final Allele fixedAllele = Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()); - alleles.add(fixedAllele); - reverseComplementAlleleMap.put(oldAllele, fixedAllele); - } - } + } else if (target.isNegativeStrand() && ctx.isIndel() && ctx.isBiallelic()) { + refSeq = refSeqs.get(target.getContig()); + //flipping indels: - // Build the new variant context - final VariantContextBuilder builder = new VariantContextBuilder( - ctx.getSource(), - target.getContig(), - target.getStart(), - target.getEnd(), - alleles); - - builder.id(ctx.getID()); - builder.attributes(ctx.getAttributes()); - - if (WRITE_ORIGINAL_POSITION) { - builder.attribute(ORIGINAL_CONTIG, source.getContig()); - builder.attribute(ORIGINAL_START, source.getStart()); - } - builder.genotypes(fixGenotypes(ctx.getGenotypes(), reverseComplementAlleleMap)); - builder.filters(ctx.getFilters()); - builder.log10PError(ctx.getLog10PError()); - - // Check that the reference allele still agrees with the reference sequence - boolean mismatchesReference = false; - for (final Allele allele : builder.getAlleles()) { - if (allele.isReference()) { - final byte[] ref = refSeqs.get(target.getContig()).getBases(); - final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length()); - - if (!refString.equalsIgnoreCase(allele.getBaseString())) { - mismatchesReference = true; - } - break; + final VariantContext flippedIndel = flipIndel(ctx, liftOver, refSeq); + if (flippedIndel == null) { + rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER_INDEL).make()); + } else { + if (tryToAddVariant(flippedIndel,refSeq, reverseComplementAlleleMap,ctx)){ + failedAlleleCheck++; } } + } else { + refSeq = refSeqs.get(target.getContig()); + final VariantContext liftedVariant=liftSnp(ctx,target,refSeq); - if (mismatchesReference) { - rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make()); + if (!tryToAddVariant(liftedVariant, refSeq, reverseComplementAlleleMap, ctx)) { failedAlleleCheck++; - } else { - sorter.add(builder.make()); } } - progress.record(ctx.getContig(), ctx.getStart()); } @@ -323,9 +278,84 @@ protected int doWork() { return 0; } - protected static Locatable getFlippedLocation(final VariantContext source, final LiftOver liftOver) { - Interval originalLocus = new Interval(source.getContig(), source.getStart() + 1, source.getEnd() + 1); - return liftOver.liftOver(originalLocus); + /** + * utility function to attempt to add a variant. Checks that the reference allele still matches the reference (which may have changed) + * + * @param vc new {@link VariantContext} + * @param refSeq {@link ReferenceSequence} of new reference + * @param alleleMap a {@link Map} mapping the old alleles to the new alleles (for fixing the genotypes) + * @param source the original {@link VariantContext} to use for putting the original location information into vc + * @return true if successful, false if failed due to mismatching reference allele. + */ + private boolean tryToAddVariant(final VariantContext vc, final ReferenceSequence refSeq, final Map alleleMap, final VariantContext source) { + + final VariantContextBuilder builder = new VariantContextBuilder(vc); + + if (WRITE_ORIGINAL_POSITION) { + builder.attribute(ORIGINAL_CONTIG, source.getContig()); + builder.attribute(ORIGINAL_START, source.getStart()); + } + builder.genotypes(fixGenotypes(source.getGenotypes(), alleleMap)); + builder.filters(source.getFilters()); + builder.log10PError(source.getLog10PError()); + builder.attributes(source.getAttributes()); + builder.id(source.getID()); + + + // Check that the reference allele still agrees with the reference sequence + boolean mismatchesReference = false; + for (final Allele allele : builder.getAlleles()) { + if (allele.isReference()) { + final byte[] ref = refSeq.getBases(); + final String refString = StringUtil.bytesToString(ref, vc.getStart() - 1, vc.getEnd()-vc.getStart()+1); + + if (!refString.equalsIgnoreCase(allele.getBaseString())) { + mismatchesReference = true; + } + break; + } + } + + if (mismatchesReference) { + rejects.add(new VariantContextBuilder(source) + .filter(FILTER_MISMATCHING_REF_ALLELE) + .attribute(ATTEMPTED_LOCUS, String.format("%s:%d-%d",vc.getContig(),vc.getStart(),vc.getEnd())) + .make()); + return false; + } else { + sorter.add(builder.make()); + return true; + } + } + + + protected static VariantContext liftSnp(final VariantContext source, final Interval target , final ReferenceSequence referenceSequence){ + // Fix the alleles if we went from positive to negative strand + final Map reverseComplementAlleleMap = new HashMap<>(2); + + final List alleles = new ArrayList<>(); + + for (final Allele oldAllele : source.getAlleles()) { + if (target.isPositiveStrand() || oldAllele.isSymbolic()) { + alleles.add(oldAllele); + } else { + final Allele fixedAllele = Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()); + alleles.add(fixedAllele); + reverseComplementAlleleMap.put(oldAllele, fixedAllele); + } + } + + // Build the new variant context + final VariantContextBuilder builder = new VariantContextBuilder( + source.getSource(), + target.getContig(), + target.getStart(), + target.getEnd(), + alleles); + + builder.id(source.getID()); + + return builder.make(); } /** @@ -337,27 +367,36 @@ protected static Locatable getFlippedLocation(final VariantContext source, final protected static VariantContext flipIndel(final VariantContext source, final LiftOver liftOver, final ReferenceSequence referenceSequence) { if (!source.isBiallelic()) return null; //only supporting biallelic indels, for now. - final Locatable target = getFlippedLocation(source, liftOver); + final Interval originalLocus = new Interval(source.getContig(), source.getStart(), source.getEnd()); + final Locatable target = liftOver.liftOver(originalLocus); + if (target == null) return null; - final Map reverseComplementAlleleMap = new HashMap<>(10); + // a boolean to protect against trying to access the -1 position in the reference array + final boolean addToStart = target.getStart() > 1; + + final Map reverseComplementAlleleMap = new HashMap<>(2); reverseComplementAlleleMap.clear(); final List alleles = new ArrayList<>(); for (final Allele oldAllele : source.getAlleles()) { // target.getStart is 1-based, reference bases are 0-based - final String alleleWithoutHead = oldAllele.getBaseString().substring(1, oldAllele.length()); - final char refBase = (char) referenceSequence.getBases()[target.getStart() - 1]; - final Allele fixedAllele = Allele.create(refBase + SequenceUtil.reverseComplement(alleleWithoutHead), oldAllele.isReference()); + final StringBuilder alleleBuilder = new StringBuilder(target.getEnd() - target.getStart() + 1); + + if (addToStart) alleleBuilder.append((char) referenceSequence.getBases()[target.getStart() - 2]); + alleleBuilder.append(SequenceUtil.reverseComplement(oldAllele.getBaseString().substring(1, oldAllele.length()))); + if (!addToStart) alleleBuilder.append((char) referenceSequence.getBases()[target.getEnd() - 1]); + + final Allele fixedAllele = Allele.create(alleleBuilder.toString(), oldAllele.isReference()); alleles.add(fixedAllele); reverseComplementAlleleMap.put(oldAllele, fixedAllele); } - VariantContextBuilder builder = new VariantContextBuilder(source.getSource(), + final VariantContextBuilder builder = new VariantContextBuilder(source.getSource(), target.getContig(), - target.getStart(), - target.getEnd(), + target.getStart() - (addToStart ? 1 : 0), + target.getEnd() - (addToStart ? 1 : 0), alleles); builder.id(source.getID()); @@ -388,15 +427,28 @@ protected static GenotypesContext fixGenotypes(final GenotypesContext originals, return fixedGenotypes; } + /** + * Normalizes and left alignes a {@link VariantContext}. + * + * Based on Adrian Tan, Gonçalo R. Abecasis and Hyun Min Kang. (2015) + * Unified Representation of Genetic Variants. Bioinformatics. + * @param vc the {@link VariantContext} to be normalized + * @param referenceSequence the {@link ReferenceSequence} of the same contig as vc + * @return a new {@link VariantContext} which represents the same variation as vc but has + * been normalized and left-aligned + */ protected static VariantContext leftAlignVariant(final VariantContext vc, final ReferenceSequence referenceSequence) { - // from Adrian Tan, Gonçalo R. Abecasis and Hyun Min Kang. (2015) Unified Representation of Genetic Variants. Bioinformatics. boolean changesInAlleles = true; int start = vc.getStart(); int end = vc.getEnd(); - Map alleleBasesMap = new HashMap<>(); + if (!vc.getContig().equals(referenceSequence.getName())) { + throw new IllegalArgumentException("vc contig doesn't match that of supplied reference: " + vc.getContig() + " != " + referenceSequence.getName()); + } + + final Map alleleBasesMap = new HashMap<>(); vc.getAlleles().forEach(a -> alleleBasesMap.put(a, a.getBases())); // 1. while changes in alleles do @@ -406,9 +458,9 @@ protected static VariantContext leftAlignVariant(final VariantContext vc, final // 2. if alleles end with the same nucleotide then if (alleleBasesMap.values().stream() .collect(Collectors.groupingBy(a -> a[a.length - 1], Collectors.toSet())) - .size() == 1) { + .size() == 1 && end > 1) { // 3. truncate rightmost nucleotide of each allele - for (Allele allele : alleleBasesMap.keySet()) { + for (final Allele allele : alleleBasesMap.keySet()) { alleleBasesMap.put(allele, truncateBase(alleleBasesMap.get(allele), true)); } changesInAlleles = true; @@ -421,10 +473,14 @@ protected static VariantContext leftAlignVariant(final VariantContext vc, final .map(a -> a.length) .anyMatch(l -> l == 0)) { // 6. extend alleles 1 nucleotide to the left - for (Allele allele : alleleBasesMap.keySet()) { + for (final Allele allele : alleleBasesMap.keySet()) { // the first -1 for zero-base (getBases) versus 1-based (variant position) // another -1 to get the base prior to the location of the start of the allele - alleleBasesMap.put(allele, extendOneBaseLeft(alleleBasesMap.get(allele), referenceSequence.getBases()[start - 2])); + final byte extraBase = start > 1 ? + referenceSequence.getBases()[start - 2] : + referenceSequence.getBases()[end]; + + alleleBasesMap.put(allele, extendOneBase(alleleBasesMap.get(allele), extraBase, start > 1)); } changesInAlleles = true; start--; @@ -443,17 +499,17 @@ protected static VariantContext leftAlignVariant(final VariantContext vc, final ) { //9. truncate the leftmost base of the alleles - for (Allele allele : alleleBasesMap.keySet()) { + for (final Allele allele : alleleBasesMap.keySet()) { alleleBasesMap.put(allele, truncateBase(alleleBasesMap.get(allele), false)); } start++; } - VariantContextBuilder builder = new VariantContextBuilder(vc); + final VariantContextBuilder builder = new VariantContextBuilder(vc); builder.start(start); builder.stop(end); - Map fixedAlleleMap = alleleBasesMap.entrySet().stream() + final Map fixedAlleleMap = alleleBasesMap.entrySet().stream() .collect(Collectors.toMap(Map.Entry::getKey, me -> Allele.create(me.getValue(), me.getKey().isReference()))); builder.alleles(fixedAlleleMap.values()); builder.genotypes(fixGenotypes(vc.getGenotypes(), fixedAlleleMap)); @@ -461,16 +517,21 @@ protected static VariantContext leftAlignVariant(final VariantContext vc, final return builder.make(); } - private static byte[] truncateBase(byte[] allele, boolean truncateRightmost) { + private static byte[] truncateBase(final byte[] allele, final boolean truncateRightmost) { return Arrays.copyOfRange(allele, truncateRightmost ? 0 : 1, truncateRightmost ? allele.length - 1 : allele.length); } - private static byte[] extendOneBaseLeft(final byte[] bases, final byte base) { + private static byte[] extendOneBase(final byte[] bases, final byte base, boolean toTheLeft) { - byte[] newBases = new byte[bases.length + 1]; + final byte[] newBases = new byte[bases.length + 1]; - System.arraycopy(bases, 0, newBases, 1, bases.length); - newBases[0] = base; + if (toTheLeft) { + System.arraycopy(bases, 0, newBases, 1, bases.length); + newBases[0] = base; + } else { + System.arraycopy(bases, 0, newBases, 0, bases.length); + newBases[bases.length] = base; + } return newBases; } diff --git a/src/test/java/picard/vcf/LiftoverVcfTest.java b/src/test/java/picard/vcf/LiftoverVcfTest.java index 01cb53b9a..999aad48e 100644 --- a/src/test/java/picard/vcf/LiftoverVcfTest.java +++ b/src/test/java/picard/vcf/LiftoverVcfTest.java @@ -39,17 +39,17 @@ public void teardown() { IOUtil.deleteDirectoryTree(OUTPUT_DATA_PATH); } - @DataProvider(name="liftoverReverseStrand") - public Object[][] liftoverReverseStrand(){ + @DataProvider(name = "liftoverReverseStrand") + public Object[][] liftoverReverseStrand() { return new Object[][]{ - {"testLiftoverBiallelicIndels.vcf",3,0}, - {"testLiftoverMultiallelicIndels.vcf",0,2}, + {"testLiftoverBiallelicIndels.vcf", 3, 0}, + {"testLiftoverMultiallelicIndels.vcf", 0, 2}, }; } @Test(dataProvider = "liftoverReverseStrand" ) - public void testReverseComplementedIndels(String filename, int expectedPassing, int expectedFailing) { + public void testReverseComplementedIndels(final String filename, final int expectedPassing, final int expectedFailing) { final File liftOutputFile = new File(OUTPUT_DATA_PATH, "lift-delete-me.vcf"); final File rejectOutputFile = new File(OUTPUT_DATA_PATH, "reject-delete-me.vcf"); final File input = new File(TEST_DATA_PATH, filename); @@ -83,7 +83,7 @@ public void testReverseComplementedIndels(String filename, int expectedPassing, } @Test(dataProvider = "dataTestMissingContigInReference") - public void testMissingContigInReference(boolean warnOnMissingContext, int expectedReturnCode) { + public void testMissingContigInReference(final boolean warnOnMissingContext, final int expectedReturnCode) { final File liftOutputFile = new File(OUTPUT_DATA_PATH, "lift-delete-me.vcf"); final File rejectOutputFile = new File(OUTPUT_DATA_PATH, "reject-delete-me.vcf"); final File input = new File(TEST_DATA_PATH, "testLiftoverUsingMissingContig.vcf"); @@ -113,7 +113,7 @@ public void testMissingContigInReference(boolean warnOnMissingContext, int expec } @Test(dataProvider = "dataTestWriteOriginalPosition") - public void testWriteOriginalPosition(boolean shouldWriteOriginalPosition) { + public void testWriteOriginalPosition(final boolean shouldWriteOriginalPosition) { final File liftOutputFile = new File(OUTPUT_DATA_PATH, "lift-delete-me.vcf"); final File rejectOutputFile = new File(OUTPUT_DATA_PATH, "reject-delete-me.vcf"); final File input = new File(TEST_DATA_PATH, "testLiftoverBiallelicIndels.vcf"); @@ -133,8 +133,8 @@ public void testWriteOriginalPosition(boolean shouldWriteOriginalPosition) { runPicardCommandLine(args); - try (VCFFileReader liftReader = new VCFFileReader(liftOutputFile, false)) { - for (VariantContext vc : liftReader) { + try (final VCFFileReader liftReader = new VCFFileReader(liftOutputFile, false)) { + for (final VariantContext vc : liftReader) { if (shouldWriteOriginalPosition) { Assert.assertNotNull(vc.getAttribute(LiftoverVcf.ORIGINAL_CONTIG)); Assert.assertNotNull(vc.getAttribute(LiftoverVcf.ORIGINAL_START)); @@ -149,43 +149,45 @@ public void testWriteOriginalPosition(boolean shouldWriteOriginalPosition) { @DataProvider(name = "indelFlipData") public Iterator indelFlipData() { - ReferenceSequence reference = new ReferenceSequence("chr1", 0, + final ReferenceSequence reference = new ReferenceSequence("chr1", 0, "CAAAAAAAAAACGTACGTACTCTCTCTCTACGT".getBytes()); // 123456789 123456789 123456789 123 - Allele RefAAA = Allele.create("AAA", true); - Allele RefCAA = Allele.create("CAA", true); - Allele RefGTT = Allele.create("GTT", true); - Allele RefACGT = Allele.create("ACGT", true); - Allele RefAACG = Allele.create("AACG", true); - Allele RefTTT = Allele.create("TTT", true); - Allele RefCG = Allele.create("CG", true); - Allele RefT = Allele.create("T", true); - Allele RefA = Allele.create("A", true); - Allele RefC = Allele.create("C", true); - - Allele A = Allele.create("A", false); - Allele T = Allele.create("T", false); - Allele C = Allele.create("C", false); - Allele CG = Allele.create("CG", false); - Allele CAA = Allele.create("CAA", false); - Allele ACT = Allele.create("ACT", false); - Allele TTT = Allele.create("TTT", false); - Allele ATT = Allele.create("ATT", false); - Allele TAG = Allele.create("TAG", false); - Allele ACGT = Allele.create("ACGT", false); - Allele AACG = Allele.create("AACG", false); + final Allele RefCAA = Allele.create("CAA", true); + final Allele RefGTT = Allele.create("GTT", true); + final Allele RefACGT = Allele.create("ACGT", true); + final Allele RefAACG = Allele.create("AACG", true); + final Allele RefTTT = Allele.create("TTT", true); + final Allele RefCG = Allele.create("CG", true); + final Allele RefT = Allele.create("T", true); + final Allele RefA = Allele.create("A", true); + final Allele RefC = Allele.create("C", true); + final Allele RefG = Allele.create("G", true); + + final Allele A = Allele.create("A", false); + final Allele T = Allele.create("T", false); + final Allele C = Allele.create("C", false); + final Allele CG = Allele.create("CG", false); + final Allele CAA = Allele.create("CAA", false); + final Allele ACT = Allele.create("ACT", false); + final Allele TTT = Allele.create("TTT", false); + final Allele ATT = Allele.create("ATT", false); + final Allele GTT = Allele.create("GTT", false); + final Allele AAC= Allele.create("AAC", false); + final Allele TAG = Allele.create("TAG", false); + final Allele ACGT = Allele.create("ACGT", false); + final Allele AACG = Allele.create("AACG", false); final List tests = new ArrayList<>(); final int CHAIN_SIZE = 540; // the length of the single chain in CHAIN_FILE - VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1"); - VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); + final VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1"); + final VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); // simple deletion // TTT*/T -> AAA*/A turns into left-aligned CAA*/C at position 1 - int start = 537; + int start = CHAIN_SIZE - 3 ; int stop = start + 2; builder.start(start).stop(stop).alleles(CollectionUtil.makeList(RefTTT, T)); result_builder.start(1).stop(3).alleles(CollectionUtil.makeList(RefCAA, C)); @@ -208,15 +210,15 @@ public void testWriteOriginalPosition(boolean shouldWriteOriginalPosition) { result_builder.start(CHAIN_SIZE - stop).stop(CHAIN_SIZE - start).alleles(CollectionUtil.makeList(RefAACG, A)); tests.add(new Object[]{builder.make(), reference, result_builder.make()}); - // "CAAAAAAAAAACG---CGTACTCTCTCTCTACGT".getBytes()); - // "CAAAAAAAAAACGacgCGTACTCTCTCTCTACGT".getBytes()); + // "CAAAAAAAAAACG---CGTACTCTCTCTCTACGT" -- Allele A + // "CAAAAAAAAAACGacgCGTACTCTCTCTCTACGT" -- Allele B - // equivalent to + // is equivalent to - // "CAAAAAAAAA---ACGCGTACTCTCTCTCTACGT".getBytes()); - // "CAAAAAAAAAACGacgCGTACTCTCTCTCTACGT".getBytes()); + // "CAAAAAAAAA---ACGCGTACTCTCTCTCTACGT" -- Allele A + // "CAAAAAAAAAACGacgCGTACTCTCTCTCTACGT" -- Allele B - //non-simple insertion + // non-simple insertion // A(C)*/ACGT(C) -> G(T)*/GACG(T) -> gets moves 3 bases back due to left-aligning... // position of 13 ^ in result // ...and becomes A->AACG at position 10 @@ -227,17 +229,19 @@ public void testWriteOriginalPosition(boolean shouldWriteOriginalPosition) { result_builder.start(10).stop(10).alleles(CollectionUtil.makeList(RefA, AACG)); tests.add(new Object[]{builder.make(), reference, result_builder.make()}); -// // outside of chain (due to shifting of anchor point) + // just outside of chain & contig, testing that we do not read into negative indices + // or reference start = stop = CHAIN_SIZE; - builder.source("test5").stop(stop).start(start).alleles(CollectionUtil.makeList(RefA, ACGT)); - tests.add(new Object[]{builder.make(), reference, null}); + builder.source("test5").stop(stop).start(start).alleles(CollectionUtil.makeList(RefG, GTT)); + result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(RefC, AAC)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); -// // outside of chain + // outside of chain start = stop = CHAIN_SIZE + 1; builder.source("test6").stop(stop).start(start).alleles(CollectionUtil.makeList(RefA, ACGT)); tests.add(new Object[]{builder.make(), reference, null}); -// // MNP + // MNP // GTT*(T)/ACGT(T) -> AAA(C)*/AACG(T) -> which is then normalized to A*/CG at position 11 // pos 11 ^ in the result start = CHAIN_SIZE - 11; @@ -246,7 +250,7 @@ public void testWriteOriginalPosition(boolean shouldWriteOriginalPosition) { result_builder.start(11).stop(11).alleles(CollectionUtil.makeList(RefA, CG)); tests.add(new Object[]{builder.make(), reference, result_builder.make()}); -// // MNP + // MNP // ACGT*(T)/ATT*(T) -> AACG(T)*/AAA(T) -> by normalization CG(T)*/A(T) // pos 13 ^ in the result start = CHAIN_SIZE - 13; @@ -280,38 +284,38 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r @DataProvider(name = "leftAlignAllelesData") public Iterator leftAlignAllelesData() { - ReferenceSequence reference = new ReferenceSequence("chr1", 0, + final ReferenceSequence reference = new ReferenceSequence("chr1", 0, "CAAAAAAAAAACGTACGTACTCTCTCTCTACGT".getBytes()); // 123456789 123456789 123456789 123 - Allele RefG = Allele.create("G", true); - Allele A = Allele.create("A", false); - - Allele RefA = Allele.create("A", true); - Allele RefC = Allele.create("C", true); - Allele RefAA = Allele.create("AA", true); - Allele RefCA = Allele.create("CA", true); - Allele RefCT = Allele.create("CT", true); - Allele RefTC = Allele.create("TC", true); - Allele RefACT = Allele.create("ACT", true); - Allele RefCTCT = Allele.create("CTCT", true); - Allele RefTCTC = Allele.create("TCTC", true); - - Allele AA = Allele.create("AA", false); - Allele C = Allele.create("C", false); - Allele CA = Allele.create("CA", false); - Allele ACT = Allele.create("ACT", false); - Allele CTCT = Allele.create("CTCT", false); - Allele TCTC = Allele.create("TCTC", false); - Allele CT = Allele.create("CT", false); - Allele TC = Allele.create("TC", false); - - Allele T = Allele.create("T", false); + final Allele RefG = Allele.create("G", true); + final Allele A = Allele.create("A", false); + + final Allele RefA = Allele.create("A", true); + final Allele RefC = Allele.create("C", true); + final Allele RefAA = Allele.create("AA", true); + final Allele RefCA = Allele.create("CA", true); + final Allele RefCT = Allele.create("CT", true); + final Allele RefTC = Allele.create("TC", true); + final Allele RefACT = Allele.create("ACT", true); + final Allele RefCTCT = Allele.create("CTCT", true); + final Allele RefTCTC = Allele.create("TCTC", true); + + final Allele AA = Allele.create("AA", false); + final Allele C = Allele.create("C", false); + final Allele CA = Allele.create("CA", false); + final Allele ACT = Allele.create("ACT", false); + final Allele CTCT = Allele.create("CTCT", false); + final Allele TCTC = Allele.create("TCTC", false); + final Allele CT = Allele.create("CT", false); + final Allele TC = Allele.create("TC", false); + + final Allele T = Allele.create("T", false); final List tests = new ArrayList<>(); - VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1"); - VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); + final VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1"); + final VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); // simple SNP // G*/A -> G/A @@ -407,9 +411,9 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r result_builder.start(10).stop(14).alleles("AACGT", "A", "AACG", "AACGTACGT"); builder.start(9).stop(18).alleles("AAACGTACGT", "AAACGT", "AAACGACGT", "AAACGTACGTACGT"); - Collection genotypes = new ArrayList<>(); - Collection results_genotypes = new ArrayList<>(); - GenotypeBuilder gBuilder = new GenotypeBuilder(); + final Collection genotypes = new ArrayList<>(); + final Collection results_genotypes = new ArrayList<>(); + final GenotypeBuilder gBuilder = new GenotypeBuilder(); for (int sample = 1; sample < 4; sample++) { gBuilder.name("sample" + sample) @@ -418,6 +422,7 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r gBuilder.alleles(CollectionUtil.makeList(result_builder.getAlleles().get(0), result_builder.getAlleles().get(sample))); results_genotypes.add(gBuilder.make()); } + gBuilder.name("sample_non_ref_het") .alleles(CollectionUtil.makeList(builder.getAlleles().get(1), builder.getAlleles().get(2))); genotypes.add(gBuilder.make()); @@ -445,38 +450,24 @@ private void assertVcAreEqual(final VariantContext actual, final VariantContext return; } - Assert.assertNotNull(actual); - Assert.assertEquals(actual.getContig(), expected.getContig()); - Assert.assertEquals(actual.getStart(), expected.getStart()); - Assert.assertEquals(actual.getEnd(), expected.getEnd()); + Assert.assertNotNull(actual, "null status"); + Assert.assertEquals(actual.getContig(), expected.getContig(),"Different contigs: "); + Assert.assertEquals(actual.getStart(), expected.getStart(), "Different starts: "); + Assert.assertEquals(actual.getEnd(), expected.getEnd(), "Different ends: "); - expected.getAlleles().sort(new AlleleComparator()); - actual.getAlleles().sort(new AlleleComparator()); - - Assert.assertEquals(expected.getAlleles(), actual.getAlleles()); + Assert.assertTrue(actual.hasSameAllelesAs(expected), "Alleles differ between " + actual + " and " + expected + ": "); assertGenotypeContextsAreEquals(actual.getGenotypes(), expected.getGenotypes()); } - static class AlleleComparator implements Comparator { - - @Override - public int compare(Allele o1, Allele o2) { - int ret = (o1.isReference() ? 1 : 0) - (o2.isReference() ? 1 : 0); - if (ret != 0) return ret; - - return o1.getBaseString().compareTo(o2.getBaseString()); - } - } - private void assertGenotypeContextsAreEquals(final GenotypesContext actual, final GenotypesContext expected) { if (expected == null) { Assert.assertNull(actual); return; } - Assert.assertEquals(expected.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName()); + Assert.assertEquals(actual.getSampleNamesOrderedByName(), expected.getSampleNamesOrderedByName(), "Sample names differ"); - for (String name : expected.getSampleNamesOrderedByName()) { - Assert.assertEquals(expected.get(name).getAlleles(), actual.get(name).getAlleles()); + for (final String name : expected.getSampleNamesOrderedByName()) { + Assert.assertEquals(actual.get(name).getAlleles(), expected.get(name).getAlleles(),"Alleles differ for sample "+ name); } } } diff --git a/testdata/picard/vcf/testLiftoverBiallelicIndels.vcf b/testdata/picard/vcf/testLiftoverBiallelicIndels.vcf index f738efd08..cf058794b 100644 --- a/testdata/picard/vcf/testLiftoverBiallelicIndels.vcf +++ b/testdata/picard/vcf/testLiftoverBiallelicIndels.vcf @@ -1,5 +1,5 @@ ##fileformat=VCFv4.1 #CHROM POS ID REF ALT QUAL FILTER INFO chr1 1 . C CCCCT 15676.17 PASS . -chr1 61 . CA C 724.43 PASS . +chr1 61 . GT G 724.43 PASS . chr1 72 . T A 100 PASS . \ No newline at end of file From cc147027321ef1b0119b0bc486f0bbbad788052c Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 24 Jun 2017 09:58:15 -0400 Subject: [PATCH 3/7] - typo --- src/main/java/picard/vcf/LiftoverVcf.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index 0b6b70b60..5edfd4a26 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -428,10 +428,11 @@ protected static GenotypesContext fixGenotypes(final GenotypesContext originals, } /** - * Normalizes and left alignes a {@link VariantContext}. + * Normalizes and left aligns a {@link VariantContext}. * * Based on Adrian Tan, Gonçalo R. Abecasis and Hyun Min Kang. (2015) * Unified Representation of Genetic Variants. Bioinformatics. + * * @param vc the {@link VariantContext} to be normalized * @param referenceSequence the {@link ReferenceSequence} of the same contig as vc * @return a new {@link VariantContext} which represents the same variation as vc but has From 81849828bcb73602f8cda732c476ffd44c459258 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 24 Jun 2017 16:58:19 -0400 Subject: [PATCH 4/7] - fixed failing test --- src/main/java/picard/vcf/LiftoverVcf.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index 5edfd4a26..582de1464 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -236,7 +236,7 @@ protected int doWork() { if (flippedIndel == null) { rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER_INDEL).make()); } else { - if (tryToAddVariant(flippedIndel,refSeq, reverseComplementAlleleMap,ctx)){ + if (!tryToAddVariant(flippedIndel,refSeq, reverseComplementAlleleMap,ctx)){ failedAlleleCheck++; } } @@ -283,7 +283,7 @@ protected int doWork() { * * @param vc new {@link VariantContext} * @param refSeq {@link ReferenceSequence} of new reference - * @param alleleMap a {@link Map} mapping the old alleles to the new alleles (for fixing the genotypes) + * @param alleleMap a {@link Map} mapping the old alleles to the new alleles (for fixing the genotypes) * @param source the original {@link VariantContext} to use for putting the original location information into vc * @return true if successful, false if failed due to mismatching reference allele. */ @@ -291,16 +291,17 @@ private boolean tryToAddVariant(final VariantContext vc, final ReferenceSequence final VariantContextBuilder builder = new VariantContextBuilder(vc); - if (WRITE_ORIGINAL_POSITION) { - builder.attribute(ORIGINAL_CONTIG, source.getContig()); - builder.attribute(ORIGINAL_START, source.getStart()); - } + builder.genotypes(fixGenotypes(source.getGenotypes(), alleleMap)); builder.filters(source.getFilters()); builder.log10PError(source.getLog10PError()); builder.attributes(source.getAttributes()); builder.id(source.getID()); + if (WRITE_ORIGINAL_POSITION) { + builder.attribute(ORIGINAL_CONTIG, source.getContig()); + builder.attribute(ORIGINAL_START, source.getStart()); + } // Check that the reference allele still agrees with the reference sequence boolean mismatchesReference = false; From 768f174ba11bec85f394f03c194d792127d9aade Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Mon, 26 Jun 2017 17:22:47 -0400 Subject: [PATCH 5/7] - adding tests of variants that do not pass for various reasons. --- src/main/java/picard/vcf/LiftoverVcf.java | 6 +++--- src/test/java/picard/vcf/LiftoverVcfTest.java | 1 + testdata/picard/vcf/testLiftoverFailingVariants.vcf | 5 +++++ 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 testdata/picard/vcf/testLiftoverFailingVariants.vcf diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index 582de1464..cc0de5d01 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -234,15 +234,15 @@ protected int doWork() { final VariantContext flippedIndel = flipIndel(ctx, liftOver, refSeq); if (flippedIndel == null) { - rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER_INDEL).make()); + throw new IllegalArgumentException("Unexpectedly found null VC. This should have not happened."); } else { - if (!tryToAddVariant(flippedIndel,refSeq, reverseComplementAlleleMap,ctx)){ + if (!tryToAddVariant(flippedIndel, refSeq, reverseComplementAlleleMap, ctx)){ failedAlleleCheck++; } } } else { refSeq = refSeqs.get(target.getContig()); - final VariantContext liftedVariant=liftSnp(ctx,target,refSeq); + final VariantContext liftedVariant = liftSnp(ctx, target, refSeq); if (!tryToAddVariant(liftedVariant, refSeq, reverseComplementAlleleMap, ctx)) { failedAlleleCheck++; diff --git a/src/test/java/picard/vcf/LiftoverVcfTest.java b/src/test/java/picard/vcf/LiftoverVcfTest.java index 999aad48e..526f1f82d 100644 --- a/src/test/java/picard/vcf/LiftoverVcfTest.java +++ b/src/test/java/picard/vcf/LiftoverVcfTest.java @@ -44,6 +44,7 @@ public void teardown() { return new Object[][]{ {"testLiftoverBiallelicIndels.vcf", 3, 0}, {"testLiftoverMultiallelicIndels.vcf", 0, 2}, + {"testLiftoverFailingVariants.vcf", 0, 3}, }; } diff --git a/testdata/picard/vcf/testLiftoverFailingVariants.vcf b/testdata/picard/vcf/testLiftoverFailingVariants.vcf new file mode 100644 index 000000000..d128062a2 --- /dev/null +++ b/testdata/picard/vcf/testLiftoverFailingVariants.vcf @@ -0,0 +1,5 @@ +##fileformat=VCFv4.1 +#CHROM POS ID REF ALT QUAL FILTER INFO +chr1 1 . C T 100 PASS . +chr1 61 . A T 100 PASS . +chr1 71 . CA C 100 PASS . From 5703a0ada35f45aa87c157f6f0c225d5a57559c0 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 28 Jun 2017 19:33:50 -0400 Subject: [PATCH 6/7] - need another test (see TODO) --- src/main/java/picard/vcf/LiftoverVcf.java | 8 +++++--- src/test/java/picard/vcf/LiftoverVcfTest.java | 4 ++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index cc0de5d01..60d8c73ce 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -478,7 +478,7 @@ protected static VariantContext leftAlignVariant(final VariantContext vc, final for (final Allele allele : alleleBasesMap.keySet()) { // the first -1 for zero-base (getBases) versus 1-based (variant position) // another -1 to get the base prior to the location of the start of the allele - final byte extraBase = start > 1 ? + final byte extraBase = (start > 1) ? referenceSequence.getBases()[start - 2] : referenceSequence.getBases()[end]; @@ -520,10 +520,12 @@ protected static VariantContext leftAlignVariant(final VariantContext vc, final } private static byte[] truncateBase(final byte[] allele, final boolean truncateRightmost) { - return Arrays.copyOfRange(allele, truncateRightmost ? 0 : 1, truncateRightmost ? allele.length - 1 : allele.length); + return Arrays.copyOfRange(allele, truncateRightmost ? 0 : 1, truncateRightmost ? + allele.length - 1 : + allele.length); } - private static byte[] extendOneBase(final byte[] bases, final byte base, boolean toTheLeft) { + private static byte[] extendOneBase(final byte[] bases, final byte base, final boolean toTheLeft) { final byte[] newBases = new byte[bases.length + 1]; diff --git a/src/test/java/picard/vcf/LiftoverVcfTest.java b/src/test/java/picard/vcf/LiftoverVcfTest.java index 526f1f82d..829f9d2d3 100644 --- a/src/test/java/picard/vcf/LiftoverVcfTest.java +++ b/src/test/java/picard/vcf/LiftoverVcfTest.java @@ -355,6 +355,10 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r tests.add(new Object[]{builder.make(), reference, result_builder.make()}); } + // TODO: add a test that converts the initial C to a AC which will require + // TODO: code in LiftOver::extendOneBase(., false) which is currently not covered. + + //CT/CTCT -> A/ACT in CT repeat region result_builder.start(19).stop(19).alleles(CollectionUtil.makeList(RefA, ACT)); for (start = 20; start <= 27; start += 2) { From 33caee9ae61619d112ad9ce23943faee40d9ec8a Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 30 Jun 2017 21:24:02 -0400 Subject: [PATCH 7/7] -- moar tests to be sure we covered everything --- src/main/java/picard/vcf/LiftoverVcf.java | 14 +++--- src/test/java/picard/vcf/LiftoverVcfTest.java | 61 +++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/src/main/java/picard/vcf/LiftoverVcf.java b/src/main/java/picard/vcf/LiftoverVcf.java index 60d8c73ce..4c9bb03c6 100644 --- a/src/main/java/picard/vcf/LiftoverVcf.java +++ b/src/main/java/picard/vcf/LiftoverVcf.java @@ -482,7 +482,7 @@ protected static VariantContext leftAlignVariant(final VariantContext vc, final referenceSequence.getBases()[start - 2] : referenceSequence.getBases()[end]; - alleleBasesMap.put(allele, extendOneBase(alleleBasesMap.get(allele), extraBase, start > 1)); + alleleBasesMap.put(allele, extendOneBase(alleleBasesMap.get(allele), extraBase)); } changesInAlleles = true; start--; @@ -525,17 +525,13 @@ protected static VariantContext leftAlignVariant(final VariantContext vc, final allele.length); } - private static byte[] extendOneBase(final byte[] bases, final byte base, final boolean toTheLeft) { + //creates a new byte array with the base added at the begining + private static byte[] extendOneBase(final byte[] bases, final byte base) { final byte[] newBases = new byte[bases.length + 1]; - if (toTheLeft) { - System.arraycopy(bases, 0, newBases, 1, bases.length); - newBases[0] = base; - } else { - System.arraycopy(bases, 0, newBases, 0, bases.length); - newBases[bases.length] = base; - } + System.arraycopy(bases, 0, newBases, 1, bases.length); + newBases[0] = base; return newBases; } diff --git a/src/test/java/picard/vcf/LiftoverVcfTest.java b/src/test/java/picard/vcf/LiftoverVcfTest.java index 829f9d2d3..acabe826a 100644 --- a/src/test/java/picard/vcf/LiftoverVcfTest.java +++ b/src/test/java/picard/vcf/LiftoverVcfTest.java @@ -169,6 +169,8 @@ public void testWriteOriginalPosition(final boolean shouldWriteOriginalPosition) final Allele T = Allele.create("T", false); final Allele C = Allele.create("C", false); final Allele CG = Allele.create("CG", false); + final Allele GA = Allele.create("GA", false); + final Allele TC = Allele.create("TC", false); final Allele CAA = Allele.create("CAA", false); final Allele ACT = Allele.create("ACT", false); final Allele TTT = Allele.create("TTT", false); @@ -269,6 +271,46 @@ public void testWriteOriginalPosition(final boolean shouldWriteOriginalPosition) result_builder.start(19).stop(19).alleles(CollectionUtil.makeList(RefA, ACT)); tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + // insertion at end of section + // a test that converts the initial C to a AC which requires + // code in LiftOver::extendOneBase(., false) + // + // G*(.)/GA(.) -> .(C)/.T(C) but that's not legal, so + // -> C/TC + start = CHAIN_SIZE; + stop = CHAIN_SIZE; + + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(RefG, GA)); + result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(RefC, TC)); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + // insertion at end of section + // a test that converts the initial C to a AC which requires + // code in LiftOver::extendOneBase(., false) + // + // G*(.)/GG(.) -> .(C)/.C(C) but that's not legal, so + // -> C/CC + start = CHAIN_SIZE; + stop = CHAIN_SIZE; + + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(RefG, Allele.create("GG"))); + result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(RefC, Allele.create("CC"))); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + // insertion at end of section + // a test that converts the initial C to a AC which requires + // code in LiftOver::extendOneBase(., false) + // + // improperly aligned + // TTTT(G)->TTTTG(G) -> C/CC + start = CHAIN_SIZE - 4; + stop = CHAIN_SIZE - 1; + + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(Allele.create("TTTT",true), Allele.create("TTTTG"))); + result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(RefC, Allele.create("CC"))); + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + return tests.iterator(); } @@ -318,14 +360,27 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r final VariantContextBuilder builder = new VariantContextBuilder().source("test1").chr("chr1"); final VariantContextBuilder result_builder = new VariantContextBuilder().source("test1").chr("chr1"); + // left aligning at the edge of the reference + // simple SNP + // CAAA*/CCAAA -> C->CC + int start = 1; + int stop = 4; + builder.start(start).stop(stop).alleles(CollectionUtil.makeList(Allele.create("CAAA",true), Allele.create("CCAAA") )); + + result_builder.start(1).stop(1).alleles(CollectionUtil.makeList(RefC, Allele.create("CC"))); + + tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + + // simple SNP // G*/A -> G/A - int start = 13; - int stop = start; + start = 13; + stop = start; builder.start(start).stop(stop).alleles(CollectionUtil.makeList(RefG, A)); result_builder.start(stop).stop(start).alleles(CollectionUtil.makeList(RefG, A)); tests.add(new Object[]{builder.make(), reference, result_builder.make()}); + for (start = 1; start <= reference.getBases().length; start++) { builder.start(start).stop(start); builder.alleles(CollectionUtil.makeList( @@ -355,8 +410,6 @@ public void testFlipIndel(final VariantContext source, final ReferenceSequence r tests.add(new Object[]{builder.make(), reference, result_builder.make()}); } - // TODO: add a test that converts the initial C to a AC which will require - // TODO: code in LiftOver::extendOneBase(., false) which is currently not covered. //CT/CTCT -> A/ACT in CT repeat region