From d3d69e0dafda9a2699cdba1e78acc49c2ae6cf75 Mon Sep 17 00:00:00 2001 From: Nils Homer Date: Mon, 10 Apr 2017 14:26:09 -0700 Subject: [PATCH] Adding custom adapter pairs to IlluminaBasecallsToSam --- .../picard/illumina/ClusterDataToSamConverter.java | 2 +- .../java/picard/illumina/CustomAdapterPair.java | 63 ++++++++++++++++++++++ .../picard/illumina/IlluminaBasecallsToFastq.java | 13 ++--- .../picard/illumina/IlluminaBasecallsToSam.java | 20 ++++++- .../java/picard/illumina/MarkIlluminaAdapters.java | 36 +------------ .../IlluminaBasecallsToSamAdapterClippingTest.java | 53 ++++++++++++++---- 6 files changed, 134 insertions(+), 53 deletions(-) create mode 100644 src/main/java/picard/illumina/CustomAdapterPair.java diff --git a/src/main/java/picard/illumina/ClusterDataToSamConverter.java b/src/main/java/picard/illumina/ClusterDataToSamConverter.java index fc3517860..7e7910738 100644 --- a/src/main/java/picard/illumina/ClusterDataToSamConverter.java +++ b/src/main/java/picard/illumina/ClusterDataToSamConverter.java @@ -89,7 +89,7 @@ public ClusterDataToSamConverter(final String runBarcode, final String readGroupId, final ReadStructure readStructure, - final List adapters) { + final List adapters) { this.readGroupId = readGroupId; this.readNameEncoder = new IlluminaReadNameEncoder(runBarcode); diff --git a/src/main/java/picard/illumina/CustomAdapterPair.java b/src/main/java/picard/illumina/CustomAdapterPair.java new file mode 100644 index 000000000..ac3e6cb78 --- /dev/null +++ b/src/main/java/picard/illumina/CustomAdapterPair.java @@ -0,0 +1,63 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package picard.illumina; + +import htsjdk.samtools.util.SequenceUtil; +import htsjdk.samtools.util.StringUtil; +import picard.util.AdapterPair; + +class CustomAdapterPair implements AdapterPair { + + private final String fivePrime, threePrime, fivePrimeReadOrder; + private final byte[] fivePrimeBytes, threePrimeBytes, fivePrimeReadOrderBytes; + + CustomAdapterPair(final String fivePrime, final String threePrime) { + this.threePrime = threePrime; + this.threePrimeBytes = StringUtil.stringToBytes(threePrime); + + this.fivePrime = fivePrime; + this.fivePrimeReadOrder = SequenceUtil.reverseComplement(fivePrime); + this.fivePrimeBytes = StringUtil.stringToBytes(fivePrime); + this.fivePrimeReadOrderBytes = StringUtil.stringToBytes(fivePrimeReadOrder); + } + + public String get3PrimeAdapter() { return threePrime; } + + public String get5PrimeAdapter() { return fivePrime; } + + public String get3PrimeAdapterInReadOrder() { return threePrime; } + + public String get5PrimeAdapterInReadOrder() { return fivePrimeReadOrder; } + + public byte[] get3PrimeAdapterBytes() { return threePrimeBytes; } + + public byte[] get5PrimeAdapterBytes() { return fivePrimeBytes; } + + public byte[] get3PrimeAdapterBytesInReadOrder() { return threePrimeBytes; } + + public byte[] get5PrimeAdapterBytesInReadOrder() { return fivePrimeReadOrderBytes; } + + public String getName() { return "Custom adapter pair"; } +} \ No newline at end of file diff --git a/src/main/java/picard/illumina/IlluminaBasecallsToFastq.java b/src/main/java/picard/illumina/IlluminaBasecallsToFastq.java index 87a00795c..43ade8161 100644 --- a/src/main/java/picard/illumina/IlluminaBasecallsToFastq.java +++ b/src/main/java/picard/illumina/IlluminaBasecallsToFastq.java @@ -150,12 +150,9 @@ mutex = {"OUTPUT_PREFIX"}) public File MULTIPLEX_PARAMS; - @Option(doc = "Which adapters to look for in the read.") - public List ADAPTERS_TO_CHECK = new ArrayList<>( - Arrays.asList(IlluminaUtil.IlluminaAdapterPair.INDEXED, - IlluminaUtil.IlluminaAdapterPair.DUAL_INDEXED, - IlluminaUtil.IlluminaAdapterPair.NEXTERA_V2, - IlluminaUtil.IlluminaAdapterPair.FLUIDIGM)); + @Deprecated + @Option(doc = "Deprecated (No longer used). Which adapters to look for in the read.") + public List ADAPTERS_TO_CHECK = null; @Option(doc = "The number of threads to run in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to " + "the number of cores available on the machine. If NUM_PROCESSORS < 0, then the number of cores used will" + @@ -233,6 +230,10 @@ protected int doWork() { if (READ_NAME_FORMAT == ReadNameFormat.CASAVA_1_8 && FLOWCELL_BARCODE == null) { errors.add("FLOWCELL_BARCODE is required when using Casava1.8-style read name headers."); } + + if (ADAPTERS_TO_CHECK != null) { + log.warn("ADAPTERS_TO_CHECK is not used"); + } if (errors.isEmpty()) { return null; diff --git a/src/main/java/picard/illumina/IlluminaBasecallsToSam.java b/src/main/java/picard/illumina/IlluminaBasecallsToSam.java index 54aa196a8..f31c5ecac 100644 --- a/src/main/java/picard/illumina/IlluminaBasecallsToSam.java +++ b/src/main/java/picard/illumina/IlluminaBasecallsToSam.java @@ -45,6 +45,7 @@ import picard.cmdline.StandardOptionDefinitions; import picard.illumina.parser.ReadStructure; import picard.illumina.parser.readers.BclQualityEvaluationStrategy; +import picard.util.AdapterPair; import picard.util.IlluminaUtil; import picard.util.IlluminaUtil.IlluminaAdapterPair; import picard.util.TabbedTextFileWithHeaderParser; @@ -201,6 +202,12 @@ IlluminaAdapterPair.NEXTERA_V2, IlluminaAdapterPair.FLUIDIGM)); + @Option(doc = "For specifying adapters other than standard Illumina", optional = true) + public String FIVE_PRIME_ADAPTER; + + @Option(doc = "For specifying adapters other than standard Illumina", optional = true) + public String THREE_PRIME_ADAPTER; + @Option(doc = "The number of threads to run in parallel. If NUM_PROCESSORS = 0, number of cores is automatically set to " + "the number of cores available on the machine. If NUM_PROCESSORS < 0, then the number of cores used will" + " be the number available on the machine less NUM_PROCESSORS.") @@ -289,12 +296,19 @@ private void initialize() { log.info("DONE_READING STRUCTURE IS " + readStructure.toString()); + // Combine any adapters and custom adapter pairs from the command line into an array for use in clipping + final List adapters = new ArrayList<>(); + adapters.addAll(ADAPTERS_TO_CHECK); + if (FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER != null) { + adapters.add(new CustomAdapterPair(FIVE_PRIME_ADAPTER, THREE_PRIME_ADAPTER)); + } + /** * Be sure to pass the outputReadStructure to ClusterDataToSamConverter, which reflects the structure of the output cluster * data which may be different from the input read structure (specifically if there are skips). */ final ClusterDataToSamConverter converter = new ClusterDataToSamConverter(RUN_BARCODE, READ_GROUP_ID, - basecallsConverter.getFactory().getOutputReadStructure(), ADAPTERS_TO_CHECK) + basecallsConverter.getFactory().getOutputReadStructure(), adapters) .withMolecularIndexTag(MOLECULAR_INDEX_TAG) .withMolecularIndexQualityTag(MOLECULAR_INDEX_BASE_QUALITY_TAG) .withTagPerMolecularIndex(TAG_PER_MOLECULAR_INDEX); @@ -492,6 +506,10 @@ public static void main(final String[] args) { messages.add("The number of tags given in TAG_PER_MOLECULAR_INDEX does not match the number of molecular indexes in READ_STRUCTURE"); } + if ((FIVE_PRIME_ADAPTER == null) != (THREE_PRIME_ADAPTER == null)) { + messages.add("THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must either both be null or both be set."); + } + if (messages.isEmpty()) { return null; } diff --git a/src/main/java/picard/illumina/MarkIlluminaAdapters.java b/src/main/java/picard/illumina/MarkIlluminaAdapters.java index 993c5feb8..269a34d5c 100644 --- a/src/main/java/picard/illumina/MarkIlluminaAdapters.java +++ b/src/main/java/picard/illumina/MarkIlluminaAdapters.java @@ -146,7 +146,7 @@ public static void main(final String[] args) { @Override protected String[] customCommandLineValidation() { if ((FIVE_PRIME_ADAPTER != null && THREE_PRIME_ADAPTER == null) || (THREE_PRIME_ADAPTER != null && FIVE_PRIME_ADAPTER == null)) { - return new String[]{"Either both or neither of THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must be set."}; + return new String[]{"THREE_PRIME_ADAPTER and FIVE_PRIME_ADAPTER must either both be null or both be set."}; } else { return null; } @@ -251,38 +251,4 @@ protected int doWork() { CloserUtil.close(in); return 0; } - - private final class CustomAdapterPair implements AdapterPair { - - final String fivePrime, threePrime, fivePrimeReadOrder; - final byte[] fivePrimeBytes, threePrimeBytes, fivePrimeReadOrderBytes; - - private CustomAdapterPair(final String fivePrime, final String threePrime) { - this.threePrime = threePrime; - this.threePrimeBytes = StringUtil.stringToBytes(threePrime); - - this.fivePrime = fivePrime; - this.fivePrimeReadOrder = SequenceUtil.reverseComplement(fivePrime); - this.fivePrimeBytes = StringUtil.stringToBytes(fivePrime); - this.fivePrimeReadOrderBytes = StringUtil.stringToBytes(fivePrimeReadOrder); - } - - public String get3PrimeAdapter() { return threePrime; } - - public String get5PrimeAdapter() { return fivePrime; } - - public String get3PrimeAdapterInReadOrder() { return threePrime; } - - public String get5PrimeAdapterInReadOrder() { return fivePrimeReadOrder; } - - public byte[] get3PrimeAdapterBytes() { return threePrimeBytes; } - - public byte[] get5PrimeAdapterBytes() { return fivePrimeBytes; } - - public byte[] get3PrimeAdapterBytesInReadOrder() { return threePrimeBytes; } - - public byte[] get5PrimeAdapterBytesInReadOrder() { return fivePrimeReadOrderBytes; } - - public String getName() { return "Custom adapter pair"; } - } } diff --git a/src/test/java/picard/illumina/IlluminaBasecallsToSamAdapterClippingTest.java b/src/test/java/picard/illumina/IlluminaBasecallsToSamAdapterClippingTest.java index 0e79419d9..57da4a2b6 100644 --- a/src/test/java/picard/illumina/IlluminaBasecallsToSamAdapterClippingTest.java +++ b/src/test/java/picard/illumina/IlluminaBasecallsToSamAdapterClippingTest.java @@ -27,12 +27,16 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.util.CollectionUtil; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import picard.cmdline.CommandLineProgramTest; import java.io.File; +import java.util.List; + +import static picard.util.IlluminaUtil.IlluminaAdapterPair.*; /** @@ -52,22 +56,28 @@ public String getCommandLineProgramName() { * Run IlluminaBasecallsToSam on a few test cases, and verify that results agree with hand-checked expectation. */ @Test(dataProvider="data") - public void testBasic(final String LANE, final String readStructure) throws Exception { + public void testBasic(final String LANE, final String readStructure, + final String fivePrimerAdapter, final String threePrimerAdapter, + final int expectedNumClippedRecords) throws Exception { // Create the SAM file from Gerald output final File samFile = File.createTempFile("." + LANE + ".illuminaBasecallsToSam", ".sam"); samFile.deleteOnExit(); - final String[] illuminaArgv = { - "BASECALLS_DIR=" + TEST_DATA_DIR, + final List illuminaArgv = CollectionUtil.makeList("BASECALLS_DIR=" + TEST_DATA_DIR, "LANE=" + LANE, "RUN_BARCODE=" + RUN_BARCODE, "READ_STRUCTURE=" + readStructure, "OUTPUT=" + samFile, "ALIAS=" + ALIAS - }; + ); + if (fivePrimerAdapter != null) { + illuminaArgv.addAll(CollectionUtil.makeList( + "ADAPTERS_TO_CHECK=null", + "FIVE_PRIME_ADAPTER=" + fivePrimerAdapter, + "THREE_PRIME_ADAPTER=" + threePrimerAdapter + )); + } Assert.assertEquals(runPicardCommandLine(illuminaArgv), 0); - System.out.println ("Ouput Sam file is in " + samFile.getAbsolutePath()); - // Read the file and confirm it contains what is expected final SamReader samReader = SamReaderFactory.makeDefault().open(samFile); @@ -83,6 +93,10 @@ public void testBasic(final String LANE, final String readStructure) throws Exce } } } + + // Check the total number of clipped records + Assert.assertEquals(count, expectedNumClippedRecords); + samReader.close(); samFile.delete(); } @@ -90,10 +104,29 @@ public void testBasic(final String LANE, final String readStructure) throws Exce @DataProvider(name="data") private Object[][] getIlluminaBasecallsToSamTestData(){ return new Object[][] { - {"1", "125T125T"}, - {"2", "125T125T"}, - }; - } + // Use the default set of adapters + {"1", "125T125T", null, null, 32}, + {"2", "125T125T", null, null, 108}, + + // Use adapters that don't match + {"1", "125T125T", "ACGTACGTACGTACGT", "ACGTACGTACGTACGT", 0}, + {"2", "125T125T", "ACGTACGTACGTACGT", "ACGTACGTACGTACGT", 0}, + + // Add just the "nextera v2" adapters, which should not match + {"1", "125T125T", NEXTERA_V2.get5PrimeAdapter(), NEXTERA_V2.get3PrimeAdapter(), 0}, + {"2", "125T125T", NEXTERA_V2.get5PrimeAdapter(), NEXTERA_V2.get3PrimeAdapter(), 0}, + // Add just the "fludigm" adapters, which should not match + {"1", "125T125T", FLUIDIGM.get5PrimeAdapter(), FLUIDIGM.get3PrimeAdapter(), 0}, + {"2", "125T125T", FLUIDIGM.get5PrimeAdapter(), FLUIDIGM.get3PrimeAdapter(), 0}, + // Add just the "dual indexed" adapters, which should match + {"1", "125T125T", DUAL_INDEXED.get5PrimeAdapter(), DUAL_INDEXED.get3PrimeAdapter(), 32}, + {"2", "125T125T", DUAL_INDEXED.get5PrimeAdapter(), DUAL_INDEXED.get3PrimeAdapter(), 108}, + + // Add just the "indexed" adapters, which should match + {"1", "125T125T", INDEXED.get5PrimeAdapter(), INDEXED.get3PrimeAdapter(), 32}, + {"2", "125T125T", INDEXED.get5PrimeAdapter(), INDEXED.get3PrimeAdapter(), 108} + }; + } } \ No newline at end of file