diff --git a/src/main/java/picard/illumina/IlluminaBasecallsToSam.java b/src/main/java/picard/illumina/IlluminaBasecallsToSam.java index 2d2dcf16c..5ad0d5a89 100644 --- a/src/main/java/picard/illumina/IlluminaBasecallsToSam.java +++ b/src/main/java/picard/illumina/IlluminaBasecallsToSam.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2011 The Broad Institute + * Copyright (c) 2011-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -44,7 +44,6 @@ import picard.cmdline.programgroups.Illumina; import picard.cmdline.StandardOptionDefinitions; import picard.illumina.parser.ReadStructure; -import picard.illumina.parser.ReadType; import picard.illumina.parser.readers.BclQualityEvaluationStrategy; import picard.util.IlluminaUtil; import picard.util.IlluminaUtil.IlluminaAdapterPair; @@ -55,7 +54,6 @@ import java.io.OutputStream; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.Comparator; import java.util.Date; import java.util.HashMap; @@ -67,11 +65,11 @@ /** * IlluminaBasecallsToSam transforms a lane of Illumina data file formats (bcl, locs, clocs, qseqs, etc.) into - * SAM or BAM file format. + * SAM, BAM or CRAM file format. *

* In this application, barcode data is read from Illumina data file groups, each of which is associated with a tile. * Each tile may contain data for any number of barcodes, and a single barcode's data may span multiple tiles. Once the - * barcode data is collected from files, each barcode's data is written to its own SAM/BAM. The barcode data must be + * barcode data is collected from files, each barcode's data is written to its own SAM/BAM/CRAM. The barcode data must be * written in order; this means that barcode data from each tile is sorted before it is written to file, and that if a * barcode's data does span multiple tiles, data collected from each tile must be written in the order of the tiles * themselves. @@ -100,9 +98,9 @@ programGroup = Illumina.class ) public class IlluminaBasecallsToSam extends CommandLineProgram { - static final String USAGE_SUMMARY = "Transforms raw Illumina sequencing data into an unmapped SAM or BAM file." ; + static final String USAGE_SUMMARY = "Transforms raw Illumina sequencing data into an unmapped SAM, BAM or CRAM file." ; static final String USAGE_DETAILS = "

The IlluminaBaseCallsToSam program collects, demultiplexes, and sorts reads across all " + - "of the tiles of a lane via barcode to produce an unmapped SAM/BAM file. An unmapped BAM file is often referred to as a uBAM. " + + "of the tiles of a lane via barcode to produce an unmapped SAM, BAM or CRAM file. An unmapped BAM file is often referred to as a uBAM. " + "All barcode, sample, and library data is provided in the LIBRARY_PARAMS file. Note, this LIBRARY_PARAMS file " + "should be formatted according to the specifications indicated below. The following is an example of a properly" + " formmated LIBRARY_PARAMS file:

" + @@ -133,7 +131,7 @@ // The following attributes define the command-line arguments - public static final String USAGE = "Generate a SAM or BAM file from data in an Illumina basecalls output directory"; + public static final String USAGE = "Generate a SAM, BAM or CRAM file from data in an Illumina basecalls output directory"; @Option(doc = "The basecalls directory. ", shortName = "B") public File BASECALLS_DIR; @@ -144,7 +142,7 @@ @Option(doc = "Lane number. ", shortName = StandardOptionDefinitions.LANE_SHORT_NAME) public Integer LANE; - @Option(doc = "Deprecated (use LIBRARY_PARAMS). The output SAM or BAM file. Format is determined by extension.", + @Option(doc = "Deprecated (use LIBRARY_PARAMS). The output SAM, BAM or CRAM file. Format is determined by extension.", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, mutex = {"BARCODE_PARAMS", "LIBRARY_PARAMS"}) public File OUTPUT; @@ -181,18 +179,18 @@ @Option(doc = ReadStructure.PARAMETER_DOC, shortName = "RS") public String READ_STRUCTURE; - @Option(doc = "Deprecated (use LIBRARY_PARAMS). Tab-separated file for creating all output BAMs for barcoded run " + + @Option(doc = "Deprecated (use LIBRARY_PARAMS). Tab-separated file for creating all output SAM, BAM or CRAM files for barcoded run " + "with single IlluminaBasecallsToSam invocation. Columns are BARCODE, OUTPUT, SAMPLE_ALIAS, and " + "LIBRARY_NAME. Row with BARCODE=N is used to specify a file for no barcode match", mutex = {"OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME", "LIBRARY_PARAMS"}) public File BARCODE_PARAMS; - @Option(doc = "Tab-separated file for creating all output BAMs for a lane with single IlluminaBasecallsToSam " + + @Option(doc = "Tab-separated file for creating all output SAM, BAM or CRAM files for a lane with single IlluminaBasecallsToSam " + "invocation. The columns are OUTPUT, SAMPLE_ALIAS, and LIBRARY_NAME, BARCODE_1, BARCODE_2 ... BARCODE_X " + "where X = number of barcodes per cluster (optional). 
Row with BARCODE_1 set to 'N' is used to specify a file " + "for no barcode match. You may also provide any 2 letter RG header attributes (excluding PU, CN, PL, and" + " DT) as columns in this file and the values for those columns will be inserted into the RG tag for the" + - " BAM file created for a given row.", + " SAM, BAM or CRAM file created for a given row.", mutex = {"OUTPUT", "SAMPLE_ALIAS", "LIBRARY_NAME", "BARCODE_PARAMS"}) public File LIBRARY_PARAMS; @@ -236,7 +234,7 @@ public boolean INCLUDE_NON_PF_READS = true; @Option(doc="Whether to ignore reads whose barcodes are not found in LIBRARY_PARAMS. Useful when outputting " + - "BAMs for only a subset of the barcodes in a lane.", shortName="INGORE_UNEXPECTED") + "SAM, BAM or CRAM files for only a subset of the barcodes in a lane.", shortName="INGORE_UNEXPECTED") public boolean IGNORE_UNEXPECTED_BARCODES = false; @Option(doc="The tag to use to store any molecular indexes. If more than one molecular index is found, they will be concatenated and stored here.", optional=true) @@ -457,7 +455,7 @@ private SAMFileWriterWrapper buildSamFileWriter(final File output, final String header.setSortOrder(SAMFileHeader.SortOrder.queryname); header.addReadGroup(rg); - return new SAMFileWriterWrapper(new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, output)); + return new SAMFileWriterWrapper(new SAMFileWriterFactory().makeWriter(header, true, output, REFERENCE_SEQUENCE)); } public static void main(final String[] args) { diff --git a/src/main/java/picard/illumina/MarkIlluminaAdapters.java b/src/main/java/picard/illumina/MarkIlluminaAdapters.java index 993c5feb8..08eb94ac1 100644 --- a/src/main/java/picard/illumina/MarkIlluminaAdapters.java +++ b/src/main/java/picard/illumina/MarkIlluminaAdapters.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of 
this software and associated documentation files (the "Software"), to deal @@ -71,9 +71,9 @@ ) public class MarkIlluminaAdapters extends CommandLineProgram { - static final String USAGE_SUMMARY = "Reads a SAM or BAM file and rewrites it with new adapter-trimming tags. "; + static final String USAGE_SUMMARY = "Reads a SAM, BAM or CRAM file and rewrites it with new adapter-trimming tags. "; static final String USAGE_DETAILS = "

This tool clears any existing adapter-trimming tags (XT:i:) in the optional tag region of " + - "a SAM file. The SAM/BAM file must be sorted by query name.

"+ + "a SAM file. The SAM/BAM/CRAM file must be sorted by query name.

"+ "

Outputs a metrics file histogram showing counts of bases_clipped per read." + "" + "

Usage example:

" + @@ -162,7 +162,7 @@ protected int doWork() { SAMFileWriter out = null; if (OUTPUT != null) { IOUtil.assertFileIsWritable(OUTPUT); - out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT); + out = new SAMFileWriterFactory().makeWriter(in.getFileHeader(), true, OUTPUT, REFERENCE_SEQUENCE); } final Histogram histo = new Histogram("clipped_bases", "read_count"); @@ -199,7 +199,7 @@ protected int doWork() { if (rec.getReadPairedFlag()) { // Assert that the input file is in query name order only if we see some PE reads if (order != SAMFileHeader.SortOrder.queryname) { - throw new PicardException("Input BAM file must be sorted by queryname"); + throw new PicardException("Input file must be sorted by queryname"); } if (rec2 == null) throw new PicardException("Missing mate pair for paired read: " + rec.getReadName()); diff --git a/src/main/java/picard/sam/AbstractAlignmentMerger.java b/src/main/java/picard/sam/AbstractAlignmentMerger.java index ba690b74c..a4a0cd50d 100644 --- a/src/main/java/picard/sam/AbstractAlignmentMerger.java +++ b/src/main/java/picard/sam/AbstractAlignmentMerger.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -67,7 +67,7 @@ *

* The order of processing is as follows: *

- * 1. Get records from the unmapped bam and the alignment data + * 1. Get records from the unmapped SAM/BAM/CRAM and the alignment data * 2. Merge the alignment information and public tags ONLY from the aligned SAMRecords * 3. Do additional modifications -- handle clipping, trimming, etc. * 4. Fix up mate information on paired reads @@ -316,7 +316,7 @@ public void mergeAlignment(final File referenceFasta) { else { // catches queryname and unsorted final SAMFileHeader header = this.header.clone(); header.setSortOrder(this.sortOrder); - final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, this.targetBamFile); + final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, true, this.targetBamFile, referenceFasta); writer.setProgressLogger(new ProgressLogger(log, (int) 1e7, "Wrote", "records to output in queryname order")); sink = new Sink(writer); } @@ -469,7 +469,7 @@ public void mergeAlignment(final File referenceFasta) { // Write the records to the output file in specified sorted order, if (this.sortOrder == SortOrder.coordinate) { header.setSortOrder(this.sortOrder); - final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, this.targetBamFile); + final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, true, this.targetBamFile, referenceFasta); writer.setProgressLogger(new ProgressLogger(log, (int) 1e7, "Wrote", "records from a sorting collection")); final ProgressLogger finalProgress = new ProgressLogger(log, 10000000, "Written in coordinate order to output", "records"); diff --git a/src/main/java/picard/sam/AddOrReplaceReadGroups.java b/src/main/java/picard/sam/AddOrReplaceReadGroups.java index 539e89ea5..ad8c4215a 100644 --- a/src/main/java/picard/sam/AddOrReplaceReadGroups.java +++ b/src/main/java/picard/sam/AddOrReplaceReadGroups.java @@ -25,7 +25,7 @@ import java.util.Arrays; /** - * Replaces read groups in a BAM file + * Replaces read groups in a SAM, BAM 
or CRAM file * * @author mdepristo */ @@ -35,12 +35,12 @@ programGroup = SamOrBam.class ) public class AddOrReplaceReadGroups extends CommandLineProgram { - static final String USAGE_SUMMARY = "Replace read groups in a BAM file."; + static final String USAGE_SUMMARY = "Replace read groups in a SAM, BAM or CRAM file."; static final String USAGE_DETAILS = "This tool enables the user to replace all read groups in the INPUT file with a single new read " + - "group and assign all reads to this read group in the OUTPUT BAM file.

" + + "group and assign all reads to this read group in the OUTPUT SAM, BAM or CRAM file.

" + "For more information about read groups, see the " + "GATK Dictionary entry.

" + - "This tool accepts INPUT BAM and SAM files or URLs from the Global Alliance for Genomics and Health (GA4GH) (see http://ga4gh.org/#/documentation)." + + "This tool accepts INPUT SAM, BAM or CRAM files or URLs from the Global Alliance for Genomics and Health (GA4GH) (see http://ga4gh.org/#/documentation)." + "

Usage example:

" + "
" +
             "java -jar picard.jar AddOrReplaceReadGroups \\
" + @@ -53,10 +53,10 @@ " RGSM=20" + "
" + "
" ; - @Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="Input file (BAM or SAM or a GA4GH url).") + @Option(shortName= StandardOptionDefinitions.INPUT_SHORT_NAME, doc="Input file (SAM, BAM or CRAM or a GA4GH url).") public String INPUT = null; - @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (BAM or SAM).") + @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (SAM, BAM or CRAM).") public File OUTPUT = null; @Option(shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME, optional = true, @@ -132,9 +132,10 @@ protected int doWork() { outHeader.setReadGroups(Arrays.asList(rg)); if (SORT_ORDER != null) outHeader.setSortOrder(SORT_ORDER); - final SAMFileWriter outWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, + final SAMFileWriter outWriter = new SAMFileWriterFactory().makeWriter(outHeader, outHeader.getSortOrder() == inHeader.getSortOrder(), - OUTPUT); + OUTPUT, + REFERENCE_SEQUENCE); final ProgressLogger progress = new ProgressLogger(log); for (final SAMRecord read : in) { diff --git a/src/main/java/picard/sam/CleanSam.java b/src/main/java/picard/sam/CleanSam.java index 9cd845119..fcd736d2f 100644 --- a/src/main/java/picard/sam/CleanSam.java +++ b/src/main/java/picard/sam/CleanSam.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2010 The Broad Institute + * Copyright (c) 2010 - 2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -52,7 +52,7 @@ ) public class CleanSam extends CommandLineProgram { - static final String USAGE = "Cleans the provided SAM/BAM, soft-clipping beyond-end-of-reference alignments and setting MAPQ to 0 for unmapped reads"; + static final String USAGE = "Cleans the provided SAM/BAM/CRAM, soft-clipping beyond-end-of-reference alignments and setting MAPQ to 0 for unmapped reads"; 
@Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input SAM to be cleaned.") public File INPUT; @@ -79,7 +79,7 @@ protected int doWork() { factory.validationStringency(ValidationStringency.LENIENT); } final SamReader reader = factory.open(INPUT); - final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, OUTPUT); + final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(reader.getFileHeader(), true, OUTPUT, REFERENCE_SEQUENCE); final CloseableIterator it = reader.iterator(); final ProgressLogger progress = new ProgressLogger(Log.getInstance(CleanSam.class)); diff --git a/src/main/java/picard/sam/DownsampleSam.java b/src/main/java/picard/sam/DownsampleSam.java index 7fcda82e7..6b5f14508 100644 --- a/src/main/java/picard/sam/DownsampleSam.java +++ b/src/main/java/picard/sam/DownsampleSam.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2015 The Broad Institute + * Copyright (c) 2015-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -44,10 +44,9 @@ import java.io.File; import java.text.DecimalFormat; import java.text.NumberFormat; -import java.util.Random; /** - * Class to randomly downsample a BAM file while respecting that we should either retain or discard + * Class to randomly downsample a SAM, BAM or CRAM file while respecting that we should either retain or discard * all of the reads for a template - i.e. all reads with the same name, whether first or second of * pair, secondary or supplementary, all travel together. * @@ -59,8 +58,8 @@ programGroup = SamOrBam.class ) public class DownsampleSam extends CommandLineProgram { - static final String USAGE_SUMMARY = "Downsample a SAM or BAM file. 
"; - static final String USAGE_DETAILS = "This tool applies a random downsampling algorithm to a SAM or BAM file to retain " + + static final String USAGE_SUMMARY = "Downsample a SAM, BAM or CRAM file. "; + static final String USAGE_DETAILS = "This tool applies a random downsampling algorithm to a SAM, BAM or CRAM file to retain " + "only a random subset of the reads. Reads in a mate-pair are either both kept or both discarded. Reads marked as not primary " + "alignments are all discarded. Each read is given a probability P of being retained so that runs performed with the exact " + "same input in the same order and with the same value for RANDOM_SEED will produce the same results." + @@ -79,10 +78,10 @@ " O=downsampled.bam" + "" + "
"; - @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to downsample.") + @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM, BAM or CRAM file to downsample.") public File INPUT; - @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output, downsampled, SAM or BAM file to write.") + @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output, downsampled, SAM, BAM or CRAM file to write.") public File OUTPUT; @Option(shortName="S", doc="The downsampling strategy to use. See usage for discussion.") @@ -116,11 +115,10 @@ protected int doWork() { log.warn("Running DownsampleSam with PROBABILITY=1! This will likely just recreate the input file."); } - final Random r = RANDOM_SEED == null ? new Random() : new Random(RANDOM_SEED); final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); - final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT); + final SAMFileWriter out = new SAMFileWriterFactory().makeWriter(in.getFileHeader(), true, OUTPUT, REFERENCE_SEQUENCE); final ProgressLogger progress = new ProgressLogger(log, (int) 1e7, "Wrote"); - final DownsamplingIterator iterator = DownsamplingIteratorFactory.make(INPUT, STRATEGY, PROBABILITY, ACCURACY, RANDOM_SEED); + final DownsamplingIterator iterator = DownsamplingIteratorFactory.make(in, STRATEGY, PROBABILITY, ACCURACY, RANDOM_SEED); while (iterator.hasNext()) { final SAMRecord rec = iterator.next(); diff --git a/src/main/java/picard/sam/FastqToSam.java b/src/main/java/picard/sam/FastqToSam.java index 4d89d8527..85eb30e96 100644 --- a/src/main/java/picard/sam/FastqToSam.java +++ b/src/main/java/picard/sam/FastqToSam.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009-2016 The Broad Institute * * Permission is hereby granted, free of charge, to 
any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -32,7 +32,6 @@ import htsjdk.samtools.SAMReadGroupRecord; import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SAMUtils; -import htsjdk.samtools.fastq.FastqConstants; import htsjdk.samtools.fastq.FastqConstants.FastqExtensions; import htsjdk.samtools.fastq.FastqReader; import htsjdk.samtools.fastq.FastqRecord; @@ -57,7 +56,7 @@ import java.util.List; /** - * Converts a fastq file to an unaligned BAM/SAM format. + * Converts a fastq file to an unaligned BAM/SAM/CRAM format. * See MAQ FastQ specification for details. * Three fastq versions are supported: FastqSanger, FastqSolexa and FastqIllumina. * Input files can be in GZip format (end in .gz). @@ -68,7 +67,7 @@ programGroup = SamOrBam.class ) public class FastqToSam extends CommandLineProgram { - static final String USAGE_SUMMARY = "Converts a FASTQ file to an unaligned BAM or SAM file. "; + static final String USAGE_SUMMARY = "Converts a FASTQ file to an unaligned BAM/SAM/CRAM file. "; static final String USAGE_DETAILS = "This tool extracts read sequences and base qualities from the input FASTQ file and writes them" + " out to a new file in unaligned BAM (uBAM) format. Read group information can be provided on the command line.

" + "Three versions of FASTQ quality scales are supported: FastqSanger, FastqSolexa and FastqIllumina " + @@ -98,7 +97,7 @@ "If this value is not specified, the quality format will be detected automatically.", optional = true) public FastqQualityFormat QUALITY_FORMAT; - @Option(doc="Output SAM/BAM file. ", shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME) + @Option(doc="Output BAM/SAM/CRAM file. ", shortName=StandardOptionDefinitions.OUTPUT_SHORT_NAME) public File OUTPUT ; @Option(shortName="RG", doc="Read group name") @@ -137,7 +136,7 @@ @Option(shortName = "DT", doc = "Date the run was produced, to insert into the read group header", optional = true) public Iso8601Date RUN_DATE; - @Option(shortName="SO", doc="The sort order for the output sam/bam file.") + @Option(shortName="SO", doc="The sort order for the output BAM/SAM/CRAM file.") public SortOrder SORT_ORDER = SortOrder.queryname; @Option(doc="Minimum quality allowed in the input fastq. An exception will be thrown if a quality is less than this value.") @@ -245,7 +244,7 @@ protected int doWork() { IOUtil.assertFileIsWritable(OUTPUT); final SAMFileHeader header = createSamFileHeader(); - final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT); + final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(header, false, OUTPUT, REFERENCE_SEQUENCE); // Set the quality format QUALITY_FORMAT = FastqToSam.determineQualityFormat(fileToFastqReader(FASTQ), diff --git a/src/main/java/picard/sam/FilterSamReads.java b/src/main/java/picard/sam/FilterSamReads.java index ecfe31b62..7b7f9778a 100644 --- a/src/main/java/picard/sam/FilterSamReads.java +++ b/src/main/java/picard/sam/FilterSamReads.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2011 The Broad Institute + * Copyright (c) 2011-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the 
"Software"), to deal @@ -54,7 +54,7 @@ import java.util.List; /** - * From a SAM or BAM file, produce a new SAM or BAM by filtering aligned reads or a list of read + * From a SAM/BAM/CRAM file, produce a new SAM/BAM/CRAM by filtering aligned reads or a list of read * names provided in a file (one readname per line) *

* $Id$ @@ -65,8 +65,8 @@ programGroup = SamOrBam.class ) public class FilterSamReads extends CommandLineProgram { - static final String USAGE_SUMMARY = "Subset read data from a SAM or BAM file"; - static final String USAGE_DETAILS = "This tool takes a SAM or BAM file and subsets it to a new file that either excludes or " + + static final String USAGE_SUMMARY = "Subset read data from a SAM/BAM/CRAM file"; + static final String USAGE_DETAILS = "This tool takes a SAM/BAM/CRAM file and subsets it to a new file that either excludes or " + "only includes either aligned or unaligned reads (set using FILTER), or specific reads based on a list of reads names " + "supplied in the READ_LIST_FILE. " + "" + @@ -83,12 +83,12 @@ private static final Log log = Log.getInstance(FilterSamReads.class); protected /* <- used in test */ enum Filter { - includeAligned("OUTPUT SAM/BAM will contain aligned reads only. INPUT SAM/BAM must be in queryname SortOrder. (Note that *both* first and second of paired reads must be aligned to be included in the OUTPUT SAM or BAM)"), - excludeAligned("OUTPUT SAM/BAM will contain un-mapped reads only. INPUT SAM/BAM must be in queryname SortOrder. (Note that *both* first and second of pair must be aligned to be excluded from the OUTPUT SAM or BAM)"), - includeReadList("OUTPUT SAM/BAM will contain reads that are supplied in the READ_LIST_FILE file"), - excludeReadList("OUTPUT bam will contain reads that are *not* supplied in the READ_LIST_FILE file"), - includeJavascript("OUTPUT bam will contain reads that hava been accepted by the JAVASCRIPT_FILE script."), - includePairedIntervals("OUTPUT SAM/BAM will contain any reads (and their mate) that overlap with an interval. INPUT SAM/BAM and INTERVAL_LIST must be in coordinate SortOrder. Only aligned reads will be output."); + includeAligned("OUTPUT SAM/BAM/CRAM will contain aligned reads only. INPUT SAM/BAM/CRAM must be in queryname SortOrder. 
(Note that *both* first and second of paired reads must be aligned to be included in the OUTPUT SAM/BAM/CRAM)"), + excludeAligned("OUTPUT SAM/BAM/CRAM will contain un-mapped reads only. INPUT SAM/BAM/CRAM must be in queryname SortOrder. (Note that *both* first and second of pair must be aligned to be excluded from the OUTPUT SAM/BAM/CRAM)"), + includeReadList("OUTPUT SAM/BAM/CRAM will contain reads that are supplied in the READ_LIST_FILE file"), + excludeReadList("OUTPUT SAM/BAM/CRAM will contain reads that are *not* supplied in the READ_LIST_FILE file"), + includeJavascript("OUTPUT SAM/BAM/CRAM will contain reads that have been accepted by the JAVASCRIPT_FILE script."), + includePairedIntervals("OUTPUT SAM/BAM/CRAM will contain any reads (and their mate) that overlap with an interval. INPUT SAM/BAM/CRAM and INTERVAL_LIST must be in coordinate SortOrder. Only aligned reads will be output."); private final String description; Filter(final String description) { @@ -101,7 +101,7 @@ public String toString() { } } - @Option(doc = "The SAM or BAM file that will be filtered.", + @Option(doc = "The SAM/BAM/CRAM file that will be filtered.", optional = false, shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) public File INPUT; @@ -109,18 +109,18 @@ public String toString() { @Option(doc = "Filter.", optional = false) public Filter FILTER = null; - @Option(doc = "Read List File containing reads that will be included or excluded from the OUTPUT SAM or BAM file.", + @Option(doc = "Read List File containing reads that will be included or excluded from the OUTPUT SAM/BAM/CRAM file.", optional = true, shortName = "RLF") public File READ_LIST_FILE; - @Option(doc = "Interval List File containing intervals that will be included or excluded from the OUTPUT SAM or BAM file.", + @Option(doc = "Interval List File containing intervals that will be included or excluded from the OUTPUT SAM/BAM/CRAM file.", optional = true, shortName = "IL") public File INTERVAL_LIST; @Option( - doc = 
"SortOrder of the OUTPUT SAM or BAM file, otherwise use the SortOrder of the INPUT file.", + doc = "SortOrder of the OUTPUT SAM/BAM/CRAM file, otherwise use the SortOrder of the INPUT file.", optional = true, shortName = "SO") public SAMFileHeader.SortOrder SORT_ORDER; @@ -129,12 +129,12 @@ public String toString() { optional = true) public boolean WRITE_READS_FILES = true; - @Option(doc = "SAM or BAM file to write read excluded results to", + @Option(doc = "SAM/BAM/CRAM file to write read excluded results to", optional = false, shortName = "O") public File OUTPUT; @Option(shortName = "JS", - doc = "Filters a SAM or BAM file with a javascript expression using the java javascript-engine. " + doc = "Filters a SAM/BAM/CRAM file with a javascript expression using the java javascript-engine. " + " The script puts the following variables in the script context: " + " 'record' a SamRecord ( https://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/samtools/SAMRecord.html ) and " + " 'header' a SAMFileHeader ( https://samtools.github.io/htsjdk/javadoc/htsjdk/htsjdk/samtools/SAMFileHeader.html )." @@ -161,7 +161,7 @@ private void filterReads(final FilteringSamIterator filteringIterator) { OUTPUT.getName() + " [sortorder=" + fileHeader.getSortOrder().name() + "]"); // create OUTPUT file - final SAMFileWriter outputWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(fileHeader, presorted, OUTPUT); + final SAMFileWriter outputWriter = new SAMFileWriterFactory().makeWriter(fileHeader, presorted, OUTPUT, REFERENCE_SEQUENCE); final ProgressLogger progress = new ProgressLogger(log, (int) 1e6, "Written"); @@ -179,7 +179,7 @@ private void filterReads(final FilteringSamIterator filteringIterator) { /** * Write out a file of read names for debugging purposes. 
* - * @param samOrBamFile The SAM or BAM file for which we are going to write out a file of its + * @param samOrBamFile The SAM/BAM/CRAM file for which we are going to write out a file of its * containing read names */ private void writeReadsFile(final File samOrBamFile) throws IOException { diff --git a/src/main/java/picard/sam/FixMateInformation.java b/src/main/java/picard/sam/FixMateInformation.java index 058150f8a..4fadeba6c 100644 --- a/src/main/java/picard/sam/FixMateInformation.java +++ b/src/main/java/picard/sam/FixMateInformation.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009 - 2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -25,7 +25,6 @@ package picard.sam; import htsjdk.samtools.BAMRecordCodec; -import htsjdk.samtools.BamFileIoUtils; import htsjdk.samtools.MergingSamRecordIterator; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMFileHeader.SortOrder; @@ -134,10 +133,11 @@ protected int doWork() { } else { final File soleInput = INPUT.get(0).getAbsoluteFile(); final File dir = soleInput.getParentFile().getAbsoluteFile(); + final String extension = parseExtension(soleInput); try { IOUtil.assertFileIsWritable(soleInput); IOUtil.assertDirectoryIsWritable(dir); - OUTPUT = File.createTempFile(soleInput.getName() + ".being_fixed.", BamFileIoUtils.BAM_FILE_EXTENSION, dir); + OUTPUT = File.createTempFile(soleInput.getName() + ".being_fixed.", extension, dir); } catch (final IOException ioe) { throw new RuntimeIOException("Could not create tmp file in " + dir.getAbsolutePath()); } @@ -263,9 +263,14 @@ public void close() { return 0; } + private String parseExtension(final File input) { + String[] splitted = input.getName().split("\\."); + return splitted[splitted.length - 1]; + } + protected void createSamFileWriter(final SAMFileHeader header) { - 
out = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, - header.getSortOrder() == SortOrder.queryname, OUTPUT); + out = new SAMFileWriterFactory().makeWriter(header, + header.getSortOrder() == SortOrder.queryname, OUTPUT, REFERENCE_SEQUENCE); } diff --git a/src/main/java/picard/sam/GatherBamFiles.java b/src/main/java/picard/sam/GatherBamFiles.java index b2efb4670..e62024256 100644 --- a/src/main/java/picard/sam/GatherBamFiles.java +++ b/src/main/java/picard/sam/GatherBamFiles.java @@ -7,6 +7,7 @@ import htsjdk.samtools.SAMRecord; import htsjdk.samtools.SamReader; import htsjdk.samtools.SamReaderFactory; +import htsjdk.samtools.cram.build.CramIO; import htsjdk.samtools.util.CloserUtil; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.Log; @@ -20,9 +21,9 @@ import java.util.List; /** - * Program to perform a rapid "gather" operation on BAM files after a scatter operations where - * the same process has been performed on different regions of a BAM file creating many smaller - * BAM files that now need to be concatenated back together. + * Program to perform a rapid "gather" operation on BAM/CRAM files after a scatter operations where + * the same process has been performed on different regions of a BAM/CRAM file creating many smaller + * BAM/CRAM files that now need to be concatenated back together. * * @author Tim Fennell */ @@ -32,15 +33,15 @@ programGroup = SamOrBam.class ) public class GatherBamFiles extends CommandLineProgram { - static final String USAGE_SUMMARY = "Concatenate one or more BAM files as efficiently as possible"; - static final String USAGE_DETAILS = "This tool performs a rapid \"gather\" operation on BAM files after scatter" + - " operations where the same process has been performed on different regions of a BAM file creating many " + - "smaller BAM files that now need to be concatenated (reassembled) back together." 
+ + static final String USAGE_SUMMARY = "Concatenate one or more BAM/CRAM files as efficiently as possible"; + static final String USAGE_DETAILS = "This tool performs a rapid \"gather\" operation on BAM/CRAM files after scatter" + + " operations where the same process has been performed on different regions of a BAM/CRAM file creating many " + + "smaller BAM/CRAM files that now need to be concatenated (reassembled) back together." + "

" + - "Assumes that the list of BAM files provided as INPUT are in the order that they should be concatenated and" + - " simply concatenates the bodies of the BAM files while retaining the header from the first file. " + + "Assumes that the list of BAM/CRAM files provided as INPUT are in the order that they should be concatenated and" + + " simply concatenates the bodies of the BAM/CRAM files while retaining the header from the first file. " + "Operates via copying of the gzip blocks directly for speed but also supports generation of an MD5 on the" + - " output and indexing of the output BAM file. Only supports BAM files, does not support SAM files." + + " output and indexing of the output BAM/CRAM file. Only supports BAM/CRAM files, does not support SAM files." + "

Usage example:

" + "
" +
             "java -jar picard.jar GatherBamFiles \\
" + @@ -50,10 +51,10 @@ "
" + "
"; @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, - doc = "Two or more BAM files or text files containing lists of BAM files (one per line).") + doc = "Two or more BAM/CRAM files or text files containing lists of BAM/CRAM files (one per line).") public List INPUT; - @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output BAM file to write.") + @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output BAM/CRAM file to write.") public File OUTPUT; private static final Log log = Log.getInstance(GatherBamFiles.class); @@ -67,7 +68,11 @@ public static void main(final String[] args) { @Override protected int doWork() { - final List inputs = IOUtil.unrollFiles(INPUT, BamFileIoUtils.BAM_FILE_EXTENSION, ".sam"); + final List inputs = IOUtil.unrollFiles( + INPUT, + BamFileIoUtils.BAM_FILE_EXTENSION, + IOUtil.SAM_FILE_EXTENSION, + CramIO.CRAM_FILE_EXTENSION); for (final File f : inputs) IOUtil.assertFileIsReadable(f); IOUtil.assertFileIsWritable(OUTPUT); @@ -92,7 +97,7 @@ private boolean determineBlockCopyingStatus(final List inputs) { /** * Simple implementation of a gather operations that uses SAMFileReaders and Writers in order to concatenate - * multiple BAM files. + * multiple BAM/CRAM files. 
*/ private static void gatherNormally(final List inputs, final File output, final boolean createIndex, final boolean createMd5, final File referenceFasta) { @@ -101,7 +106,10 @@ private static void gatherNormally(final List inputs, final File output, f header = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).getFileHeader(inputs.get(0)); } - final SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(createIndex).setCreateMd5File(createMd5).makeSAMOrBAMWriter(header, true, output); + final SAMFileWriter out = new SAMFileWriterFactory() + .setCreateIndex(createIndex) + .setCreateMd5File(createMd5) + .makeWriter(header, true, output, referenceFasta); for (final File f : inputs) { log.info("Gathering " + f.getAbsolutePath()); diff --git a/src/main/java/picard/sam/MergeSamFiles.java b/src/main/java/picard/sam/MergeSamFiles.java index 54db0ab47..c0a3fd2c6 100644 --- a/src/main/java/picard/sam/MergeSamFiles.java +++ b/src/main/java/picard/sam/MergeSamFiles.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -54,7 +54,7 @@ import java.util.Map; /** - * Reads a SAM or BAM file and combines the output to one file + * Reads a SAM, BAM or CRAM file and combines the output to one file * * @author Tim Fennell */ @@ -66,8 +66,8 @@ public class MergeSamFiles extends CommandLineProgram { private static final Log log = Log.getInstance(MergeSamFiles.class); - static final String USAGE_SUMMARY = "Merges multiple SAM and/or BAM files into a single file. "; - static final String USAGE_DETAILS = "This tool is used for combining SAM and/or BAM files from different runs or read groups, similarly " + + static final String USAGE_SUMMARY = "Merges multiple SAM/BAM/CRAM (and/or) files into a single file. 
"; + static final String USAGE_DETAILS = "This tool is used for combining SAM/BAM/CRAM (and/or) files from different runs or read groups, similarly " + "to the \"merge\" function of Samtools (http://www.htslib.org/doc/samtools.html). " + "

Note that to prevent errors in downstream processing, it is critical to identify/label read groups appropriately. " + "If different samples contain identical read group IDs, this tool will avoid collisions by modifying the read group IDs to be " + @@ -83,10 +83,10 @@ "" + "
" ; - @Option(shortName = "I", doc = "SAM or BAM input file", minElements = 1) + @Option(shortName = "I", doc = "SAM/BAM/CRAM input file", minElements = 1) public List INPUT = new ArrayList(); - @Option(shortName = "O", doc = "SAM or BAM file to write merged result to") + @Option(shortName = "O", doc = "SAM/BAM/CRAM file to write merged result to") public File OUTPUT; @Option(shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME, doc = "Sort order of output file", optional = true) @@ -101,14 +101,14 @@ @Option(doc = "Option to create a background thread to encode, " + "compress and write to disk the output file. The threaded version uses about 20% more CPU and decreases " + - "runtime by ~20% when writing out a compressed BAM file.") + "runtime by ~20% when writing out a compressed BAM/CRAM file.") public boolean USE_THREADING = false; @Option(doc = "Comment(s) to include in the merged output file's header.", optional = true, shortName = "CO") public List COMMENT = new ArrayList(); @Option(shortName = "RGN", doc = "An interval list file that contains the locations of the positions to merge. "+ - "Assume bam are sorted and indexed. "+ + "Assume sam are sorted and indexed. "+ "The resulting file will contain alignments that may overlap with genomic regions outside the requested region. "+ "Unmapped reads are discarded.", optional = true) @@ -121,7 +121,7 @@ public static void main(final String[] argv) { System.exit(new MergeSamFiles().instanceMain(argv)); } - /** Combines multiple SAM/BAM files into one. */ + /** Combines multiple SAM/BAM/CRAM files into one. */ @Override protected int doWork() { boolean matchedSortOrders = true; @@ -141,7 +141,7 @@ protected int doWork() { IOUtil.assertFileIsReadable(inFile); final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(inFile); if ( INTERVALS != null ) { - if( ! in.hasIndex() ) throw new PicardException("Merging with interval but Bam file is not indexed "+ inFile); + if( ! 
in.hasIndex() ) throw new PicardException("Merging with interval but file is not indexed "+ inFile); final CloseableIterator samIterator = new SamRecordIntervalIteratorFactory().makeSamRecordIntervalIterator(in, intervalList, true); samReaderToIterator.put(in, samIterator); } @@ -188,7 +188,7 @@ protected int doWork() { } else { // show warning related to https://github.com/broadinstitute/picard/pull/314/files - log.info("Warning: merged bams from different interval lists may contain the same read in both files"); + log.info("Warning: merged files from different interval lists may contain the same read in both files"); iterator = new MergingSamRecordIterator(headerMerger, samReaderToIterator, true); } final SAMFileHeader header = headerMerger.getMergedHeader(); @@ -200,7 +200,7 @@ protected int doWork() { if (USE_THREADING) { samFileWriterFactory.setUseAsyncIo(true); } - final SAMFileWriter out = samFileWriterFactory.makeSAMOrBAMWriter(header, presorted, OUTPUT); + final SAMFileWriter out = samFileWriterFactory.makeWriter(header, presorted, OUTPUT, REFERENCE_SEQUENCE); // Lastly loop through and write out the records final ProgressLogger progress = new ProgressLogger(log, PROGRESS_INTERVAL); diff --git a/src/main/java/picard/sam/PositionBasedDownsampleSam.java b/src/main/java/picard/sam/PositionBasedDownsampleSam.java index b95dd6cd4..4ed0f70e4 100644 --- a/src/main/java/picard/sam/PositionBasedDownsampleSam.java +++ b/src/main/java/picard/sam/PositionBasedDownsampleSam.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2015 The Broad Institute + * Copyright (c) 2015-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -54,12 +54,12 @@ /** - * Class to downsample a BAM file while respecting that we should either get rid + * Class to downsample a SAM/BAM/CRAM file while respecting that we should either get rid * of both 
ends of a pair or neither end of the pair. In addition, this program uses the read-name * and extracts the position within the tile whence the read came from. The downsampling is based on this position. *

* Note 1: This is technology and read-name dependent. If your read-names do not have coordinate information, or if your - * BAM contains reads from multiple technologies (flowcell versions, sequencing machines) this will not work properly. + * SAM/BAM/CRAM contains reads from multiple technologies (flowcell versions, sequencing machines) this will not work properly. * This has been designed with Illumina MiSeq/HiSeq in mind. *

* Note 2: The downsampling is _not_ random. It is deterministically dependent on the position of the read within its tile. Specifically, @@ -74,13 +74,13 @@ * @author Yossi Farjoun */ @CommandLineProgramProperties( - usage = "Class to downsample a BAM file while respecting that we should either get rid of both ends of a pair or neither \n" + + usage = "Class to downsample a SAM/BAM/CRAM file while respecting that we should either get rid of both ends of a pair or neither \n" + "end of the pair. In addition, this program uses the read-name and extracts the position within the tile whence \n" + "the read came from. The downsampling is based on this position. Results with the exact same input will produce the \n" + "same results.\n" + "\n" + "Note 1: This is technology and read-name dependent. If your read-names do not have coordinate information, or if your\n" + - "BAM contains reads from multiple technologies (flowcell versions, sequencing machines) this will not work properly. \n" + + "SAM/BAM/CRAM contains reads from multiple technologies (flowcell versions, sequencing machines) this will not work properly. \n" + "This has been designed with Illumina MiSeq/HiSeq in mind.\n" + "Note 2: The downsampling is not random. It is deterministically dependent on the position of the read within its tile.\n" + "Note 3: Downsampling twice with this program is not supported.\n" + @@ -89,15 +89,15 @@ "Finally, the code has been designed to simulate sequencing less as accurately as possible, not for getting an exact downsample \n" + "fraction. 
In particular, since the reads may be distributed non-evenly within the lanes/tiles, the resulting downsampling \n" + "percentage will not be accurately determined by the input argument FRACTION.", - usageShort = "Downsample a SAM or BAM file to retain a subset of the reads based on the reads location in each tile in the flowcell.", + usageShort = "Downsample a SAM/BAM/CRAM file to retain a subset of the reads based on the reads location in each tile in the flowcell.", programGroup = SamOrBam.class ) public class PositionBasedDownsampleSam extends CommandLineProgram { - @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM or BAM file to downsample.") + @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM/BAM/CRAM file to downsample.") public File INPUT; - @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output, downsampled, SAM or BAM file to write.") + @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output, downsampled, SAM/BAM/CRAM file to write.") public File OUTPUT; @Option(shortName = "F", doc = "The (approximate) fraction of reads to be kept, between 0 and 1.", optional = false) @@ -198,7 +198,7 @@ private void outputSamRecords() { programRecord.setProgramVersion(getVersion()); header.addProgramRecord(programRecord); - final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, true, OUTPUT); + final SAMFileWriter out = new SAMFileWriterFactory().makeWriter(header, true, OUTPUT, REFERENCE_SEQUENCE); final CircleSelector selector = new CircleSelector(FRACTION); @@ -240,7 +240,7 @@ private void checkProgramRecords() { for (final SAMProgramRecord pg : in.getFileHeader().getProgramRecords()) { if (pg.getProgramName() != null && pg.getProgramName().equals(PG_PROGRAM_NAME)) { - final String outText = "Found previous Program Record that indicates that this BAM has been downsampled already with this program. 
Operation not supported! Previous PG: " + pg.toString(); + final String outText = "Found previous Program Record that indicates that this file has been downsampled already with this program. Operation not supported! Previous PG: " + pg.toString(); if (ALLOW_MULTIPLE_DOWNSAMPLING_DESPITE_WARNINGS) { log.warn(outText); diff --git a/src/main/java/picard/sam/ReorderSam.java b/src/main/java/picard/sam/ReorderSam.java index a0714e01f..aadbc40bc 100644 --- a/src/main/java/picard/sam/ReorderSam.java +++ b/src/main/java/picard/sam/ReorderSam.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2011 The Broad Institute + * Copyright (c) 2011-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -50,31 +50,27 @@ import java.util.Map; /** - * Reorders a SAM/BAM input file according to the order of contigs in a second reference sequence + * Reorders a SAM/BAM/CRAM input file according to the order of contigs in a second reference sequence * * @author mdepristo */ @CommandLineProgramProperties( - usage = "Not to be confused with SortSam which sorts a SAM or BAM file with a valid sequence dictionary, " + - "ReorderSam reorders reads in a SAM/BAM file to match the contig ordering in a provided reference file, " + + usage = "Not to be confused with SortSam which sorts a SAM/BAM/CRAM file with a valid sequence dictionary, " + + "ReorderSam reorders reads in a SAM/BAM/CRAM file to match the contig ordering in a provided reference file, " + "as determined by exact name matching of contigs. Reads mapped to contigs absent in the new " + - "reference are dropped. Runs substantially faster if the input is an indexed BAM file.", - usageShort = "Reorders reads in a SAM or BAM file to match ordering in reference", + "reference are dropped. 
Runs substantially faster if the input is an indexed BAM/CRAM file.", + usageShort = "Reorders reads in a SAM/BAM/CRAM file to match ordering in reference", programGroup = SamOrBam.class ) public class ReorderSam extends CommandLineProgram { - @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file (bam or sam) to extract reads from.") + @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "Input file (SAM/BAM/CRAM) to extract reads from.") public File INPUT; - @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (bam or sam) to write extracted reads to.") + @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "Output file (SAM/BAM/CRAM) to write extracted reads to.") public File OUTPUT; - @Option(shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME, doc = "Reference sequence to reorder reads to match. " + - "A sequence dictionary corresponding to the reference fasta is required. Create one with CreateSequenceDictionary.jar.") - public File REFERENCE; - - @Option(shortName = "S", doc = "If true, then allows only a partial overlap of the BAM contigs with the new reference " + + @Option(shortName = "S", doc = "If true, then allows only a partial overlap of the SAM/BAM/CRAM contigs with the new reference " + "sequence contigs. 
By default, this tool requires a corresponding contig in the new " + "reference for each read contig") public boolean ALLOW_INCOMPLETE_DICT_CONCORDANCE = false; @@ -91,14 +87,22 @@ public static void main(final String[] argv) { new ReorderSam().instanceMainWithExit(argv); } + @Override + protected String[] customCommandLineValidation() { + if (REFERENCE_SEQUENCE == null) { + return new String[]{"Must have a non-null REFERENCE_SEQUENCE"}; + } + return super.customCommandLineValidation(); + } + protected int doWork() { IOUtil.assertFileIsReadable(INPUT); - IOUtil.assertFileIsReadable(REFERENCE); + IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); IOUtil.assertFileIsWritable(OUTPUT); final SamReader in = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); - ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE); + ReferenceSequenceFile reference = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE); SAMSequenceDictionary refDict = reference.getSequenceDictionary(); if (refDict == null) { @@ -107,7 +111,7 @@ protected int doWork() { return 1; } - printDictionary("SAM/BAM file", in.getFileHeader().getSequenceDictionary()); + printDictionary("SAM/BAM/CRAM file", in.getFileHeader().getSequenceDictionary()); printDictionary("Reference", refDict); Map newOrder = buildSequenceDictionaryMap(refDict, in.getFileHeader().getSequenceDictionary()); @@ -117,7 +121,7 @@ protected int doWork() { log.info("Writing reads..."); if (in.hasIndex()) { - final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, true, OUTPUT); + final SAMFileWriter out = new SAMFileWriterFactory().makeWriter(outHeader, true, OUTPUT, REFERENCE_SEQUENCE); // write the reads in contig order for (final SAMSequenceRecord contig : refDict.getSequences()) { @@ -128,7 +132,7 @@ protected int doWork() { writeReads(out, in.queryUnmapped(), newOrder, "unmapped"); out.close(); } else { - SAMFileWriter 
out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, false, OUTPUT); + SAMFileWriter out = new SAMFileWriterFactory().makeWriter(outHeader, false, OUTPUT, REFERENCE_SEQUENCE); writeReads(out, in.iterator(), newOrder, "All reads"); out.close(); } @@ -198,7 +202,7 @@ private void writeReads(final SAMFileWriter out, final SAMSequenceDictionary readsDict) { Map newOrder = new HashMap(); - log.info("Reordering SAM/BAM file:"); + log.info("Reordering SAM/BAM/CRAM file:"); for (final SAMSequenceRecord refRec : refDict.getSequences()) { final SAMSequenceRecord readsRec = readsDict.getSequence(refRec.getSequenceName()); @@ -239,7 +243,7 @@ private void writeReads(final SAMFileWriter out, private void printDictionary(String name, SAMSequenceDictionary dict) { log.info(name); for (final SAMSequenceRecord contig : dict.getSequences()) { - log.info(" SN=%s LN=%d%n", contig.getSequenceName(), contig.getSequenceLength()); + log.info(String.format(" SN=%s LN=%d%n", contig.getSequenceName(), contig.getSequenceLength())); } } } diff --git a/src/main/java/picard/sam/ReplaceSamHeader.java b/src/main/java/picard/sam/ReplaceSamHeader.java index 3f76d4c5a..e33309edf 100644 --- a/src/main/java/picard/sam/ReplaceSamHeader.java +++ b/src/main/java/picard/sam/ReplaceSamHeader.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -53,8 +53,8 @@ programGroup = SamOrBam.class ) public class ReplaceSamHeader extends CommandLineProgram { - static final String USAGE_SUMMARY = "Replaces the SAMFileHeader in a SAM or BAM file. 
"; - static final String USAGE_DETAILS = "This tool makes it possible to replace the header of a SAM or BAM file with the header of another" + + static final String USAGE_SUMMARY = "Replaces the SAMFileHeader in a SAM, BAM or CRAM file. "; + static final String USAGE_DETAILS = "This tool makes it possible to replace the header of a SAM, BAM or CRAM file with the header of another" + "file, or a header block that has been edited manually (in a stub SAM file). The sort order (@SO) of the two input files must " + "be the same.

" + "Note that validation is minimal, so it is up to the user to ensure that all the elements referred to in the SAMRecords " + @@ -68,10 +68,10 @@ " O=bam_with_new_head.bam" + "" + "


"; - @Option(doc = "SAM file from which SAMRecords will be read.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) + @Option(doc = "SAM, BAM or CRAM file from which SAMRecords will be read.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) public File INPUT; - @Option(doc = "SAM file from which SAMFileHeader will be read.") + @Option(doc = "SAM, BAM or CRAM file from which SAMFileHeader will be read.") public File HEADER; @Option(doc = "SAMFileHeader from HEADER file will be written to this file, followed by SAMRecords from INPUT file", @@ -111,7 +111,7 @@ private void standardReheader(final SAMFileHeader replacementHeader) { throw new PicardException("Sort orders of INPUT (" + recordReader.getFileHeader().getSortOrder().name() + ") and HEADER (" + replacementHeader.getSortOrder().name() + ") do not agree."); } - final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(replacementHeader, true, OUTPUT); + final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(replacementHeader, true, OUTPUT, REFERENCE_SEQUENCE); final ProgressLogger progress = new ProgressLogger(Log.getInstance(ReplaceSamHeader.class)); for (final SAMRecord rec : recordReader) { diff --git a/src/main/java/picard/sam/RevertOriginalBaseQualitiesAndAddMateCigar.java b/src/main/java/picard/sam/RevertOriginalBaseQualitiesAndAddMateCigar.java index 3d52c2245..a78cf1e1b 100644 --- a/src/main/java/picard/sam/RevertOriginalBaseQualitiesAndAddMateCigar.java +++ b/src/main/java/picard/sam/RevertOriginalBaseQualitiesAndAddMateCigar.java @@ -25,23 +25,23 @@ import java.util.Iterator; /** - * This tool reverts the original base qualities (if specified) and adds the mate cigar tag to mapped BAMs. + * This tool reverts the original base qualities (if specified) and adds the mate cigar tag to mapped SAM, BAM or CRAM files. * If the file does not have OQs and already has mate cigar tags, nothing is done. * New BAM/BAI/MD5 files are created. 
* * @author Nils Homer */ @CommandLineProgramProperties( - usage = "Reverts the original base qualities and adds the mate cigar tag to read-group BAMs.", - usageShort = "Reverts the original base qualities and adds the mate cigar tag to read-group BAMs", + usage = "Reverts the original base qualities and adds the mate cigar tag to read-group SAM, BAM or CRAM files.", + usageShort = "Reverts the original base qualities and adds the mate cigar tag to read-group SAM, BAM or CRAM files", programGroup = SamOrBam.class ) public class RevertOriginalBaseQualitiesAndAddMateCigar extends CommandLineProgram { - @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM/BAM file to revert the state of.") + @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, doc = "The input SAM/BAM/CRAM file to revert the state of.") public File INPUT; - @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output SAM/BAM file to create.") + @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output SAM/BAM/CRAM file to create.") public File OUTPUT; @Option(shortName = "SO", doc = "The sort order to create the reverted output file with." @@ -90,7 +90,7 @@ public int doWork() { outHeader.setSortOrder(SORT_ORDER); SAMFileWriterFactory.setDefaultCreateIndexWhileWriting(CREATE_INDEX); SAMFileWriterFactory.setDefaultCreateMd5File(CREATE_MD5_FILE); - final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outHeader, false, OUTPUT); + final SAMFileWriter out = new SAMFileWriterFactory().makeWriter(outHeader, false, OUTPUT, REFERENCE_SEQUENCE); // Iterate over the records, revert original base qualities, and push them into a SortingCollection by queryname final SortingCollection sorter = SortingCollection.newInstance(SAMRecord.class, new BAMRecordCodec(outHeader), @@ -138,9 +138,9 @@ public int doWork() { * Used as a return for the canSkipSAMFile function. 
*/ public enum CanSkipSamFile { - CAN_SKIP("Can skip the BAM file", true), - CANNOT_SKIP_FOUND_OQ("Cannot skip the BAM as we found a record with an OQ", false), - CANNOT_SKIP_FOUND_NO_MC("Cannot skip the BAM as we found a mate with no mate cigar tag", false), + CAN_SKIP("Can skip the SAM/BAM/CRAM file", true), + CANNOT_SKIP_FOUND_OQ("Cannot skip the SAM/BAM/CRAM as we found a record with an OQ", false), + CANNOT_SKIP_FOUND_NO_MC("Cannot skip the SAM/BAM/CRAM as we found a mate with no mate cigar tag", false), FOUND_NO_EVIDENCE("Found no evidence of OQ or mate with no mate cigar in the first %d records. Will continue...", false); final private String format; final private boolean skip; @@ -156,9 +156,9 @@ private CanSkipSamFile(final String format, final boolean skip) { } /** - * Checks if we can skip the SAM/BAM file when reverting origin base qualities and adding mate cigars. + * Checks if we can skip the SAM/BAM/CRAM file when reverting origin base qualities and adding mate cigars. * - * @param inputFile the SAM/BAM input file + * @param inputFile the SAM/BAM/CRAM input file * @param maxRecordsToExamine the maximum number of records to examine before quitting * @param revertOriginalBaseQualities true if we are to revert original base qualities, false otherwise * @return whether we can skip or not, and the explanation why. diff --git a/src/main/java/picard/sam/SetNmMdAndUqTags.java b/src/main/java/picard/sam/SetNmMdAndUqTags.java index fa52e14cd..1cdd7929e 100644 --- a/src/main/java/picard/sam/SetNmMdAndUqTags.java +++ b/src/main/java/picard/sam/SetNmMdAndUqTags.java @@ -52,8 +52,8 @@ programGroup = SamOrBam.class ) public class SetNmMdAndUqTags extends CommandLineProgram { - static final String USAGE_SUMMARY = "Fixes the NM, MD, and UQ tags in a SAM file. 
"; - static final String USAGE_DETAILS = "This tool takes in a SAM or BAM file (sorted by coordinate) and calculates the NM, MD, and UQ tags by comparing with the reference."+ + static final String USAGE_SUMMARY = "Fixes the NM, MD, and UQ tags in a SAM, BAM or CRAM file. "; + static final String USAGE_DETAILS = "This tool takes in a SAM, BAM or CRAM file (sorted by coordinate) and calculates the NM, MD, and UQ tags by comparing with the reference."+ "
" + "This may be needed when MergeBamAlignment was run with SORT_ORDER different from 'coordinate' and thus could not fix\n"+ "these tags then.
"+ @@ -64,10 +64,10 @@ " O=fixed.bam \\
"+ "" + "
"; - @Option(doc = "The BAM or SAM file to fix.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) + @Option(doc = "The SAM, BAM or CRAM file to fix.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) public File INPUT; - @Option(doc = "The fixed BAM or SAM output file. ", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME) + @Option(doc = "The fixed SAM, BAM or CRAM output file. ", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME) public File OUTPUT; @Option(doc = "Whether the file contains bisulfite sequence (used when calculating the NM tag).") @@ -96,7 +96,7 @@ protected int doWork() { throw new SAMException("Input must be coordinate-sorted for this program to run. Found: " + reader.getFileHeader().getSortOrder()); } - final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), true, OUTPUT); + final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(reader.getFileHeader(), true, OUTPUT, REFERENCE_SEQUENCE); writer.setProgressLogger( new ProgressLogger(log, (int) 1e7, "Wrote", "records")); diff --git a/src/main/java/picard/sam/SortSam.java b/src/main/java/picard/sam/SortSam.java index 28058aa23..63ed3af7b 100644 --- a/src/main/java/picard/sam/SortSam.java +++ b/src/main/java/picard/sam/SortSam.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -50,11 +50,11 @@ programGroup = SamOrBam.class ) public class SortSam extends CommandLineProgram { - static final String USAGE_SUMMARY = "Sorts a SAM or BAM file. "; + static final String USAGE_SUMMARY = "Sorts a SAM, BAM or CRAM file. "; static final String USAGE_DETAILS = "This tool sorts the input SAM or BAM file by coordinate, queryname (QNAME), or some other property " + - "of the SAM record. 
The SortOrder of a SAM/BAM file is found in the SAM file header tag @HD in the field labeled SO. " + + "of the SAM record. The SortOrder of a SAM/BAM/CRAM file is found in the SAM file header tag @HD in the field labeled SO. " + "" + - "

For a coordinate sorted SAM/BAM file, read alignments are sorted first by the reference sequence name (RNAME) field using the " + + "

For a coordinate sorted SAM/BAM/CRAM file, read alignments are sorted first by the reference sequence name (RNAME) field using the " + "reference sequence dictionary (@SQ tag). Alignments within these subgroups are secondarily sorted using the left-most mapping " + "position of the read (POS). Subsequent to this sorting scheme, alignments are listed arbitrarily.

" + "" + @@ -70,10 +70,10 @@ "" + "
"; - @Option(doc = "The BAM or SAM file to sort.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) + @Option(doc = "The SAM, BAM or CRAM file to sort.", shortName = StandardOptionDefinitions.INPUT_SHORT_NAME) public File INPUT; - @Option(doc = "The sorted BAM or SAM output file. ", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME) + @Option(doc = "The sorted SAM, BAM or CRAM output file. ", shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME) public File OUTPUT; @Option(shortName = StandardOptionDefinitions.SORT_ORDER_SHORT_NAME, doc = "Sort order of output file") @@ -91,7 +91,7 @@ protected int doWork() { final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); ; reader.getFileHeader().setSortOrder(SORT_ORDER); - final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), false, OUTPUT); + final SAMFileWriter writer = new SAMFileWriterFactory().makeWriter(reader.getFileHeader(), false, OUTPUT, REFERENCE_SEQUENCE); writer.setProgressLogger( new ProgressLogger(log, (int) 1e7, "Wrote", "records from a sorting collection")); diff --git a/src/main/java/picard/sam/SplitSamByLibrary.java b/src/main/java/picard/sam/SplitSamByLibrary.java index 75282165e..a3b266d6f 100755 --- a/src/main/java/picard/sam/SplitSamByLibrary.java +++ b/src/main/java/picard/sam/SplitSamByLibrary.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -47,27 +47,27 @@ import java.util.Map; /** - * Command-line program to split a SAM or BAM file into separate files based on + * Command-line program to split a SAM/BAM/CRAM file into separate files based on * library name. 
* * @author ktibbett@broadinstitute.org */ @CommandLineProgramProperties( - usage = "Takes a SAM or BAM file and separates all the reads " + - "into one SAM or BAM file per library name. Reads that do not have " + + usage = "Takes a SAM/BAM/CRAM file and separates all the reads " + + "into one SAM/BAM/CRAM file per library name. Reads that do not have " + "a read group specified or whose read group does not have a library name " + - "are written to a file called 'unknown.' The format (SAM or BAM) of the " + + "are written to a file called 'unknown.' The format (SAM/BAM/CRAM) of the " + "output files matches that of the input file. ", - usageShort = "Splits a SAM or BAM file into individual files by library", + usageShort = "Splits a SAM/BAM/CRAM file into individual files by library", programGroup = SamOrBam.class ) public class SplitSamByLibrary extends CommandLineProgram { @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, - doc = "The SAM or BAM file to be split. ") + doc = "The SAM, BAM or CRAM file to be split. ") public File INPUT; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, - doc = "The directory where the library SAM or BAM files should be written " + + doc = "The directory where the library SAM/BAM/CRAM files should be written " + "(defaults to the current directory). ", optional = true) public File OUTPUT = new File(".").getAbsoluteFile(); @@ -84,11 +84,11 @@ protected int doWork() { IOUtil.assertFileIsReadable(INPUT); IOUtil.assertDirectoryIsWritable(OUTPUT); - SamReader reader = SamReaderFactory.makeDefault().open(INPUT); + SamReader reader = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).open(INPUT); Map libraryToWriter = new HashMap(); Map> libraryToRg = new HashMap>(); SAMFileWriterFactory factory = new SAMFileWriterFactory(); - String extension = reader.type().equals(SamReader.Type.BAM_TYPE) ? ".bam" : ".sam"; + String extension = "." 
+ reader.type().fileExtension(); SAMFileHeader unknownHeader = reader.getFileHeader().clone(); unknownHeader.setReadGroups(new ArrayList()); @@ -116,8 +116,9 @@ protected int doWork() { String lib = entry.getKey(); SAMFileHeader header = reader.getFileHeader().clone(); header.setReadGroups(entry.getValue()); - libraryToWriter.put(lib, factory.makeSAMOrBAMWriter(header, true, - new File(OUTPUT, IOUtil.makeFileNameSafe(lib) + extension))); + libraryToWriter.put(lib, factory.makeWriter(header, true, + new File(OUTPUT, IOUtil.makeFileNameSafe(lib) + extension), + REFERENCE_SEQUENCE)); } for (Iterator it = reader.iterator(); it.hasNext(); ) { @@ -127,8 +128,9 @@ protected int doWork() { libraryToWriter.get(rg.getLibrary()).addAlignment(sam); } else { if (unknown == null) { - unknown = factory.makeSAMOrBAMWriter(unknownHeader, true, - new File(OUTPUT, "unknown" + extension)); + unknown = factory.makeWriter(unknownHeader, true, + new File(OUTPUT, "unknown" + extension), + REFERENCE_SEQUENCE); } unknown.addAlignment(sam); } diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java index 0141c4bdc..cf267c2e8 100644 --- a/src/main/java/picard/sam/markduplicates/MarkDuplicates.java +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicates.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2009 The Broad Institute + * Copyright (c) 2009-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -64,7 +64,7 @@ ) public class MarkDuplicates extends AbstractMarkDuplicatesCommandLineProgram { static final String USAGE_SUMMARY = "Identifies duplicate reads. "; - static final String USAGE_DETAILS = "

This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are " + + static final String USAGE_DETAILS = "

This tool locates and tags duplicate reads in a SAM, BAM or CRAM file, where duplicate reads are " + "defined as originating from a single fragment of DNA. Duplicates can arise during sample preparation e.g. library " + "construction using PCR. See also " + "EstimateLibraryComplexity" + @@ -77,16 +77,16 @@ " collected, the tool differentiates the primary and duplicate reads using an algorithm that ranks reads by the sums " + "of their base-quality scores (default method).

" + - "

The tool's main output is a new SAM or BAM file, in which duplicates have been identified in the SAM flags field for each" + + "

The tool's main output is a new SAM, BAM or CRAM file, in which duplicates have been identified in the SAM flags field for each" + " read. Duplicates are marked with the hexadecimal value of 0x0400, which corresponds to a decimal value of 1024. " + "If you are not familiar with this type of annotation, please see the following " + "blog post for additional information.

" + "" + "

Although the bitwise flag annotation indicates whether a read was marked as a duplicate, it does not identify the type of " + "duplicate. To do this, a new tag called the duplicate type (DT) tag was recently added as an optional output in " + - "the 'optional field' section of a SAM/BAM file. Invoking the TAGGING_POLICY option," + + "the 'optional field' section of a SAM/BAM/CRAM file. Invoking the TAGGING_POLICY option," + " you can instruct the program to mark all the duplicates (All), only the optical duplicates (OpticalOnly), or no " + - "duplicates (DontTag). The records within the output of a SAM/BAM file will have values for the 'DT' tag (depending on the invoked " + + "duplicates (DontTag). The records within the output of a SAM/BAM/CRAM file will have values for the 'DT' tag (depending on the invoked " + "TAGGING_POLICY), as either library/PCR-generated duplicates (LB), or sequencing-platform artifact duplicates (SQ). " + "This tool uses the READ_NAME_REGEX and the OPTICAL_DUPLICATE_PIXEL_DISTANCE options as the primary methods to identify " + "and differentiate duplicate types. Set READ_NAME_REGEX to null to skip optical duplicate detection, e.g. for RNA-seq " + @@ -119,7 +119,7 @@ /** Enum used to control how duplicates are flagged in the DT optional tag on each read. */ public enum DuplicateTaggingPolicy { DontTag, OpticalOnly, All } - /** The optional attribute in SAM/BAM files used to store the duplicate type. */ + /** The optional attribute in SAM/BAM/CRAM files used to store the duplicate type. */ public static final String DUPLICATE_TYPE_TAG = "DT"; /** The duplicate type tag value for duplicate type: library. */ public static final String DUPLICATE_TYPE_LIBRARY = "LB"; @@ -205,7 +205,7 @@ public static void main(final String[] args) { } /** - * Main work method. Reads the BAM file once and collects sorted information about + * Main work method. 
Reads the SAM file once and collects sorted information about * the 5' ends of both ends of each read (or just one end in the case of pairs). * Then makes a pass through those determining duplicates before re-reading the * input file and writing it out with duplication flags set correctly. @@ -250,9 +250,10 @@ protected int doWork() { // Key: previous PG ID on a SAM Record (or null). Value: New PG ID to replace it. final Map chainedPgIds = getChainedPgIds(outputHeader); - final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outputHeader, + final SAMFileWriter out = new SAMFileWriterFactory().makeWriter(outputHeader, true, - OUTPUT); + OUTPUT, + REFERENCE_SEQUENCE); // Now copy over the file while marking all the necessary indexes as duplicates long recordInFileIndex = 0; diff --git a/src/main/java/picard/sam/markduplicates/MarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/MarkDuplicatesWithMateCigar.java index fb9aff26a..564cb18a3 100644 --- a/src/main/java/picard/sam/markduplicates/MarkDuplicatesWithMateCigar.java +++ b/src/main/java/picard/sam/markduplicates/MarkDuplicatesWithMateCigar.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2014 The Broad Institute + * Copyright (c) 2014-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -64,7 +64,7 @@ ) public class MarkDuplicatesWithMateCigar extends AbstractMarkDuplicatesCommandLineProgram { static final String USAGE_SUMMARY = "Identifies duplicate reads, accounting for mate CIGAR. 
"; - static final String USAGE_DETAILS = "This tool locates and tags duplicate reads (both PCR and optical) in a BAM or SAM file, where " + + static final String USAGE_DETAILS = "This tool locates and tags duplicate reads (both PCR and optical) in a BAM, SAM or CRAM file, where " + "duplicate reads are defined as originating from the same original fragment of DNA, taking into account the CIGAR string of " + "read mates.

" + "" + @@ -136,9 +136,10 @@ protected int doWork() { final Map chainedPgIds = getChainedPgIds(outputHeader); // Open the output - final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outputHeader, + final SAMFileWriter out = new SAMFileWriterFactory().makeWriter(outputHeader, true, - OUTPUT); + OUTPUT, + REFERENCE_SEQUENCE); // Create the mark duplicate iterator. The duplicate marking is handled by the iterator, conveniently. final MarkDuplicatesWithMateCigarIterator iterator = new MarkDuplicatesWithMateCigarIterator(headerAndIterator.header, diff --git a/src/main/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java b/src/main/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java index e8fe4d064..baac38a30 100644 --- a/src/main/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java +++ b/src/main/java/picard/sam/markduplicates/SimpleMarkDuplicatesWithMateCigar.java @@ -1,7 +1,7 @@ /* * The MIT License * - * Copyright (c) 2015 The Broad Institute + * Copyright (c) 2015-2016 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -62,9 +62,9 @@ * @author nhomer */ @CommandLineProgramProperties( - usage = "Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. " + + usage = "Examines aligned records in the supplied SAM/BAM/CRAM file to locate duplicate molecules. 
" + "All records are then written to the output file with the duplicate records flagged.", - usageShort = "Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules.", + usageShort = "Examines aligned records in the supplied SAM/BAM/CRAM file to locate duplicate molecules.", programGroup = Testing.class ) public class SimpleMarkDuplicatesWithMateCigar extends MarkDuplicates { @@ -110,9 +110,10 @@ protected int doWork() { final Map chainedPgIds = getChainedPgIds(outputHeader); // Open the output - final SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(outputHeader, + final SAMFileWriter out = new SAMFileWriterFactory().makeWriter(outputHeader, false, - OUTPUT); + OUTPUT, + REFERENCE_SEQUENCE); final SAMRecordDuplicateComparator comparator = new SAMRecordDuplicateComparator(Collections.singletonList(headerAndIterator.header)); comparator.setScoringStrategy(this.DUPLICATE_SCORING_STRATEGY); diff --git a/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java b/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java index 5bf6972f4..3af3267dd 100644 --- a/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java +++ b/src/main/java/picard/sam/markduplicates/util/AbstractMarkDuplicatesCommandLineProgram.java @@ -36,7 +36,6 @@ import htsjdk.samtools.metrics.MetricsFile; import htsjdk.samtools.util.CloseableIterator; import htsjdk.samtools.util.Histogram; -import htsjdk.samtools.util.Log; import picard.PicardException; import picard.cmdline.Option; import picard.cmdline.StandardOptionDefinitions; @@ -52,14 +51,14 @@ /** * Abstract class that holds parameters and methods common to classes that perform duplicate - * detection and/or marking within SAM/BAM files. + * detection and/or marking within SAM/BAM/CRAM files. 
* * @author Nils Homer */ public abstract class AbstractMarkDuplicatesCommandLineProgram extends AbstractOpticalDuplicateFinderCommandLineProgram { @Option(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME, - doc = "One or more input SAM or BAM files to analyze. Must be coordinate sorted.") + doc = "One or more input SAM, BAM or CRAM files to analyze. Must be coordinate sorted.") public List INPUT; @Option(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, @@ -209,6 +208,7 @@ protected SamHeaderAndIterator openInputs() { for (final String input : INPUT) { SamReader reader = SamReaderFactory.makeDefault() .enable(SamReaderFactory.Option.EAGERLY_DECODE) + .referenceSequence(REFERENCE_SEQUENCE) .open(SamInputResource.of(input)); final SAMFileHeader header = reader.getFileHeader(); diff --git a/src/test/java/picard/sam/CramCompatibilityTest.java b/src/test/java/picard/sam/CramCompatibilityTest.java new file mode 100644 index 000000000..930b84a12 --- /dev/null +++ b/src/test/java/picard/sam/CramCompatibilityTest.java @@ -0,0 +1,275 @@ +package picard.sam; + +import htsjdk.samtools.SamStreams; +import htsjdk.samtools.cram.CRAMException; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.TestUtil; +import org.testng.Assert; +import org.testng.annotations.AfterTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import picard.cmdline.CommandLineProgram; + +import java.io.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; + +public class CramCompatibilityTest { + + public static final String CRAM_FILE = "testdata/picard/sam/test_cram_file_coordinate_sorted.cram"; + public static final String CRAM_FILE_2 = "testdata/picard/sam/test_cram_file_header_only.cram"; + public static final String CRAM_FILE_ONE_PAIR_MC = "testdata/picard/sam/MarkDuplicates/one_pair_mc.cram"; + + public static final String CRAM_FILE_QUERY_SORTED_UNMAPPED = 
"testdata/picard/sam/unmapped_queryname_sorted.cram"; + public static final String CRAM_FILE_QUERY_SORTED = "testdata/picard/sam/test_cram_file_query_sorted.cram"; + + public static final String REFERENCE_FILE = "testdata/picard/sam/test_cram_file.ref.fa"; + public static final String FASTQ_FILE = "testdata/picard/sam/fastq2bam/fastq-sanger/5k-v1-Rhodobacter_LW1.sam.fastq"; + + public static final String CRAM_UNMAPPED = "testdata/picard/sam/SamFileConverterTest/unmapped.cram"; + public static final String CRAM_UNMAPPED_WITH_OQ_TAG = "testdata/picard/sam/unmapped_with_oq_tag.cram"; + + public static final String CRAM_UNMAPPED_PART_1 = "testdata/picard/sam/unmapped_part_1.cram"; + public static final String CRAM_UNMAPPED_PART_2 = "testdata/picard/sam/unmapped_part_2.cram"; + + public static final String CRAM_SPLIT_UNMAPPED = "testdata/picard/sam/split_test_unmapped.cram"; + + public static final String MBA_ALIGNED_CRAM = "testdata/picard/sam/MergeBamAlignment/cliptest.aligned.cram"; + public static final String MBA_UNMAPPED_CRAM = "testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.cram"; + public static final String MBA_REFERENCE = "testdata/picard/sam/MergeBamAlignment/cliptest.fasta"; + + private static final File outputDir = IOUtil.createTempDir("testdata/picard/sam/CramCompatibilityTest", ".tmp"); + + @AfterTest + public void tearDown() { + TestUtil.recursiveDelete(outputDir); + } + + @DataProvider(name = "programArgsForCRAMWithReference") + public Object[][] getArgsForCRAMWithReference() { + return new Object[][] { + {"picard.sam.AddOrReplaceReadGroups", + "RGID=4 RGLB=lib1 RGPL=illumina RGPU=unit1 RGSM=20", + CRAM_FILE, + REFERENCE_FILE + }, + {"picard.sam.CleanSam", null, CRAM_FILE, REFERENCE_FILE}, + {"picard.sam.DownsampleSam", null, CRAM_FILE, REFERENCE_FILE}, + {"picard.sam.FixMateInformation", null, CRAM_FILE, REFERENCE_FILE}, + {"picard.sam.markduplicates.MarkDuplicates", + "M=" + createTempFile("MarkDuplicates", ".dir").getAbsolutePath(), + 
CRAM_FILE, + REFERENCE_FILE + }, + {"picard.sam.MergeSamFiles", null, CRAM_FILE, REFERENCE_FILE}, + {"picard.sam.PositionBasedDownsampleSam", "FRACTION=0.5", CRAM_FILE, REFERENCE_FILE}, + {"picard.sam.SortSam", "SORT_ORDER=queryname", CRAM_FILE, REFERENCE_FILE}, + {"picard.sam.ReplaceSamHeader", "HEADER=" + CRAM_FILE_2, CRAM_FILE, REFERENCE_FILE}, + {"picard.sam.RevertOriginalBaseQualitiesAndAddMateCigar", null, CRAM_FILE_QUERY_SORTED, REFERENCE_FILE}, + {"picard.sam.GatherBamFiles", + "I=" + new File(CRAM_UNMAPPED).getAbsolutePath(), + CRAM_FILE_QUERY_SORTED, + REFERENCE_FILE + }, + {"picard.sam.markduplicates.MarkDuplicatesWithMateCigar", + "M=" + createTempFile("MarkDuplicatesWithMateCigar", ".txt").getAbsolutePath(), + CRAM_FILE, + REFERENCE_FILE + }, + {"picard.sam.markduplicates.SimpleMarkDuplicatesWithMateCigar", + "M=" + createTempFile("SimpleMarkDuplicatesWithMateCigar", ".txt").getAbsolutePath(), + CRAM_FILE_ONE_PAIR_MC, + REFERENCE_FILE + }, + {"picard.sam.ReorderSam", + null, + CRAM_FILE, + REFERENCE_FILE + }, + {"picard.sam.SetNmMdAndUqTags", null, CRAM_FILE, REFERENCE_FILE}, + {"picard.sam.MergeBamAlignment", + "UNMAPPED=" + new File(MBA_UNMAPPED_CRAM).getAbsolutePath() + + " ALIGNED=" + new File(MBA_ALIGNED_CRAM).getAbsolutePath(), + null, + MBA_REFERENCE + }, + {"picard.illumina.MarkIlluminaAdapters", + "METRICS=" + createTempFile("picard.illumina.MarkIlluminaAdapters", ".txt").getAbsolutePath(), + CRAM_FILE_QUERY_SORTED, + REFERENCE_FILE + }, + {"picard.sam.SplitSamByLibrary", null, CRAM_FILE, REFERENCE_FILE} + }; + } + + @Test(dataProvider = "programArgsForCRAMWithReference") + public void testShouldWriteCRAMWhenCRAMWithReference(String program, + String parameters, + String cramFile, + String reference) throws IOException, IllegalAccessException, InstantiationException, ClassNotFoundException { + if (!program.equals("picard.sam.SplitSamByLibrary")) { + final File outputFile = createTempCram(program); + launchProgram(program, cramFile, 
outputFile.getAbsolutePath(), parameters, reference); + assertCRAM(outputFile); + } else { + final File tmpDir = IOUtil.createTempDir(outputDir.getAbsolutePath(), program); + launchProgram(program, cramFile, tmpDir.getAbsolutePath(), parameters, reference); + assertCRAMs(tmpDir); + } + } + + @DataProvider(name = "programArgsForCRAMWithoutReferenceToFail") + public Object[][] getArgsForCRAMWithoutReferenceToFail() { + return new Object[][] { + {"picard.sam.AddOrReplaceReadGroups", + "RGID=4 RGLB=lib1 RGPL=illumina RGPU=unit1 RGSM=20", + CRAM_FILE + }, + {"picard.sam.CleanSam", null, CRAM_FILE}, + {"picard.sam.DownsampleSam", null, CRAM_FILE}, + {"picard.sam.FixMateInformation", null, CRAM_FILE}, + {"picard.sam.markduplicates.MarkDuplicates", + "M=" + createTempFile("MarkDuplicates", ".dir").getAbsolutePath(), + CRAM_FILE + }, + {"picard.sam.MergeSamFiles", null, CRAM_FILE}, + {"picard.sam.PositionBasedDownsampleSam", "FRACTION=0.5", CRAM_FILE}, + {"picard.sam.SortSam", "SORT_ORDER=queryname", CRAM_FILE}, + {"picard.sam.ReplaceSamHeader", "HEADER=" + CRAM_FILE_2, CRAM_FILE}, + {"picard.sam.RevertOriginalBaseQualitiesAndAddMateCigar", null, CRAM_FILE_QUERY_SORTED}, + {"picard.sam.GatherBamFiles", + "I=" + new File(CRAM_UNMAPPED).getAbsolutePath(), + CRAM_FILE_QUERY_SORTED + }, + {"picard.sam.markduplicates.MarkDuplicatesWithMateCigar", + "M=" + createTempFile("MarkDuplicatesWithMateCigar", ".txt").getAbsolutePath(), + CRAM_FILE}, + {"picard.sam.markduplicates.SimpleMarkDuplicatesWithMateCigar", + "M=" + createTempFile("SimpleMarkDuplicatesWithMateCigar", ".txt").getAbsolutePath(), + CRAM_FILE_ONE_PAIR_MC}, + {"picard.illumina.MarkIlluminaAdapters", + "METRICS=" + createTempFile("picard.illumina.MarkIlluminaAdapters", ".txt").getAbsolutePath(), + CRAM_FILE_QUERY_SORTED, + }, + {"picard.sam.SplitSamByLibrary", null, CRAM_FILE} + }; + } + + @Test(dataProvider = "programArgsForCRAMWithoutReferenceToFail", expectedExceptions = CRAMException.class) + public void 
testShouldFailWhenCRAMWithoutReference(String program, + String parameters, + String cramFile) throws IOException, IllegalAccessException, InstantiationException, ClassNotFoundException { + if (!program.equals("picard.sam.SplitSamByLibrary")) { + final File outputFile = createTempCram(program); + launchProgram(program, cramFile, outputFile.getAbsolutePath(), parameters, null); + assertCRAM(outputFile); + } else { + final File tmpDir = IOUtil.createTempDir(outputDir.getAbsolutePath(), program); + launchProgram(program, cramFile, tmpDir.getAbsolutePath(), parameters, null); + assertCRAMs(tmpDir); + } + } + + // test with CRAMs that don't need reference (unmapped CRAMs for input or output) + @DataProvider(name = "programArgsWithUnmappedCRAM") + public Object[][] getArgsWithUnmappedCRAM() { + return new Object[][] { + {"picard.sam.AddOrReplaceReadGroups", "RGID=4 RGLB=lib1 RGPL=illumina RGPU=unit1 RGSM=20", CRAM_UNMAPPED}, + {"picard.sam.CleanSam", null, CRAM_UNMAPPED}, + {"picard.sam.DownsampleSam", null, CRAM_UNMAPPED}, + {"picard.sam.FixMateInformation", null, CRAM_UNMAPPED}, + {"picard.sam.markduplicates.MarkDuplicates", + "M=" + createTempFile("MarkDuplicates", ".dir").getAbsolutePath(), + CRAM_UNMAPPED + }, + {"picard.sam.MergeSamFiles", null, CRAM_UNMAPPED}, + {"picard.sam.PositionBasedDownsampleSam", "FRACTION=0.5", CRAM_UNMAPPED}, + {"picard.sam.SortSam", "SORT_ORDER=unsorted", CRAM_UNMAPPED}, + {"picard.sam.ReplaceSamHeader", "HEADER=" + MBA_UNMAPPED_CRAM, CRAM_UNMAPPED}, + {"picard.sam.RevertOriginalBaseQualitiesAndAddMateCigar", null, CRAM_UNMAPPED_WITH_OQ_TAG}, + {"picard.sam.GatherBamFiles", + "I=" + new File(CRAM_UNMAPPED_PART_2).getAbsolutePath(), + CRAM_UNMAPPED_PART_1 + }, + {"picard.sam.FastqToSam", "F1=" + FASTQ_FILE + " SAMPLE_NAME=s1", null}, + {"picard.illumina.IlluminaBasecallsToSam", + "BASECALLS_DIR=" + new File("testdata/picard/illumina/25T8B25T/Data/Intensities/BaseCalls") + + " LANE=1 READ_STRUCTURE=25S8S25T RUN_BARCODE=HiMom 
SAMPLE_ALIAS=HiDad LIBRARY_NAME=HelloWorld", + null + }, + {"picard.illumina.MarkIlluminaAdapters", + "METRICS=" + createTempFile("picard.illumina.MarkIlluminaAdapters", ".txt").getAbsolutePath(), + CRAM_FILE_QUERY_SORTED_UNMAPPED + }, + {"picard.sam.SplitSamByLibrary", null, CRAM_SPLIT_UNMAPPED} + }; + } + + @Test(dataProvider = "programArgsWithUnmappedCRAM") + public void testShouldWriteCRAMWhenUnmappedCRAMWithoutReference(String program, + String parameters, + String cramFile) throws IOException, IllegalAccessException, InstantiationException, ClassNotFoundException { + if (!program.equals("picard.sam.SplitSamByLibrary")) { + final File outputFile = createTempCram(program); + launchProgram(program, cramFile, outputFile.getAbsolutePath(), parameters, null); + assertCRAM(outputFile); + } else { + final File tmpDir = IOUtil.createTempDir(outputDir.getAbsolutePath(), program); + launchProgram(program, cramFile, tmpDir.getAbsolutePath(), parameters, null); + assertCRAMs(tmpDir); + } + } + + private File createTempCram(String name) throws IOException { + return createTempFile(name, ".cram"); + } + + private static File createTempFile(String name, String extension) { + File file = null; + try { + file = File.createTempFile(name, extension, outputDir); + file.deleteOnExit(); + } catch (IOException e) { + e.printStackTrace(); + } + + return file; + } + + private void launchProgram(String programClassname, + String input, + String output, + String exParams, + String reference) throws ClassNotFoundException, IllegalAccessException, InstantiationException { + final Collection args = new ArrayList<>(); + + if (input != null) { + args.add("INPUT=" + new File(input).getAbsolutePath()); + } + args.add("OUTPUT=" + output); + + if (exParams != null) { + args.addAll(Arrays.asList(exParams.split(" "))); + } + + if (reference != null) { + args.add("REFERENCE_SEQUENCE=" + new File(reference).getAbsolutePath()); + } + + CommandLineProgram program = (CommandLineProgram) 
Class.forName(programClassname).newInstance(); + program.instanceMain(args.toArray(new String[args.size()])); + } + + static void assertCRAM(File outputFile) { + try (InputStream in = new FileInputStream(outputFile)) { + Assert.assertTrue(SamStreams.isCRAMFile(new BufferedInputStream(in)), "File is not a CRAM."); + } catch (IOException e) { + e.printStackTrace(); + } + } + + private static void assertCRAMs(File dir) { + Arrays.stream(dir.listFiles()).filter(file -> file.getName().endsWith("cram")).forEach(CramCompatibilityTest::assertCRAM); + } +} diff --git a/src/test/java/picard/sam/FilterSamReadsTest.java b/src/test/java/picard/sam/FilterSamReadsTest.java index 71469c5ad..54dc3b53f 100644 --- a/src/test/java/picard/sam/FilterSamReadsTest.java +++ b/src/test/java/picard/sam/FilterSamReadsTest.java @@ -24,7 +24,10 @@ package picard.sam; import htsjdk.samtools.*; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.TestUtil; import org.testng.Assert; +import org.testng.annotations.AfterTest; import org.testng.annotations.BeforeTest; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -35,15 +38,10 @@ import java.util.stream.StreamSupport; public class FilterSamReadsTest extends CommandLineProgramTest { - @Override - public String getCommandLineProgramName() { - return FilterSamReads.class.getSimpleName(); - } - private static final int READ_LENGTH = 151; - private final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(); - private final static File TEST_DIR = new File("testdata/picard/sam/FilterSamReads/"); + private final SAMRecordSetBuilder builder = new SAMRecordSetBuilder(); + private final static File TEST_DIR = IOUtil.createTempDir("testdata/picard/sam/FilterSamReads/output", ".tmp"); @BeforeTest public void setUp() { @@ -54,6 +52,16 @@ public void setUp() { builder.addPair("one_of_pair", 0, 1, 1000); //first read should pass, second should not, but both will be kept in first test } + @AfterTest + public void 
tearDown() { + TestUtil.recursiveDelete(TEST_DIR); + } + + @Override + public String getCommandLineProgramName() { + return FilterSamReads.class.getSimpleName(); + } + @DataProvider(name = "dataTestJsFilter") public Object[][] dataTestJsFilter() { return new Object[][]{ @@ -69,22 +77,38 @@ public void setUp() { {"testdata/picard/sam/FilterSamReads/filter2.interval_list", 0} }; } - + /** * filters a SAM using a javascript filter */ @Test(dataProvider = "dataTestJsFilter") public void testJavaScriptFilters(final String samFilename, final String javascriptFilename,final int expectNumber) throws Exception { - // input as SAM file + launchJavaScriptFilter(samFilename, javascriptFilename, expectNumber); + } + + @Test + public void testJavaScriptFiltersWithCRAM() throws Exception { + final FilterSamReads program = setupProgram( + new File("testdata/picard/sam/FilterSamReads/filterOddStarts.js"), + new File(CramCompatibilityTest.CRAM_FILE), + FilterSamReads.Filter.includeJavascript, + CramCompatibilityTest.REFERENCE_FILE); + Assert.assertEquals(program.doWork(), 0); + CramCompatibilityTest.assertCRAM(program.OUTPUT); + } + + private FilterSamReads launchJavaScriptFilter(String samFilename, String javascriptFilename, int expectNumber) throws Exception { + // input as SAM file final File inputSam = new File(samFilename); final File javascriptFile = new File(javascriptFilename); - FilterSamReads filterTest = setupProgram(javascriptFile, inputSam, FilterSamReads.Filter.includeJavascript); - Assert.assertEquals(filterTest.doWork(),0); - + FilterSamReads filterTest = setupProgram(javascriptFile, inputSam, FilterSamReads.Filter.includeJavascript, null); + Assert.assertEquals(filterTest.doWork(), 0); long count = getReadCount(filterTest); Assert.assertEquals(count, expectNumber); + + return filterTest; } /** @@ -106,7 +130,7 @@ public void testPairedIntervalFilter(final String intervalFilename, final int ex final File intervalFile = new File(intervalFilename); - FilterSamReads 
filterTest = setupProgram(intervalFile, inputSam, FilterSamReads.Filter.includePairedIntervals); + FilterSamReads filterTest = setupProgram(intervalFile, inputSam, FilterSamReads.Filter.includePairedIntervals, null); Assert.assertEquals(filterTest.doWork(),0); long count = getReadCount(filterTest); @@ -114,20 +138,23 @@ public void testPairedIntervalFilter(final String intervalFilename, final int ex Assert.assertEquals(count, expectNumber); } - private FilterSamReads setupProgram(final File inputFile, final File inputSam, final FilterSamReads.Filter filter) throws Exception { + private FilterSamReads setupProgram(final File inputFile, final File inputSam, final FilterSamReads.Filter filter, final String reference) throws Exception { final FilterSamReads program = new FilterSamReads(); program.INPUT = inputSam; - program.OUTPUT = File.createTempFile("FilterSamReads.output.", ".sam"); + program.OUTPUT = File.createTempFile("FilterSamReads.output.", getFilenameExtension(inputSam.getAbsolutePath())); program.OUTPUT.deleteOnExit(); program.FILTER = filter; - if(filter == FilterSamReads.Filter.includePairedIntervals) { + if (filter == FilterSamReads.Filter.includePairedIntervals) { program.INTERVAL_LIST = inputFile; - } - else { + } else { program.JAVASCRIPT_FILE = inputFile; } + if (reference != null) { + program.REFERENCE_SEQUENCE = new File(reference); + } + return program; } @@ -140,4 +167,9 @@ private long getReadCount(FilterSamReads filterTest) throws Exception { samReader.close(); return count; } + + private static String getFilenameExtension(String samFilename) { + final String[] split = samFilename.split("\\."); + return "." 
+ split[split.length - 1]; + } } diff --git a/testdata/picard/sam/MarkDuplicates/one_pair_mc.cram b/testdata/picard/sam/MarkDuplicates/one_pair_mc.cram new file mode 100644 index 000000000..ea35315b1 Binary files /dev/null and b/testdata/picard/sam/MarkDuplicates/one_pair_mc.cram differ diff --git a/testdata/picard/sam/MergeBamAlignment/cliptest.aligned.cram b/testdata/picard/sam/MergeBamAlignment/cliptest.aligned.cram new file mode 100644 index 000000000..c66d98819 Binary files /dev/null and b/testdata/picard/sam/MergeBamAlignment/cliptest.aligned.cram differ diff --git a/testdata/picard/sam/MergeBamAlignment/cliptest.fasta.fai b/testdata/picard/sam/MergeBamAlignment/cliptest.fasta.fai new file mode 100644 index 000000000..1e6a48d89 --- /dev/null +++ b/testdata/picard/sam/MergeBamAlignment/cliptest.fasta.fai @@ -0,0 +1 @@ +chr1 1000 6 50 51 diff --git a/testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.cram b/testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.cram new file mode 100644 index 000000000..691fcf68d Binary files /dev/null and b/testdata/picard/sam/MergeBamAlignment/cliptest.unmapped.cram differ diff --git a/testdata/picard/sam/split_test_unmapped.cram b/testdata/picard/sam/split_test_unmapped.cram new file mode 100644 index 000000000..58b807967 Binary files /dev/null and b/testdata/picard/sam/split_test_unmapped.cram differ diff --git a/testdata/picard/sam/test_cram_file.ref.dict b/testdata/picard/sam/test_cram_file.ref.dict new file mode 100644 index 000000000..433246b66 --- /dev/null +++ b/testdata/picard/sam/test_cram_file.ref.dict @@ -0,0 +1,2 @@ +@HD VN:1.5 SO:unsorted +@SQ SN:17 LN:4200 M5:f8c08a4411f07717451464d546b3706d UR:file:\testdata\picard\sam\test_cram_file.fa diff --git a/testdata/picard/sam/test_cram_file.ref.fa b/testdata/picard/sam/test_cram_file.ref.fa new file mode 100644 index 000000000..7c2ec2a88 --- /dev/null +++ b/testdata/picard/sam/test_cram_file.ref.fa @@ -0,0 +1,71 @@ +>17 17:1-4200 
+AAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAA +TGTGCTCTGGGGTCTCTGGGGTCTCACCCACGACCAACTCCCTGGGCCTGGCACCAGGGA +GCTTAACAAACATCTGTCCAGCGAATACCTGCATCCCTAGAAGTGAAGCCACCGCCCAAA +GACACGCCCATGTCCAGCTTAACCTGCATCCCTAGAAGTGAAGGCACCGCCCAAAGACAC +GCCCATGTCCAGCTTATTCTGCCCAGTTCCTCTCCAGAAAGGCTGCATGGTTGACACACA +GTGCCTGCGACAAAGCTGAATGCTATCATTTAAAAACTCCTTGCTGGTTTGAGAGGCAGA +AAATGATATCTCATAGTTGCTTTACTTTGCATATTTTAAAATTGTGACTTTCATGGCATA +AATAATACTGGTTTATTACAGAAGCACTAGAAAATGCATGTGGACAAAAGTTGGGATTAG +GAGAGAGAAATGAAGACATATGTCCACACAAAAACCTGTTCATTGCAGCTTTCTACCATC +ACCAAAAATTGCAAACAACCACACGCCCTTCAACTGGGGAACTCATCAACAACAAACTTG +TGGTTTACCCACACAATGGAAGACCACTTAGCAACAAAAAGGACCAAACTCCTGGTACAT +GCAACTGACAGATGAATCTCAAACGCATTCCTCCGTGTGAAAGAAGCCGGACTCACAGGG +CAACACACTATCTGACTGTTTCATGGGAAAGTCTGGAAACGGCAACACCATTGAGACAGA +AAACAGGTGAGTGGTTGCCTGGGGCCAGGGAACTTTCTGGGGTCATATTCTCTGTGTTGA +TTCTGGTGGTGGAAACAAGACTGTCCCAGCCTGGGTGATACAGCGAGACCCCATCTCTAC +CAAAAAATTAAAAATTAGCTGGGCATGGTGGTGCATGCCTGTAGTCCCAGCTATTCACAG +TGCTGAGGTGGGAAGATGCTTGAGCCCAGGAGTTCAAGGCTGCAATGAGCTATGATTGCG +CCACTGCACTTTGGCCTGGACAACAGAGCAAAACCCTGTCTCTAAAAAAAGAAAAGAAAA +GAAAAACTCACTGGATATGAATGATACAGGTTGAGGATCCATTATCTGAAATGCTTGGAC +CAGATGTTTTGAATTTTGGATTTTTTCATATTTTGTAATCTTTGCAGTATATTTACCAGT +TCAGCATCCCTAACTCAAAAATTCAAAAATCTGAAATCCCAAACGCGCCAATAAGCATTC +CCTTTGAGCGTCATGTCGGTGCTTGGAATGTTTGGGGTTTTGGATTTACAGCTTTGGGAC +GCTCAACCTGTACCTCAATAAACCTGATTTTAAAAAAGTTTGGGGGGATTCCCCTAAGCC +CGCCACCCGGAGACAGCGGATTTCCTTAGTTACTTACTATGCTCCTTGGCCATTTCTCTA +GGTATTGGTATATTGTGTCTGCTGTGAACTGTCCTTGGCCTGTTTGGTGACGGGTGAGGA +GCAGGGACAGAAGGGTCCTGCGTGCCCTGCCTTCACAAGCCCCTGGAAGGAAAGTTGTTT +TGGGATCTCTGCACCCTCAGCCTGGACAACTTGTGCCCATCTGGTGACCCCTCACTCAGC +CACCAGACTTCCACGACAGGCTCCAGCCTCGGCACCTTCAGCCATGGACAGTTCCGCCAG +CGTTGCCCTCTGTTCTGCTGTTTTCTCTACCAGAAGTGCCCTTCCCTCCTCACCTGACCA +CTCTGGGGAAATCCCTCAGCACCCTCCCTGAGCATACCCTACTCTGGCACAAGCCCACCC +TGCAAAGCCCCTGAGGCCCGCCCTGTGGCGTCTCTCCCTCCCTTGCTGTCAGGACAGTGG +TCCTGGCCACCGGGGCTCACGGAGCCGCCCTGTGCCGTGTACCTCTGAGCCCTCTGCACA 
+GTGCCTTCTGCTTGCCTGTGGCTTTGAGAAGAAACCCCTTCTGGTTATACATAAGACAGC +CAGAGAAGGGAGTTGCCCAGGGTGGCACAGCACGTTGCTGCCAGTTACTGCCATTTTCAC +GGGCATGAAATGGAGATAACAACAGGAGCGACCGCACAGGCTGCTGAGCGCGTCACACGC +AGCCATCGCGCAGCTCAGGGATATTACGTGTAACTCGACATGTCAGCGATTGTCACAGGC +ACTGCTACTCCTGGGGTTTTCCATCAAACCCTCAAGAGCTGGGCCTGGGGTCAACTTCCG +GCCTGGGGAAACTGGGGCAAGTATCACCAGAGATGAGCTTTATAAAAATAATGGTGCTAG +CTGGGCATGGTGGCTTGCACCTGTAATCCCAGCACTTTGGGAGGCCGAGCTAGGAGGATC +GTTTGAGTCCAGCAGTTTGAGACCAGCCTGGCCAATACGGCAAAACCCAGTCTCTACAAA +AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGG +AGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGAGTTTGAGGCTGCTGTGAGCTGTGATCG +CATCACTGCATTCCAGCCCGGTGACAGAGTGAGTCACTGTCTCAAAAAAGAAAGGAAGAA +ATAAAGAAAACAAATAAAAATAATAGTGCAGACAAAAGGCCTTGACCCATCTAGCTTTGG +CCCTCAGCATCAACCGCTAGATACGTCCCTCCCTTTCTTCTGGGGCACAGGTCACACTCT +CTTCCAGGTCTAGGATGCAGCTGAGGGGTGCCCCTCTTACCATCTAATCTGTGCCCTTAT +TTCCTCTGCTTTAGTGAGGAAGAGGCCCCTGGTCCATGAAGGGGCCTTTCAGAGACGGGG +ACCCCTGAGGAGCCCCGAGCAGCAGCCGTCGTGTCTCACCCAGGGTGTCTGAAACAGATG +TGGAGGTCTCGGGTGAGGCGTGGCTCAGATACAGGGAGTGGCCCACAGCTCGGCCTGTCT +TTGAAAGGCCACGTGACCTGGCCCACGGCTGGCAGGTGGGACCCAGCTGCAGGGGTCCAG +CAGCACCCACAGCAGCCACCTGTGGCAGGGAGGAGCTTGTGGTACAGTGGACAGGCCCTG +CCCAGATGGCCCCCCGCCTGCCTGTGGAAGTTGACCAGACCATCTGTCACAGCAGGTAAG +ACTCTGCTTTCTGGGCAACCCAGCAGGTGACCCTGGAATTCCTGTCCATCTGGCAGGTGG +GCATTGAAACTGGTTTAAAAATGTCACACCATAGGCCGGGCACAGTGGCTCACGCCTGTA +ATCCCAGCCCTTTGGGAGGCCAGGGTGGGTGGATCACTTGAGGTCAGGAGTTCAAGACCA +GCCTGGCCAACATGGTGAAACCCCGTCTACTAAAAATACAAAAATTAGCCTGGCGTGGTG +GCGCATGCCTGTAATCCCAGCTACTTGGGAAGCTGAGGGATGAGAACTGCTTGAACCTGG +GAGGCAGACGTTGCAGTGAGCTGAGATCACGCCACTGCACTCCAGCCTGGGCAACAGAGT +AAGACTCTGTCTCAAAAAAAAAAAAATCACACCATTTTGGCTTCAGATTGCATATCCTCC +TGCAAGGATATATACGCGTGAAATTCAAGTCAATGACAAATCAGAAGAAAAAACATATAT +ATACGCAAACCAGTATCCTACTGTGTGTGTCGTTTGTTGTGTTTTCGACAGCTGTCCGTG +TTATAATAATTCCTCTAGTTCAAATTTATTCATTTTTAACTTCATAGTACCACATTCTAC +ACACTGCCCATGTCCCCTCAAGCTTCCCCTGGCTCCTGCAACCACAAATCTACTCTCTGC +CTCTGTGGGTTGACCTATTCTGGACACGTCATAGAAATAGAGTCCTGCAACACGTGGCCG 
+TCTGTGTCTGGCTTCTCTCGCTTAGCATCTTGTTTCCAAGGTCCTCCCACAGTGTAGCAT +GCACCTGCTACACTCCTTCTTAGGGCTGATATTCCACGCACCTGCTACACTCCTTCTTAT +GGCTGATATTCCACGCACCTGCTACACTCCTTCTTAGGGCTGATATTCCACACACCCGCT +ACACTCCTTCTTAGGGCTGATATTCCACGCACCCGCTACACTCCTTCTTAGGGCTGATAT +TCCACGCACCTGCTACACTCCTTCTTAGGGCTGATATTCCACGCACCTGCTACACTCCTT +CTTAGGGCTGATATTCCACGCACCTGCTACACTCCTTCTTAGGGCTGATATTCCACGCAC diff --git a/testdata/picard/sam/test_cram_file.ref.fa.fai b/testdata/picard/sam/test_cram_file.ref.fa.fai new file mode 100644 index 000000000..c2112667e --- /dev/null +++ b/testdata/picard/sam/test_cram_file.ref.fa.fai @@ -0,0 +1 @@ +17 4200 14 60 61 diff --git a/testdata/picard/sam/test_cram_file_coordinate_sorted.cram b/testdata/picard/sam/test_cram_file_coordinate_sorted.cram new file mode 100644 index 000000000..4d312952c Binary files /dev/null and b/testdata/picard/sam/test_cram_file_coordinate_sorted.cram differ diff --git a/testdata/picard/sam/test_cram_file_coordinate_sorted.cram.bai b/testdata/picard/sam/test_cram_file_coordinate_sorted.cram.bai new file mode 100644 index 000000000..62bacfd63 Binary files /dev/null and b/testdata/picard/sam/test_cram_file_coordinate_sorted.cram.bai differ diff --git a/testdata/picard/sam/test_cram_file_header_only.cram b/testdata/picard/sam/test_cram_file_header_only.cram new file mode 100644 index 000000000..606986d5f Binary files /dev/null and b/testdata/picard/sam/test_cram_file_header_only.cram differ diff --git a/testdata/picard/sam/test_cram_file_query_sorted.cram b/testdata/picard/sam/test_cram_file_query_sorted.cram new file mode 100644 index 000000000..1f7655a4c Binary files /dev/null and b/testdata/picard/sam/test_cram_file_query_sorted.cram differ diff --git a/testdata/picard/sam/test_cram_file_query_sorted.cram.crai b/testdata/picard/sam/test_cram_file_query_sorted.cram.crai new file mode 100644 index 000000000..bc15eb2ed Binary files /dev/null and b/testdata/picard/sam/test_cram_file_query_sorted.cram.crai differ diff --git 
a/testdata/picard/sam/unmapped_part_1.cram b/testdata/picard/sam/unmapped_part_1.cram new file mode 100644 index 000000000..5b14a69b4 Binary files /dev/null and b/testdata/picard/sam/unmapped_part_1.cram differ diff --git a/testdata/picard/sam/unmapped_part_2.cram b/testdata/picard/sam/unmapped_part_2.cram new file mode 100644 index 000000000..ff842a4f6 Binary files /dev/null and b/testdata/picard/sam/unmapped_part_2.cram differ diff --git a/testdata/picard/sam/unmapped_queryname_sorted.cram b/testdata/picard/sam/unmapped_queryname_sorted.cram new file mode 100644 index 000000000..893e530ba Binary files /dev/null and b/testdata/picard/sam/unmapped_queryname_sorted.cram differ diff --git a/testdata/picard/sam/unmapped_with_oq_tag.cram b/testdata/picard/sam/unmapped_with_oq_tag.cram new file mode 100644 index 000000000..8b9ac1402 Binary files /dev/null and b/testdata/picard/sam/unmapped_with_oq_tag.cram differ