diff --git a/build.gradle b/build.gradle index 300a83fbb..7e2e499c1 100644 --- a/build.gradle +++ b/build.gradle @@ -47,7 +47,7 @@ jacoco { toolVersion = "0.7.5.201505241946" } -final htsjdkVersion = System.getProperty('htsjdk.version', '2.7.0') +final htsjdkVersion = System.getProperty('htsjdk.version', '2.8.0') dependencies { compile 'com.google.guava:guava:15.0' diff --git a/src/main/java/picard/sam/CreateSequenceDictionary.java b/src/main/java/picard/sam/CreateSequenceDictionary.java index 62a09c6cc..6aa587a79 100644 --- a/src/main/java/picard/sam/CreateSequenceDictionary.java +++ b/src/main/java/picard/sam/CreateSequenceDictionary.java @@ -23,14 +23,18 @@ */ package picard.sam; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileWriter; -import htsjdk.samtools.SAMFileWriterFactory; import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceDictionaryCodec; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import htsjdk.samtools.util.AsciiWriter; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Md5CalculatingOutputStream; +import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.samtools.util.SortingCollection; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgram; @@ -39,7 +43,7 @@ import picard.cmdline.programgroups.Fasta; import picard.cmdline.StandardOptionDefinitions; -import java.io.File; +import java.io.*; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -49,7 +53,7 @@ import java.util.Set; /** - * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no + * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no * SAMRecords, and the header contains only sequence records. */ @CommandLineProgramProperties( @@ -114,6 +118,28 @@ public static void main(final String[] argv) { } /** + * Read all the sequences from the given reference file, and convert into SAMSequenceRecords + * @param referenceFile fasta or fasta.gz + * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments. + */ + @Deprecated + public SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) { + final ReferenceSequenceFile refSeqFile = + ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, TRUNCATE_NAMES_AT_WHITESPACE); + ReferenceSequence refSeq; + final List ret = new ArrayList<>(); + final Set sequenceNames = new HashSet<>(); + for (int numSequences = 0; numSequences < NUM_SEQUENCES && (refSeq = refSeqFile.nextSequence()) != null; ++numSequences) { + if (sequenceNames.contains(refSeq.getName())) { + throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName()); + } + sequenceNames.add(refSeq.getName()); + ret.add(makeSequenceRecord(refSeq)); + } + return new SAMSequenceDictionary(ret); + } + + /** * Use reference filename to create URI to go into header if URI was not passed on cmd line. */ protected String[] customCommandLineValidation() { @@ -134,33 +160,55 @@ protected int doWork() { throw new PicardException(OUTPUT.getAbsolutePath() + " already exists. Delete this file and try again, or specify a different output file."); } - final SAMSequenceDictionary sequences = makeSequenceDictionary(REFERENCE); - final SAMFileHeader samHeader = new SAMFileHeader(); - samHeader.setSequenceDictionary(sequences); - final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(samHeader, false, OUTPUT); - samWriter.close(); - return 0; - } - /** - * Read all the sequences from the given reference file, and convert into SAMSequenceRecords - * @param referenceFile fasta or fasta.gz - * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments. - */ - public SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) { - final ReferenceSequenceFile refSeqFile = - ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, TRUNCATE_NAMES_AT_WHITESPACE); - ReferenceSequence refSeq; - final List ret = new ArrayList(); - final Set sequenceNames = new HashSet(); - for (int numSequences = 0; numSequences < NUM_SEQUENCES && (refSeq = refSeqFile.nextSequence()) != null; ++numSequences) { - if (sequenceNames.contains(refSeq.getName())) { - throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName()); + // SortingCollection is used to check uniqueness of sequence names + final SortingCollection sequenceNames = makeSortingCollection(); + try (BufferedWriter writer = makeWriter()) { + final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory. + getReferenceSequenceFile(REFERENCE, TRUNCATE_NAMES_AT_WHITESPACE); + SAMSequenceDictionaryCodec samDictCodec = new SAMSequenceDictionaryCodec(writer); + + samDictCodec.encodeHeaderLine(false); + // read reference sequence one by one and write its metadata + for (ReferenceSequence refSeq = refSeqFile.nextSequence(); refSeq != null; refSeq = refSeqFile.nextSequence()) { + final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq); + samDictCodec.encodeSequenceRecord(samSequenceRecord); + sequenceNames.add(refSeq.getName()); } - sequenceNames.add(refSeq.getName()); - ret.add(makeSequenceRecord(refSeq)); + } catch (FileNotFoundException e) { + throw new PicardException("File " + OUTPUT.getAbsolutePath() + " not found"); + } catch (IOException e) { + throw new PicardException("Can't write to or close output file " + OUTPUT.getAbsolutePath()); } - return new SAMSequenceDictionary(ret); + + // check uniqueness of sequences names + final CloseableIterator iterator = sequenceNames.iterator(); + + if(!iterator.hasNext()) return 0; + + String current = iterator.next(); + while (iterator.hasNext()) { + final String next = iterator.next(); + if (current.equals(next)) { + OUTPUT.delete(); + throw new PicardException("Sequence name " + current + + " appears more than once in reference file"); + } + current = next; + } + return 0; + } + + private BufferedWriter makeWriter() throws FileNotFoundException { + return new BufferedWriter( + new AsciiWriter(this.CREATE_MD5_FILE ? + new Md5CalculatingOutputStream( + new FileOutputStream(OUTPUT, false), + new File(OUTPUT.getAbsolutePath() + ".md5") + ) + : new FileOutputStream(OUTPUT) + ) + ); } /** @@ -196,4 +244,54 @@ private String md5Hash(final byte[] bytes) { } return s; } + + private SortingCollection makeSortingCollection() { + final String name = getClass().getSimpleName(); + final File tmpDir = IOUtil.createTempDir(name, null); + tmpDir.deleteOnExit(); + // 256 byte for one name, and 1/10 part of all memory for this, rough estimate + long maxNamesInRam = Runtime.getRuntime().maxMemory() / 256 / 10; + return SortingCollection.newInstance( + String.class, + new StringCodec(), + String::compareTo, + (int) Math.min(maxNamesInRam, Integer.MAX_VALUE), + tmpDir + ); + } + + private static class StringCodec implements SortingCollection.Codec { + private DataInputStream dis; + private DataOutputStream dos; + + public StringCodec clone() { + return new StringCodec(); + } + + public void setOutputStream(final OutputStream os) { + dos = new DataOutputStream(os); + } + + public void setInputStream(final InputStream is) { + dis = new DataInputStream(is); + } + + public void encode(final String str) { + try { + dos.writeUTF(str); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + public String decode() { + try { + return dis.readUTF(); + } catch (EOFException e) { + return null; + } catch (IOException e) { + throw new PicardException("Exception reading sequence name from temporary file.", e); + } + } + } } diff --git a/src/test/java/picard/sam/CreateSequenceDictionaryTest.java b/src/test/java/picard/sam/CreateSequenceDictionaryTest.java index 9b041ccef..bb0821482 100644 --- a/src/test/java/picard/sam/CreateSequenceDictionaryTest.java +++ b/src/test/java/picard/sam/CreateSequenceDictionaryTest.java @@ -28,15 +28,20 @@ import picard.cmdline.CommandLineProgramTest; import picard.PicardException; +import java.io.BufferedReader; import java.io.File; +import java.io.FileReader; +import java.util.List; +import java.util.stream.Collectors; /** * @author alecw@broadinstitute.org */ public class CreateSequenceDictionaryTest extends CommandLineProgramTest { - public static File TEST_DATA_DIR = new File("testdata/picard/sam"); - public static File BASIC_FASTA = new File(TEST_DATA_DIR, "basic.fasta"); - public static File DUPLICATE_FASTA = new File(TEST_DATA_DIR, "duplicate_sequence_names.fasta"); + public static File TEST_DATA_DIR = new File("testdata/picard"); + public static File BASIC_FASTA = new File(TEST_DATA_DIR + "/sam", "basic.fasta"); + public static File EQUIVALENCE_TEST_FASTA = new File(TEST_DATA_DIR + "/reference", "test.fasta"); + public static File DUPLICATE_FASTA = new File(TEST_DATA_DIR + "/sam", "duplicate_sequence_names.fasta"); public String getCommandLineProgramName() { return CreateSequenceDictionary.class.getSimpleName(); @@ -55,6 +60,32 @@ public void testBasic() throws Exception { Assert.assertEquals(runPicardCommandLine(argv), 0); } + @Test + public void testForEquivalence() throws Exception { + final File outputDict = File.createTempFile("CreateSequenceDictionaryTest.", ".dict"); + outputDict.delete(); + final String[] argv = { + "REFERENCE=" + EQUIVALENCE_TEST_FASTA, + "OUTPUT=" + outputDict, + "TRUNCATE_NAMES_AT_WHITESPACE=false" + }; + Assert.assertEquals(runPicardCommandLine(argv), 0); + + List currentDict = new BufferedReader(new FileReader(outputDict)) + .lines() + //remove info about location fasta file + .map(s -> s.replaceAll("UR:.*", "")) + .collect(Collectors.toList()); + + List expectedDict = new BufferedReader(new FileReader(TEST_DATA_DIR + "/reference/csd_dict.dict")) + .lines() + //remove info about location fasta file + .map(s -> s.replaceAll("UR:.*", "")) + .collect(Collectors.toList()); + + Assert.assertEquals(currentDict, expectedDict); + } + /** * Should throw an exception because with TRUNCATE_NAMES_AT_WHITESPACE, sequence names are not unique. */ diff --git a/testdata/picard/reference/csd_dict.dict b/testdata/picard/reference/csd_dict.dict new file mode 100644 index 000000000..0ca0bde86 --- /dev/null +++ b/testdata/picard/reference/csd_dict.dict @@ -0,0 +1,9 @@ +@HD VN:1.5 +@SQ SN:chr1 LN:101 M5:bd01f7e11515bb6beda8f7257902aa67 UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr2 LN:101 M5:31c33e2155b3de5e2554b693c475b310 UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr3 LN:101 M5:631593c6dd2048ae88dcce2bd505d295 UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr4 LN:101 M5:c60cb92f1ee5b78053c92bdbfa19abf1 UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr5 LN:101 M5:07ebc213c7611db0eacbb1590c3e9bda UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr6 LN:101 M5:7be2f5e7ee39e60a6c3b5b6a41178c6d UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr7 LN:202 M5:93763aaf6a455871c7d7a7718bff9ccf UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr8 LN:202 M5:d339678efce576d5546e88b49a487b63 UR:file:/path/to/testdata/picard/reference/test.fasta \ No newline at end of file