From fa84e9642c8e1ce7b76e8dcb222ec7185292437b Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Thu, 17 Nov 2016 18:00:06 +0300 Subject: [PATCH 1/8] fix issue 432 --- .../java/picard/sam/CreateSequenceDictionary.java | 143 +++++++++++++++------ 1 file changed, 107 insertions(+), 36 deletions(-) diff --git a/src/main/java/picard/sam/CreateSequenceDictionary.java b/src/main/java/picard/sam/CreateSequenceDictionary.java index 62a09c6cc..7cf6e84a6 100644 --- a/src/main/java/picard/sam/CreateSequenceDictionary.java +++ b/src/main/java/picard/sam/CreateSequenceDictionary.java @@ -23,34 +23,32 @@ */ package picard.sam; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMFileWriter; -import htsjdk.samtools.SAMFileWriterFactory; -import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.samtools.SAMSequenceDictionaryCodec; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.reference.ReferenceSequence; import htsjdk.samtools.reference.ReferenceSequenceFile; import htsjdk.samtools.reference.ReferenceSequenceFileFactory; +import htsjdk.samtools.util.AsciiWriter; +import htsjdk.samtools.util.CloseableIterator; +import htsjdk.samtools.util.IOUtil; +import htsjdk.samtools.util.Md5CalculatingOutputStream; +import htsjdk.samtools.util.RuntimeIOException; +import htsjdk.samtools.util.SortingCollection; import htsjdk.samtools.util.StringUtil; import picard.PicardException; import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; -import picard.cmdline.programgroups.Fasta; import picard.cmdline.StandardOptionDefinitions; +import picard.cmdline.programgroups.Fasta; -import java.io.File; +import java.io.*; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; /** - * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no - * SAMRecords, and the header contains only sequence records. + * Create a .dict file from a fasta containing reference sequence. dict file contains only sequence records. */ @CommandLineProgramProperties( usage = CreateSequenceDictionary.USAGE_SUMMARY + CreateSequenceDictionary.USAGE_DETAILS, @@ -73,6 +71,7 @@ "" + "" + "
"; + // The following attributes define the command-line arguments @Option(doc = "Input reference fasta or fasta.gz", shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME) @@ -134,33 +133,50 @@ protected int doWork() { throw new PicardException(OUTPUT.getAbsolutePath() + " already exists. Delete this file and try again, or specify a different output file."); } - final SAMSequenceDictionary sequences = makeSequenceDictionary(REFERENCE); - final SAMFileHeader samHeader = new SAMFileHeader(); - samHeader.setSequenceDictionary(sequences); - final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMWriter(samHeader, false, OUTPUT); - samWriter.close(); - return 0; - } - /** - * Read all the sequences from the given reference file, and convert into SAMSequenceRecords - * @param referenceFile fasta or fasta.gz - * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments. - */ - public SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) { - final ReferenceSequenceFile refSeqFile = - ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, TRUNCATE_NAMES_AT_WHITESPACE); - ReferenceSequence refSeq; - final List ret = new ArrayList(); - final Set sequenceNames = new HashSet(); - for (int numSequences = 0; numSequences < NUM_SEQUENCES && (refSeq = refSeqFile.nextSequence()) != null; ++numSequences) { - if (sequenceNames.contains(refSeq.getName())) { - throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName()); + // SortingCollection is used to check uniqueness of sequence names + final SortingCollection sequenceNames = makeSortingCollection(); + try (Writer writer = makeWriter()) { + final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory. + getReferenceSequenceFile(REFERENCE, TRUNCATE_NAMES_AT_WHITESPACE); + SAMSequenceDictionaryCodec samDictCodec = new SAMSequenceDictionaryCodec(writer); + + // read reference sequence one by one and write its metadata + ReferenceSequence refSeq; + while ((refSeq = refSeqFile.nextSequence()) != null) { + final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq); + samDictCodec.encodeSQLine(samSequenceRecord); + sequenceNames.add(refSeq.getName()); } - sequenceNames.add(refSeq.getName()); - ret.add(makeSequenceRecord(refSeq)); + } catch (FileNotFoundException e) { + throw new PicardException("File " + OUTPUT.getAbsolutePath() + " not found"); + } catch (IOException e) { + throw new PicardException("Can't write to or close output file " + OUTPUT.getAbsolutePath()); } - return new SAMSequenceDictionary(ret); + + // check uniqueness of sequences names + final CloseableIterator iterator = sequenceNames.iterator(); + String first = iterator.hasNext() ? iterator.next() : ""; + while (iterator.hasNext()) { + final String next = iterator.next(); + if (first.equals(next)) { + OUTPUT.delete(); + throw new PicardException("Sequence name " + first + + " appears more than once in reference file"); + } + first = next; + } + return 0; + } + + private Writer makeWriter() throws FileNotFoundException { + return new AsciiWriter(this.CREATE_MD5_FILE ? + new Md5CalculatingOutputStream( + new FileOutputStream(OUTPUT, false), + new File(OUTPUT.getAbsolutePath() + ".md5") + ) + : new FileOutputStream(OUTPUT) + ); } /** @@ -196,4 +212,59 @@ private String md5Hash(final byte[] bytes) { } return s; } + + private SortingCollection makeSortingCollection() { + final String parent = System.getProperty("java.io.tmpdir") + "/" + System.getProperty("user.name"); + final String child = getClass().getSimpleName(); + final File tmpDir = new File(parent, child); + IOUtil.deleteDirectoryTree(tmpDir); + if (!tmpDir.mkdirs()) { + throw new IllegalStateException("Could not create tmpdir: " + tmpDir.getAbsolutePath()); + } + tmpDir.deleteOnExit(); + // 256 byte for one name, and 1/10 part of all memory for this, rough estimate + long maxNamesInRam = Runtime.getRuntime().maxMemory() / 256 / 10; + return SortingCollection.newInstance( + String.class, + new StringCodec(), + String::compareTo, + maxNamesInRam > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) maxNamesInRam, + tmpDir + ); + } + + private static class StringCodec implements SortingCollection.Codec { + private DataInputStream dis; + private DataOutputStream dos; + + public StringCodec clone() { + return new StringCodec(); + } + + public void setOutputStream(final OutputStream os) { + dos = new DataOutputStream(os); + } + + public void setInputStream(final InputStream is) { + dis = new DataInputStream(is); + } + + public void encode(final String str) { + try { + dos.writeUTF(str); + } catch (IOException e) { + throw new RuntimeIOException(e); + } + } + + public String decode() { + try { + return dis.readUTF(); + } catch (EOFException e) { + return null; + } catch (IOException e) { + throw new PicardException("Exception reading sequence name from temporary file.", e); + } + } + } } From 4eac3b65866bf86a772c82026183c40d69ceb4c9 Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Thu, 17 Nov 2016 19:36:40 +0300 Subject: [PATCH 2/8] fix imports --- src/main/java/picard/sam/CreateSequenceDictionary.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/picard/sam/CreateSequenceDictionary.java b/src/main/java/picard/sam/CreateSequenceDictionary.java index 7cf6e84a6..415e0e16a 100644 --- a/src/main/java/picard/sam/CreateSequenceDictionary.java +++ b/src/main/java/picard/sam/CreateSequenceDictionary.java @@ -39,10 +39,10 @@ import picard.cmdline.CommandLineProgram; import picard.cmdline.CommandLineProgramProperties; import picard.cmdline.Option; -import picard.cmdline.StandardOptionDefinitions; import picard.cmdline.programgroups.Fasta; +import picard.cmdline.StandardOptionDefinitions; -import java.io.*; +import java.io.File; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -71,7 +71,6 @@ "" + "" + "
"; - // The following attributes define the command-line arguments @Option(doc = "Input reference fasta or fasta.gz", shortName = StandardOptionDefinitions.REFERENCE_SHORT_NAME) From da847d0528dad926cdeb337ca4bf81272220f50d Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Fri, 18 Nov 2016 13:15:51 +0300 Subject: [PATCH 3/8] fix imports, writer -> BufferedWriter --- src/main/java/picard/sam/CreateSequenceDictionary.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/picard/sam/CreateSequenceDictionary.java b/src/main/java/picard/sam/CreateSequenceDictionary.java index 415e0e16a..b05c44930 100644 --- a/src/main/java/picard/sam/CreateSequenceDictionary.java +++ b/src/main/java/picard/sam/CreateSequenceDictionary.java @@ -42,7 +42,7 @@ import picard.cmdline.programgroups.Fasta; import picard.cmdline.StandardOptionDefinitions; -import java.io.File; +import java.io.*; import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -135,7 +135,7 @@ protected int doWork() { // SortingCollection is used to check uniqueness of sequence names final SortingCollection sequenceNames = makeSortingCollection(); - try (Writer writer = makeWriter()) { + try (BufferedWriter writer = makeWriter()) { final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory. getReferenceSequenceFile(REFERENCE, TRUNCATE_NAMES_AT_WHITESPACE); SAMSequenceDictionaryCodec samDictCodec = new SAMSequenceDictionaryCodec(writer); @@ -168,13 +168,15 @@ protected int doWork() { return 0; } - private Writer makeWriter() throws FileNotFoundException { - return new AsciiWriter(this.CREATE_MD5_FILE ? + private BufferedWriter makeWriter() throws FileNotFoundException { + return new BufferedWriter( + new AsciiWriter(this.CREATE_MD5_FILE ? new Md5CalculatingOutputStream( new FileOutputStream(OUTPUT, false), new File(OUTPUT.getAbsolutePath() + ".md5") ) : new FileOutputStream(OUTPUT) + ) ); } From 80b36a6f6f5d371c9c5b4b974dca8ee42280786b Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Sun, 20 Nov 2016 14:52:19 +0300 Subject: [PATCH 4/8] correction --- src/main/java/picard/sam/CreateSequenceDictionary.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/picard/sam/CreateSequenceDictionary.java b/src/main/java/picard/sam/CreateSequenceDictionary.java index b05c44930..1ce6bc471 100644 --- a/src/main/java/picard/sam/CreateSequenceDictionary.java +++ b/src/main/java/picard/sam/CreateSequenceDictionary.java @@ -140,11 +140,12 @@ protected int doWork() { getReferenceSequenceFile(REFERENCE, TRUNCATE_NAMES_AT_WHITESPACE); SAMSequenceDictionaryCodec samDictCodec = new SAMSequenceDictionaryCodec(writer); + samDictCodec.encodeHeaderLine(false); // read reference sequence one by one and write its metadata ReferenceSequence refSeq; while ((refSeq = refSeqFile.nextSequence()) != null) { final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq); - samDictCodec.encodeSQLine(samSequenceRecord); + samDictCodec.encodeSequenceRecord(samSequenceRecord); sequenceNames.add(refSeq.getName()); } } catch (FileNotFoundException e) { From 43ee74392b95fd39a5ac7af1948ce3b92dc95fab Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Thu, 8 Dec 2016 00:25:19 +0300 Subject: [PATCH 5/8] htsjdk version -> 2.8.0 --- build.gradle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index 300a83fbb..7e2e499c1 100644 --- a/build.gradle +++ b/build.gradle @@ -47,7 +47,7 @@ jacoco { toolVersion = "0.7.5.201505241946" } -final htsjdkVersion = System.getProperty('htsjdk.version', '2.7.0') +final htsjdkVersion = System.getProperty('htsjdk.version', '2.8.0') dependencies { compile 'com.google.guava:guava:15.0' From 131681ef754e56930d32a465b7d644cfa21c761b Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Fri, 9 Dec 2016 01:11:47 +0300 Subject: [PATCH 6/8] fix to comments, work in progress --- .../java/picard/sam/CreateSequenceDictionary.java | 47 +++++++++++++++++----- .../picard/sam/CreateSequenceDictionaryTest.java | 33 +++++++++++++-- testdata/picard/reference/csd_dict.dict | 9 +++++ 3 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 testdata/picard/reference/csd_dict.dict diff --git a/src/main/java/picard/sam/CreateSequenceDictionary.java b/src/main/java/picard/sam/CreateSequenceDictionary.java index 1ce6bc471..8a6f0a895 100644 --- a/src/main/java/picard/sam/CreateSequenceDictionary.java +++ b/src/main/java/picard/sam/CreateSequenceDictionary.java @@ -23,6 +23,7 @@ */ package picard.sam; +import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceDictionaryCodec; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.reference.ReferenceSequence; @@ -46,9 +47,14 @@ import java.math.BigInteger; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; /** - * Create a .dict file from a fasta containing reference sequence. dict file contains only sequence records. + * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no + * SAMRecords, and the header contains only sequence records. */ @CommandLineProgramProperties( usage = CreateSequenceDictionary.USAGE_SUMMARY + CreateSequenceDictionary.USAGE_DETAILS, @@ -112,6 +118,28 @@ public static void main(final String[] argv) { } /** + * Read all the sequences from the given reference file, and convert into SAMSequenceRecords + * @param referenceFile fasta or fasta.gz + * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments. + */ + @Deprecated + public SAMSequenceDictionary makeSequenceDictionary(final File referenceFile) { + final ReferenceSequenceFile refSeqFile = + ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile, TRUNCATE_NAMES_AT_WHITESPACE); + ReferenceSequence refSeq; + final List ret = new ArrayList<>(); + final Set sequenceNames = new HashSet<>(); + for (int numSequences = 0; numSequences < NUM_SEQUENCES && (refSeq = refSeqFile.nextSequence()) != null; ++numSequences) { + if (sequenceNames.contains(refSeq.getName())) { + throw new PicardException("Sequence name appears more than once in reference: " + refSeq.getName()); + } + sequenceNames.add(refSeq.getName()); + ret.add(makeSequenceRecord(refSeq)); + } + return new SAMSequenceDictionary(ret); + } + + /** * Use reference filename to create URI to go into header if URI was not passed on cmd line. */ protected String[] customCommandLineValidation() { @@ -156,7 +184,10 @@ protected int doWork() { // check uniqueness of sequences names final CloseableIterator iterator = sequenceNames.iterator(); - String first = iterator.hasNext() ? iterator.next() : ""; + + if(!iterator.hasNext()) return 0; + + String first = iterator.next(); while (iterator.hasNext()) { final String next = iterator.next(); if (first.equals(next)) { @@ -216,21 +247,15 @@ private String md5Hash(final byte[] bytes) { } private SortingCollection makeSortingCollection() { - final String parent = System.getProperty("java.io.tmpdir") + "/" + System.getProperty("user.name"); - final String child = getClass().getSimpleName(); - final File tmpDir = new File(parent, child); - IOUtil.deleteDirectoryTree(tmpDir); - if (!tmpDir.mkdirs()) { - throw new IllegalStateException("Could not create tmpdir: " + tmpDir.getAbsolutePath()); - } - tmpDir.deleteOnExit(); + final String name = getClass().getSimpleName(); + final File tmpDir = IOUtil.createTempDir(name, null); // 256 byte for one name, and 1/10 part of all memory for this, rough estimate long maxNamesInRam = Runtime.getRuntime().maxMemory() / 256 / 10; return SortingCollection.newInstance( String.class, new StringCodec(), String::compareTo, - maxNamesInRam > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) maxNamesInRam, + (int) Math.min(maxNamesInRam, Integer.MAX_VALUE), tmpDir ); } diff --git a/src/test/java/picard/sam/CreateSequenceDictionaryTest.java b/src/test/java/picard/sam/CreateSequenceDictionaryTest.java index 9b041ccef..b029b4ff6 100644 --- a/src/test/java/picard/sam/CreateSequenceDictionaryTest.java +++ b/src/test/java/picard/sam/CreateSequenceDictionaryTest.java @@ -28,15 +28,20 @@ import picard.cmdline.CommandLineProgramTest; import picard.PicardException; +import java.io.BufferedReader; import java.io.File; +import java.io.FileReader; +import java.util.List; +import java.util.stream.Collectors; /** * @author alecw@broadinstitute.org */ public class CreateSequenceDictionaryTest extends CommandLineProgramTest { - public static File TEST_DATA_DIR = new File("testdata/picard/sam"); - public static File BASIC_FASTA = new File(TEST_DATA_DIR, "basic.fasta"); - public static File DUPLICATE_FASTA = new File(TEST_DATA_DIR, "duplicate_sequence_names.fasta"); + public static File TEST_DATA_DIR = new File("testdata/picard"); + public static File BASIC_FASTA = new File(TEST_DATA_DIR + "/sam", "basic.fasta"); + public static File EQUIVALENCE_TEST_FASTA = new File(TEST_DATA_DIR + "/reference", "test.fasta"); + public static File DUPLICATE_FASTA = new File(TEST_DATA_DIR + "/sam", "duplicate_sequence_names.fasta"); public String getCommandLineProgramName() { return CreateSequenceDictionary.class.getSimpleName(); @@ -55,6 +60,28 @@ public void testBasic() throws Exception { Assert.assertEquals(runPicardCommandLine(argv), 0); } + @Test + public void testForEquivalence() throws Exception { + final File outputDict = File.createTempFile("CreateSequenceDictionaryTest.", ".dict"); + outputDict.delete(); + final String[] argv = { + "REFERENCE=" + EQUIVALENCE_TEST_FASTA, + "OUTPUT=" + outputDict, + "TRUNCATE_NAMES_AT_WHITESPACE=false" + }; + Assert.assertEquals(runPicardCommandLine(argv), 0); + + List currentDict = new BufferedReader(new FileReader(outputDict)) + .lines() + .collect(Collectors.toList()); + + List expectedDict = new BufferedReader(new FileReader("testdata/picard/reference/csd_dict.dict")) + .lines() + .collect(Collectors.toList()); + + Assert.assertEquals(currentDict, expectedDict); + } + /** * Should throw an exception because with TRUNCATE_NAMES_AT_WHITESPACE, sequence names are not unique. */ diff --git a/testdata/picard/reference/csd_dict.dict b/testdata/picard/reference/csd_dict.dict new file mode 100644 index 000000000..cea62901e --- /dev/null +++ b/testdata/picard/reference/csd_dict.dict @@ -0,0 +1,9 @@ +@HD VN:1.5 +@SQ SN:chr1 LN:101 M5:bd01f7e11515bb6beda8f7257902aa67 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta +@SQ SN:chr2 LN:101 M5:31c33e2155b3de5e2554b693c475b310 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta +@SQ SN:chr3 LN:101 M5:631593c6dd2048ae88dcce2bd505d295 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta +@SQ SN:chr4 LN:101 M5:c60cb92f1ee5b78053c92bdbfa19abf1 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta +@SQ SN:chr5 LN:101 M5:07ebc213c7611db0eacbb1590c3e9bda UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta +@SQ SN:chr6 LN:101 M5:7be2f5e7ee39e60a6c3b5b6a41178c6d UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta +@SQ SN:chr7 LN:202 M5:93763aaf6a455871c7d7a7718bff9ccf UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta +@SQ SN:chr8 LN:202 M5:d339678efce576d5546e88b49a487b63 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta \ No newline at end of file From ae7aacbf805ca3e3c0ec4f209987550b0c4393a4 Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Fri, 9 Dec 2016 01:35:57 +0300 Subject: [PATCH 7/8] fix for test, remove location info about fasta file --- src/test/java/picard/sam/CreateSequenceDictionaryTest.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/test/java/picard/sam/CreateSequenceDictionaryTest.java b/src/test/java/picard/sam/CreateSequenceDictionaryTest.java index b029b4ff6..bb0821482 100644 --- a/src/test/java/picard/sam/CreateSequenceDictionaryTest.java +++ b/src/test/java/picard/sam/CreateSequenceDictionaryTest.java @@ -70,13 +70,17 @@ public void testForEquivalence() throws Exception { "TRUNCATE_NAMES_AT_WHITESPACE=false" }; Assert.assertEquals(runPicardCommandLine(argv), 0); - + List currentDict = new BufferedReader(new FileReader(outputDict)) .lines() + //remove info about location fasta file + .map(s -> s.replaceAll("UR:.*", "")) .collect(Collectors.toList()); - List expectedDict = new BufferedReader(new FileReader("testdata/picard/reference/csd_dict.dict")) + List expectedDict = new BufferedReader(new FileReader(TEST_DATA_DIR + "/reference/csd_dict.dict")) .lines() + //remove info about location fasta file + .map(s -> s.replaceAll("UR:.*", "")) .collect(Collectors.toList()); Assert.assertEquals(currentDict, expectedDict); From b74bab0feac3446d265144399716bf25ee4db942 Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Fri, 9 Dec 2016 15:22:19 +0300 Subject: [PATCH 8/8] correction to comments --- src/main/java/picard/sam/CreateSequenceDictionary.java | 12 ++++++------ testdata/picard/reference/csd_dict.dict | 16 ++++++++-------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/main/java/picard/sam/CreateSequenceDictionary.java b/src/main/java/picard/sam/CreateSequenceDictionary.java index 8a6f0a895..6aa587a79 100644 --- a/src/main/java/picard/sam/CreateSequenceDictionary.java +++ b/src/main/java/picard/sam/CreateSequenceDictionary.java @@ -170,8 +170,7 @@ protected int doWork() { samDictCodec.encodeHeaderLine(false); // read reference sequence one by one and write its metadata - ReferenceSequence refSeq; - while ((refSeq = refSeqFile.nextSequence()) != null) { + for (ReferenceSequence refSeq = refSeqFile.nextSequence(); refSeq != null; refSeq = refSeqFile.nextSequence()) { final SAMSequenceRecord samSequenceRecord = makeSequenceRecord(refSeq); samDictCodec.encodeSequenceRecord(samSequenceRecord); sequenceNames.add(refSeq.getName()); @@ -187,15 +186,15 @@ protected int doWork() { if(!iterator.hasNext()) return 0; - String first = iterator.next(); + String current = iterator.next(); while (iterator.hasNext()) { final String next = iterator.next(); - if (first.equals(next)) { + if (current.equals(next)) { OUTPUT.delete(); - throw new PicardException("Sequence name " + first + + throw new PicardException("Sequence name " + current + " appears more than once in reference file"); } - first = next; + current = next; } return 0; } @@ -249,6 +248,7 @@ private String md5Hash(final byte[] bytes) { private SortingCollection makeSortingCollection() { final String name = getClass().getSimpleName(); final File tmpDir = IOUtil.createTempDir(name, null); + tmpDir.deleteOnExit(); // 256 byte for one name, and 1/10 part of all memory for this, rough estimate long maxNamesInRam = Runtime.getRuntime().maxMemory() / 256 / 10; return SortingCollection.newInstance( diff --git a/testdata/picard/reference/csd_dict.dict b/testdata/picard/reference/csd_dict.dict index cea62901e..0ca0bde86 100644 --- a/testdata/picard/reference/csd_dict.dict +++ b/testdata/picard/reference/csd_dict.dict @@ -1,9 +1,9 @@ @HD VN:1.5 -@SQ SN:chr1 LN:101 M5:bd01f7e11515bb6beda8f7257902aa67 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta -@SQ SN:chr2 LN:101 M5:31c33e2155b3de5e2554b693c475b310 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta -@SQ SN:chr3 LN:101 M5:631593c6dd2048ae88dcce2bd505d295 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta -@SQ SN:chr4 LN:101 M5:c60cb92f1ee5b78053c92bdbfa19abf1 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta -@SQ SN:chr5 LN:101 M5:07ebc213c7611db0eacbb1590c3e9bda UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta -@SQ SN:chr6 LN:101 M5:7be2f5e7ee39e60a6c3b5b6a41178c6d UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta -@SQ SN:chr7 LN:202 M5:93763aaf6a455871c7d7a7718bff9ccf UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta -@SQ SN:chr8 LN:202 M5:d339678efce576d5546e88b49a487b63 UR:file:/home/pavel/git/epm-cmbi/picard-pho/testdata/picard/reference/test.fasta \ No newline at end of file +@SQ SN:chr1 LN:101 M5:bd01f7e11515bb6beda8f7257902aa67 UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr2 LN:101 M5:31c33e2155b3de5e2554b693c475b310 UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr3 LN:101 M5:631593c6dd2048ae88dcce2bd505d295 UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr4 LN:101 M5:c60cb92f1ee5b78053c92bdbfa19abf1 UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr5 LN:101 M5:07ebc213c7611db0eacbb1590c3e9bda UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr6 LN:101 M5:7be2f5e7ee39e60a6c3b5b6a41178c6d UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr7 LN:202 M5:93763aaf6a455871c7d7a7718bff9ccf UR:file:/path/to/testdata/picard/reference/test.fasta +@SQ SN:chr8 LN:202 M5:d339678efce576d5546e88b49a487b63 UR:file:/path/to/testdata/picard/reference/test.fasta \ No newline at end of file