diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionaryCodec.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryCodec.java
new file mode 100644
index 000000000..e6e3ba592
--- /dev/null
+++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionaryCodec.java
@@ -0,0 +1,126 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package htsjdk.samtools;
+
+import htsjdk.samtools.util.LineReader;
+import java.io.BufferedWriter;
+
+/**
+ * "On the fly" codec SAMSequenceDictionaryCodec.
+ * Encodes each sequence and directly writes it to the Dictionary file.
+ *
+ * To use this class you should provide BufferedWriter to it, and so you should close it as you stop using this class.
+ * You can work with this class as shown below.
+ *
+ * Example of using this class:
+ *
+ * <pre>{@code
+ * List<SAMSequenceRecord> dict = ...;
+ *
+ * //open BufferedWriter and close in try-with-resources
+ * try (BufferedWriter writer = new BufferedWriter(new FileWriter("path/to/file"))) {
+ *     SAMSequenceDictionaryCodec codec = new SAMSequenceDictionaryCodec(writer);
+ *
+ *     //we have list of sequences, so encode header line and after that encode each sequence
+ *     codec.encodeHeaderLine(false);
+ *     dict.forEach(codec::encodeSequenceRecord);
+ * }
+ * }</pre>
+ *
+ * or
+ *
+ * <pre>{@code
+ * SAMSequenceDictionary dict = ...;
+ *
+ * //open BufferedWriter and close in try-with-resources
+ * try (BufferedWriter writer = new BufferedWriter(new FileWriter("path/to/file"))) {
+ *     SAMSequenceDictionaryCodec codec = new SAMSequenceDictionaryCodec(writer);
+ *
+ *     //we have complete SAMSequenceDictionary, so just encode it.
+ *     codec.encode(dict);
+ * }
+ * }</pre>
+ *
+ * @author Pavel_Silin@epam.com, EPAM Systems, Inc.
+ */
+public class SAMSequenceDictionaryCodec {
+
+    private static final SAMFileHeader EMPTY_HEADER = new SAMFileHeader();
+
+    private final SAMTextHeaderCodec codec;
+
+    /**
+     * Creates a codec that writes encoded lines to the given writer.
+     * @param writer destination for the encoded text; the caller is responsible for closing it.
+     */
+    public SAMSequenceDictionaryCodec(final BufferedWriter writer) {
+        codec = new SAMTextHeaderCodec();
+        codec.setmFileHeader(EMPTY_HEADER);
+        codec.setWriter(writer);
+    }
+
+    /**
+     * Write {@link SAMSequenceRecord}.
+     * @param sequenceRecord object to be converted to text.
+     */
+    public void encodeSequenceRecord(final SAMSequenceRecord sequenceRecord) {
+        codec.encodeSequenceRecord(sequenceRecord);
+    }
+
+    /**
+     * Write Header line.
+     * @param keepExistingVersionNumber boolean flag to keep existing version number.
+     */
+    public void encodeHeaderLine(final boolean keepExistingVersionNumber) {
+        codec.encodeHeaderLine(keepExistingVersionNumber);
+    }
+
+    /**
+     * Reads text SAM header and converts to a SAMSequenceDictionary object.
+     * @param reader Where to get header text from.
+     * @param source Name of the input file, for error messages. May be null.
+     * @return complete SAMSequenceDictionary object.
+     */
+    public SAMSequenceDictionary decode(final LineReader reader, final String source) {
+        return codec.decode(reader, source).getSequenceDictionary();
+    }
+
+    /**
+     * Convert {@link SAMSequenceDictionary} from in-memory representation to text representation.
+     * @param dictionary object to be converted to text.
+     */
+    public void encode(final SAMSequenceDictionary dictionary) {
+        codec.encodeHeaderLine(false);
+        dictionary.getSequences().forEach(this::encodeSequenceRecord);
+    }
+
+    /**
+     * Sets the validation stringency on the wrapped {@link SAMTextHeaderCodec}.
+     * @param validationStringency stringency value passed through to the wrapped codec.
+     */
+    public void setValidationStringency(final ValidationStringency validationStringency) {
+        codec.setValidationStringency(validationStringency);
+    }
+}
diff --git a/src/main/java/htsjdk/samtools/SAMTextHeaderCodec.java b/src/main/java/htsjdk/samtools/SAMTextHeaderCodec.java
index 491bf9b4b..fb4b02ac3 100644
--- a/src/main/java/htsjdk/samtools/SAMTextHeaderCodec.java
+++ b/src/main/java/htsjdk/samtools/SAMTextHeaderCodec.java
@@ -70,6 +70,16 @@
 
     public static final String COMMENT_PREFIX = HEADER_LINE_START + HeaderRecordType.CO.name() + FIELD_SEPARATOR;
 
+    /** Sets the writer that encoded header lines are written to. */
+    void setWriter(final BufferedWriter writer) {
+        this.writer = writer;
+    }
+
+    /** Sets the header this codec operates on; note that {@link #decode} replaces it with a fresh header. */
+    void setmFileHeader(final SAMFileHeader header) {
+        this.mFileHeader = header;
+    }
+
     /**
      * Reads text SAM header and converts to a SAMFileHeader object.
      * @param reader Where to get header text from.
@@ -80,8 +90,8 @@ public SAMFileHeader decode(final LineReader reader, final String source) {
         mFileHeader = new SAMFileHeader();
         mReader = reader;
         mSource = source;
-        sequences = new ArrayList<SAMSequenceRecord>();
-        readGroups = new ArrayList<SAMReadGroupRecord>();
+        sequences = new ArrayList<>();
+        readGroups = new ArrayList<>();
 
         while (advanceLine() != null) {
             final ParsedHeaderLine parsedHeaderLine = new ParsedHeaderLine(mCurrentLine);
@@ -387,6 +397,30 @@ public void encode(final Writer writer, final SAMFileHeader header, final boolea
         }
     }
 
+    /**
+     * Encode {@link SAMSequenceRecord}.
+     * Designed for use in {@link SAMSequenceDictionaryCodec}, allows to implement recording on the fly.
+     * @throws IllegalStateException if writer is null.
+     */
+    void encodeSequenceRecord(final SAMSequenceRecord sequenceRecord) {
+        if (writer == null) {
+            throw new IllegalStateException("writer couldn't be null");
+        }
+        writeSQLine(sequenceRecord);
+    }
+
+    /**
+     * Encode HD line.
+     * Designed for use in {@link SAMSequenceDictionaryCodec}, allows to implement recording on the fly.
+     * @throws IllegalStateException if writer is null.
+     */
+    void encodeHeaderLine(final boolean keepExistingVersionNumber) {
+        if (writer == null) {
+            throw new IllegalStateException("writer couldn't be null");
+        }
+        writeHDLine(keepExistingVersionNumber);
+    }
+
     private void println(final String s) {
         try {
             writer.append(s);
@@ -438,7 +472,7 @@ private void writeHDLine(final boolean keepExistingVersionNumber) {
     }
 
     private void writeSQLine(final SAMSequenceRecord sequenceRecord) {
-        final int numAttributes =sequenceRecord.getAttributes() != null ? sequenceRecord.getAttributes().size() : 0;
+        final int numAttributes = sequenceRecord.getAttributes() != null ? sequenceRecord.getAttributes().size() : 0;
         final String[] fields = new String[3 + numAttributes];
         fields[0] = HEADER_LINE_START + HeaderRecordType.SQ;
         fields[1] = SAMSequenceRecord.SEQUENCE_NAME_TAG + TAG_KEY_VALUE_SEPARATOR + sequenceRecord.getSequenceName();
diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryCodecTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryCodecTest.java
new file mode 100644
index 000000000..32de1cd82
--- /dev/null
+++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryCodecTest.java
@@ -0,0 +1,125 @@
+/*
+ * The MIT License
+ *
+ * Copyright (c) 2016 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+package htsjdk.samtools;
+
+import htsjdk.samtools.util.LineReader;
+import htsjdk.samtools.util.StringLineReader;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+import java.io.BufferedWriter;
+import java.io.StringWriter;
+import java.util.List;
+import java.util.Random;
+
+import static org.testng.Assert.*;
+
+/**
+ * @author Pavel_Silin@epam.com, EPAM Systems, Inc.
+ */
+public class SAMSequenceDictionaryCodecTest {
+
+    private static final Random random = new Random();
+    private SAMSequenceDictionary dictionary;
+    private StringWriter writer;
+    private SAMSequenceDictionaryCodec codec;
+    private BufferedWriter bufferedWriter;
+
+    @BeforeMethod
+    public void setUp() throws Exception {
+        String[] seqs = new String[]{"chr1", "chr2", "chr12", "chr16", "chrX"};
+        dictionary = new SAMSequenceDictionary();
+        for (String seq : seqs) {
+            dictionary.addSequence(new SAMSequenceRecord(seq, random.nextInt(10_000_000)));
+        }
+        writer = new StringWriter();
+        bufferedWriter = new BufferedWriter(writer);
+        codec = new SAMSequenceDictionaryCodec(bufferedWriter);
+    }
+
+    @Test
+    public void testEncodeDecodeDictionary() throws Exception {
+        LineReader readerOne = null;
+        LineReader readerTwo = null;
+        try {
+            codec.encode(dictionary);
+            bufferedWriter.close();
+            readerOne = new StringLineReader(writer.toString());
+            SAMSequenceDictionary actual = codec.decode(readerOne, null);
+            assertEquals(actual, dictionary);
+
+            readerTwo = new StringLineReader(writer.toString());
+
+            String line = readerTwo.readLine();
+            assertTrue(line.startsWith("@HD"));
+
+            line = readerTwo.readLine();
+            while (line != null) {
+                assertTrue(line.startsWith("@SQ"));
+                line = readerTwo.readLine();
+            }
+        } finally {
+            if (readerOne != null) {
+                readerOne.close();
+            }
+            if (readerTwo != null) {
+                readerTwo.close();
+            }
+        }
+    }
+
+    @Test
+    public void testEncodeDecodeListOfSeqs() throws Exception {
+        LineReader readerOne = null;
+        LineReader readerTwo = null;
+
+        try {
+            List<SAMSequenceRecord> sequences = dictionary.getSequences();
+            codec.encodeHeaderLine(false);
+            sequences.forEach(codec::encodeSequenceRecord);
+            bufferedWriter.close();
+            readerOne = new StringLineReader(writer.toString());
+            SAMSequenceDictionary actual = codec.decode(readerOne, null);
+            assertEquals(actual, dictionary);
+            readerTwo = new StringLineReader(writer.toString());
+
+            String line = readerTwo.readLine();
+            assertTrue(line.startsWith("@HD"));
+
+            line = readerTwo.readLine();
+            while (line != null) {
+                assertTrue(line.startsWith("@SQ"));
+                line = readerTwo.readLine();
+            }
+        } finally {
+            if (readerOne != null) {
+                readerOne.close();
+            }
+            if (readerTwo != null) {
+                readerTwo.close();
+            }
+        }
+    }
+}