diff --git a/src/main/java/htsjdk/samtools/fastq/BasicFastqWriter.java b/src/main/java/htsjdk/samtools/fastq/BasicFastqWriter.java index 8a5afd38a..0c9596a0b 100644 --- a/src/main/java/htsjdk/samtools/fastq/BasicFastqWriter.java +++ b/src/main/java/htsjdk/samtools/fastq/BasicFastqWriter.java @@ -58,12 +58,10 @@ public BasicFastqWriter(final PrintStream writer) { @Override public void write(final FastqRecord rec) { - writer.print(FastqConstants.SEQUENCE_HEADER); - writer.println(rec.getReadHeader()); - writer.println(rec.getReadString()); - writer.print(FastqConstants.QUALITY_HEADER); - writer.println(rec.getBaseQualityHeader() == null ? "" : rec.getBaseQualityHeader()); - writer.println(rec.getBaseQualityString()); + // encode without creating a String + FastqEncoder.write(writer, rec); + // and print a new line + writer.println(); if (writer.checkError()) { throw new SAMException("Error in writing fastq file " + path); } diff --git a/src/main/java/htsjdk/samtools/fastq/FastqConstants.java b/src/main/java/htsjdk/samtools/fastq/FastqConstants.java index f5d4150ea..4e9b95e5b 100644 --- a/src/main/java/htsjdk/samtools/fastq/FastqConstants.java +++ b/src/main/java/htsjdk/samtools/fastq/FastqConstants.java @@ -29,7 +29,9 @@ public class FastqConstants { public static final String SEQUENCE_HEADER = "@" ; public static final String QUALITY_HEADER = "+" ; - + public static final String FIRST_OF_PAIR = "/1"; + public static final String SECOND_OF_PAIR = "/2"; + public enum FastqExtensions { FASTQ(".fastq"), FASTQ_GZ(".fastq.gz"), diff --git a/src/main/java/htsjdk/samtools/fastq/FastqEncoder.java b/src/main/java/htsjdk/samtools/fastq/FastqEncoder.java new file mode 100644 index 000000000..fdbd02dcc --- /dev/null +++ b/src/main/java/htsjdk/samtools/fastq/FastqEncoder.java @@ -0,0 +1,113 @@ +/* + * The MIT License + * + * Copyright (c) 2016 Daniel Gomez-Sanchez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and 
associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package htsjdk.samtools.fastq; + +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.util.SequenceUtil; + +import java.io.IOException; + +/** + * Codec for encoding records into FASTQ format. + * + * @author Daniel Gomez-Sanchez (magicDGS) + */ +public final class FastqEncoder { + + // cannot be instantiated because it is a utility class + private FastqEncoder() {} + + /** + * Encodes a FastqRecord in the String FASTQ format. + */ + public static String encode(final FastqRecord record) { + // reserve some memory based on the read length + int capacity = record.getReadLength() * 2 + 5; + // reserve some memory based on the read name + if (record.getReadName() != null) { + capacity += record.getReadName().length(); + } + return write(new StringBuilder(capacity), record).toString(); + } + + /** + * Writes a FastqRecord into the Appendable output. + * @throws SAMException if any I/O error occurs. 
+ */ + public static Appendable write(final Appendable out,final FastqRecord record) { + final String readName = record.getReadName(); + final String readString = record.getReadString(); + final String qualHeader = record.getBaseQualityHeader(); + final String qualityString = record.getBaseQualityString(); + try { + return out.append(FastqConstants.SEQUENCE_HEADER) + .append(readName == null ? "" : readName).append('\n') + .append(readString == null ? "" : readString).append('\n') + .append(FastqConstants.QUALITY_HEADER) + .append(qualHeader == null ? "" : qualHeader).append('\n') + .append(qualityString == null ? "" : qualityString); + } catch (IOException e) { + throw new SAMException(e); + } + } + + /** + * Encodes a SAMRecord in the String FASTQ format. + * @see #encode(FastqRecord) + * @see #asFastqRecord(SAMRecord) + */ + public static String encode(final SAMRecord record) { + return encode(asFastqRecord(record)); + } + + /** + * Converts a {@link SAMRecord} into a {@link FastqRecord}. + */ + public static FastqRecord asFastqRecord(final SAMRecord record) { + String readName = record.getReadName(); + if(record.getReadPairedFlag() && (record.getFirstOfPairFlag() || record.getSecondOfPairFlag())) { + readName += (record.getFirstOfPairFlag()) ? FastqConstants.FIRST_OF_PAIR : FastqConstants.SECOND_OF_PAIR; + } + return new FastqRecord(readName, record.getReadString(), null, record.getBaseQualityString()); + } + + /** + * Converts a {@link FastqRecord} into a simple unmapped {@link SAMRecord}. 
+ */ + public static SAMRecord asSAMRecord(final FastqRecord record, final SAMFileHeader header) { + // construct the SAMRecord and set the unmapped flag + final SAMRecord samRecord = new SAMRecord(header); + samRecord.setReadUnmappedFlag(true); + // get the read name from the FastqRecord correctly formatted + final String readName = SequenceUtil.getSamReadNameFromFastqHeader(record.getReadName()); + // set the basic information from the FastqRecord + samRecord.setReadName(readName); + samRecord.setReadBases(record.getReadBases()); + samRecord.setBaseQualities(record.getBaseQualities()); + return samRecord; + } + +} diff --git a/src/main/java/htsjdk/samtools/fastq/FastqRecord.java b/src/main/java/htsjdk/samtools/fastq/FastqRecord.java index b1d3f7507..9fbcd3912 100755 --- a/src/main/java/htsjdk/samtools/fastq/FastqRecord.java +++ b/src/main/java/htsjdk/samtools/fastq/FastqRecord.java @@ -23,62 +23,169 @@ */ package htsjdk.samtools.fastq; +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMUtils; +import htsjdk.samtools.util.StringUtil; + import java.io.Serializable; /** - * Represents a fastq record, fairly literally, i.e. without any conversion. 
+ * Simple representation of a FASTQ record, without any conversion */ public class FastqRecord implements Serializable { private static final long serialVersionUID = 1L; - private final String seqHeaderPrefix; - private final String seqLine; - private final String qualHeaderPrefix; - private final String qualLine; - - public FastqRecord(final String seqHeaderPrefix, final String seqLine, final String qualHeaderPrefix, final String qualLine) { - if (seqHeaderPrefix != null && !seqHeaderPrefix.isEmpty()) this.seqHeaderPrefix = seqHeaderPrefix; - else this.seqHeaderPrefix = null; - if (qualHeaderPrefix != null && !qualHeaderPrefix.isEmpty()) this.qualHeaderPrefix = qualHeaderPrefix; - else this.qualHeaderPrefix = null; - this.seqLine = seqLine ; - this.qualLine = qualLine ; - } - - /** copy constructor */ + private final String readName; + private final String readString; + private final String qualityHeader; + private final String baseQualityString; + + /** + * Default constructor + * + * @param readName the read name (without {@link FastqConstants#SEQUENCE_HEADER}) + * @param readBases the read sequence bases + * @param qualityHeader the quality header (without {@link FastqConstants#QUALITY_HEADER}) + * @param baseQualities the base quality scores + */ + public FastqRecord(final String readName, final String readBases, final String qualityHeader, final String baseQualities) { + if (readName != null && !readName.isEmpty()) { + this.readName = readName; + } else { + this.readName = null; + } + if (qualityHeader != null && !qualityHeader.isEmpty()) { + this.qualityHeader = qualityHeader; + } else { + this.qualityHeader = null; + } + this.readString = readBases; + this.baseQualityString = baseQualities; + } + + /** + * Constructor for byte[] arrays + * + * @param readName the read name (without {@link FastqConstants#SEQUENCE_HEADER}) + * @param readBases the read sequence bases as ASCII bytes ACGTN=. 
+ * @param qualityHeader the quality header (without {@link FastqConstants#QUALITY_HEADER}) + * @param baseQualities the base qualities as binary PHRED scores (not ASCII) + */ + public FastqRecord(final String readName, final byte[] readBases, final String qualityHeader, final byte[] baseQualities) { + this(readName, StringUtil.bytesToString(readBases), qualityHeader, SAMUtils.phredToFastq(baseQualities)); + } + + /** + * Copy constructor + * + * @param other record to copy + */ public FastqRecord(final FastqRecord other) { - if( other == null ) throw new IllegalArgumentException("new FastqRecord(null)"); - this.seqHeaderPrefix = other.seqHeaderPrefix; - this.seqLine = other.seqLine; - this.qualHeaderPrefix = other.qualHeaderPrefix; - this.qualLine = other.qualLine; - } - - /** @return the read name */ - public String getReadHeader() { return seqHeaderPrefix; } - /** @return the read DNA sequence */ - public String getReadString() { return seqLine; } - /** @return the quality header */ - public String getBaseQualityHeader() { return qualHeaderPrefix; } - /** @return the quality string */ - public String getBaseQualityString() { return qualLine; } - /** shortcut to getReadString().length() */ - public int length() { return this.seqLine==null?0:this.seqLine.length();} - + if (other == null) { + throw new IllegalArgumentException("new FastqRecord(null)"); + } + this.readName = other.readName; + this.readString = other.readString; + this.qualityHeader = other.qualityHeader; + this.baseQualityString = other.baseQualityString; + } + + /** + * @return the read name + * @deprecated since 02/2017. Use {@link #getReadName()} instead + */ + @Deprecated + public String getReadHeader() { + return getReadName(); + } + + /** + * Get the read name + * + * @return the read name + */ + public String getReadName() { + return readName; + } + + /** + * Get the DNA sequence + * + * @return read sequence as a string of ACGTN=. 
+ */ + public String getReadString() { + return readString; + } + + /** + * Get the DNA sequence. + * + * @return read sequence as ASCII bytes ACGTN=; {@link SAMRecord#NULL_SEQUENCE} if no bases are present. + */ + public byte[] getReadBases() { + return (readString == null) ? SAMRecord.NULL_SEQUENCE : StringUtil.stringToBytes(readString); + } + + /** + * Get the base qualities encoded as a FASTQ string + * + * @return the quality string + */ + public String getBaseQualityString() { + return baseQualityString; + } + + /** + * Get the base qualities as binary PHRED scores (not ASCII) + * + * @return the base quality; {@link SAMRecord#NULL_QUALS} if no bases are present. + */ + public byte[] getBaseQualities() { + return (baseQualityString == null) ? SAMRecord.NULL_QUALS : SAMUtils.fastqToPhred(baseQualityString); + } + + /** + * Get the read length + * + * @return number of bases in the read + */ + public int getReadLength() { + return (readString == null) ? 0 : readString.length(); + } + + /** + * Get the base quality header + * + * @return the base quality header + */ + public String getBaseQualityHeader() { + return qualityHeader; + } + + /** + * shortcut to getReadString().length() + * + * @deprecated since 02/2017. Use {@link #getReadLength()} instead + */ + @Deprecated + public int length() { + return getReadLength(); + } + @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result - + ((qualHeaderPrefix == null) ? 0 : qualHeaderPrefix.hashCode()); + + ((qualityHeader == null) ? 0 : qualityHeader.hashCode()); result = prime * result - + ((qualLine == null) ? 0 : qualLine.hashCode()); + + ((baseQualityString == null) ? 0 : baseQualityString.hashCode()); result = prime * result - + ((seqHeaderPrefix == null) ? 0 : seqHeaderPrefix.hashCode()); - result = prime * result + ((seqLine == null) ? 0 : seqLine.hashCode()); + + ((readName == null) ? 0 : readName.hashCode()); + result = prime * result + ((readString == null) ? 
0 : readString.hashCode()); return result; } - + @Override public boolean equals(Object obj) { if (this == obj) @@ -88,37 +195,45 @@ public boolean equals(Object obj) { if (getClass() != obj.getClass()) return false; FastqRecord other = (FastqRecord) obj; - if (seqLine == null) { - if (other.seqLine != null) + if (readString == null) { + if (other.readString != null) return false; - } else if (!seqLine.equals(other.seqLine)) + } else if (!readString.equals(other.readString)) return false; - if (qualHeaderPrefix == null) { - if (other.qualHeaderPrefix != null) + if (qualityHeader == null) { + if (other.qualityHeader != null) return false; - } else if (!qualHeaderPrefix.equals(other.qualHeaderPrefix)) + } else if (!qualityHeader.equals(other.qualityHeader)) return false; - if (qualLine == null) { - if (other.qualLine != null) + if (baseQualityString == null) { + if (other.baseQualityString != null) return false; - } else if (!qualLine.equals(other.qualLine)) + } else if (!baseQualityString.equals(other.baseQualityString)) return false; - if (seqHeaderPrefix == null) { - if (other.seqHeaderPrefix != null) + if (readName == null) { + if (other.readName != null) return false; - } else if (!seqHeaderPrefix.equals(other.seqHeaderPrefix)) + } else if (!readName.equals(other.readName)) return false; - + return true; } - + + /** + * Returns the record as the String FASTQ format. + * @see FastqEncoder#encode(FastqRecord) + */ + public String toFastQString() { + return FastqEncoder.encode(this); + } + + /** + * Returns {@link #toFastQString()} + */ @Override public String toString() { - return new StringBuilder(). - append(FastqConstants.SEQUENCE_HEADER).append(this.seqHeaderPrefix==null?"":this.seqHeaderPrefix).append('\n'). - append(this.seqLine==null?"":this.seqLine).append('\n'). - append(FastqConstants.QUALITY_HEADER).append(this.qualHeaderPrefix==null?"":this.qualHeaderPrefix).append('\n'). - append(this.qualLine==null?"":this.qualLine). 
- toString(); - } + // TODO: this should be changed in the future for a simpler and more informative form such as + // TODO: return String.format("%s: %s bp", readName, getReadLength()); + return toFastQString(); + } } diff --git a/src/main/java/htsjdk/samtools/util/SequenceUtil.java b/src/main/java/htsjdk/samtools/util/SequenceUtil.java index 92c1a507d..7088217da 100644 --- a/src/main/java/htsjdk/samtools/util/SequenceUtil.java +++ b/src/main/java/htsjdk/samtools/util/SequenceUtil.java @@ -32,6 +32,7 @@ import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.SAMTag; +import htsjdk.samtools.fastq.FastqConstants; import java.io.File; import java.math.BigInteger; @@ -1005,7 +1006,7 @@ public static String getSamReadNameFromFastqHeader(final String fastqHeader) { // NOTE: the while loop isn't necessarily the most efficient way to handle this but we don't // expect this to ever happen more than once, just trapping pathological cases - while ((readName.endsWith("/1") || readName.endsWith("/2"))) { + while ((readName.endsWith(FastqConstants.FIRST_OF_PAIR) || readName.endsWith(FastqConstants.SECOND_OF_PAIR))) { // If this is an unpaired run we want to make sure that "/1" isn't tacked on the end of the read name, // as this can cause problems down the road (ex. in Picard's MergeBamAlignment). 
readName = readName.substring(0, readName.length() - 2); diff --git a/src/test/java/htsjdk/samtools/fastq/FastqEncoderTest.java b/src/test/java/htsjdk/samtools/fastq/FastqEncoderTest.java new file mode 100644 index 000000000..72e59cff7 --- /dev/null +++ b/src/test/java/htsjdk/samtools/fastq/FastqEncoderTest.java @@ -0,0 +1,75 @@ +/* + * The MIT License + * + * Copyright (c) 2016 Daniel Gomez-Sanchez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +package htsjdk.samtools.fastq; + +import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.SAMRecordSetBuilder; +import org.testng.Assert; +import org.testng.annotations.Test; + +/** + * @author Daniel Gomez-Sanchez (magicDGS) + */ +public class FastqEncoderTest { + + @Test + public void testAsFastqRecord() throws Exception { + final SAMRecord record = new SAMRecordSetBuilder().addFrag("test", 0, 1, false, false, "10M", null, 2); + record.setReadPairedFlag(true); + // test first of pair encoding + record.setFirstOfPairFlag(true); + testRecord(record.getReadName() + FastqConstants.FIRST_OF_PAIR, FastqEncoder.asFastqRecord(record), record); + record.setFirstOfPairFlag(false); + record.setSecondOfPairFlag(true); + testRecord(record.getReadName() + FastqConstants.SECOND_OF_PAIR, FastqEncoder.asFastqRecord(record), record); + record.setSecondOfPairFlag(false); + testRecord(record.getReadName(), FastqEncoder.asFastqRecord(record), record); + } + + private void testRecord(final String expectedReadName, final FastqRecord fastqRecord, final SAMRecord samRecord) { + Assert.assertEquals(fastqRecord.getReadName(), expectedReadName); + Assert.assertEquals(fastqRecord.getBaseQualities(), samRecord.getBaseQualities()); + Assert.assertEquals(fastqRecord.getReadBases(), samRecord.getReadBases()); + Assert.assertNull(fastqRecord.getBaseQualityHeader()); + } + + @Test + public void testAsSAMRecord() throws Exception { + // create a random record + final SAMRecord samRecord = new SAMRecordSetBuilder().addFrag("test", 0, 1, false, false, "10M", null, 2); + FastqRecord fastqRecord = new FastqRecord(samRecord.getReadName(), samRecord.getReadBases(), "", samRecord.getBaseQualities()); + testConvertedSAMRecord(FastqEncoder.asSAMRecord(fastqRecord, samRecord.getHeader()), samRecord); + fastqRecord = new FastqRecord(samRecord.getReadName() + FastqConstants.FIRST_OF_PAIR, samRecord.getReadBases(), "", samRecord.getBaseQualities()); + 
testConvertedSAMRecord(FastqEncoder.asSAMRecord(fastqRecord, samRecord.getHeader()), samRecord); + fastqRecord = new FastqRecord(samRecord.getReadName() + FastqConstants.SECOND_OF_PAIR, samRecord.getReadBases(), "", samRecord.getBaseQualities()); + testConvertedSAMRecord(FastqEncoder.asSAMRecord(fastqRecord, samRecord.getHeader()), samRecord); + } + + private void testConvertedSAMRecord(final SAMRecord converted, final SAMRecord original) { + Assert.assertEquals(converted.getReadName(), original.getReadName()); + Assert.assertEquals(converted.getBaseQualities(), original.getBaseQualities()); + Assert.assertEquals(converted.getReadBases(), original.getReadBases()); + Assert.assertTrue(converted.getReadUnmappedFlag()); + } +} \ No newline at end of file diff --git a/src/test/java/htsjdk/samtools/fastq/FastqRecordTest.java b/src/test/java/htsjdk/samtools/fastq/FastqRecordTest.java index f6f238eab..97a3d3c8d 100644 --- a/src/test/java/htsjdk/samtools/fastq/FastqRecordTest.java +++ b/src/test/java/htsjdk/samtools/fastq/FastqRecordTest.java @@ -15,7 +15,7 @@ public void testBasic() { Assert.assertNull(fastqRecord.getBaseQualityHeader()); - Assert.assertEquals(fastqRecord.getReadHeader(), seqHeaderPrefix); + Assert.assertEquals(fastqRecord.getReadName(), seqHeaderPrefix); Assert.assertEquals(fastqRecord.getBaseQualityString(), qualLine); Assert.assertEquals(fastqRecord.getReadString(), seqLine); Assert.assertNotNull(fastqRecord.toString());//just check not nullness @@ -25,9 +25,9 @@ public void testBasic() { Assert.assertEquals(fastqRecord, fastqRecord); Assert.assertNotEquals(fastqRecord, "fred"); Assert.assertNotEquals("fred", fastqRecord); - Assert.assertEquals(fastqRecord.length(), seqLine.length()); + Assert.assertEquals(fastqRecord.getReadLength(), seqLine.length()); Assert.assertEquals(fastqRecord.getBaseQualityString().length(), fastqRecord.getReadString().length()); - Assert.assertEquals(fastqRecord.getReadString().length(), fastqRecord.length()); + 
Assert.assertEquals(fastqRecord.getReadString().length(), fastqRecord.getReadLength()); } @Test @@ -37,7 +37,7 @@ public void testBasicEmptyHeaderPrefix() { final String qualHeaderPrefix = ""; final String qualLine = ";<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; final FastqRecord fastqRecord = new FastqRecord(seqHeaderPrefix, seqLine, qualHeaderPrefix, qualLine); - Assert.assertNull(fastqRecord.getReadHeader()); + Assert.assertNull(fastqRecord.getReadName()); Assert.assertNull(fastqRecord.getBaseQualityHeader()); } @@ -57,6 +57,11 @@ public void testCopy() { Assert.assertSame(fastqRecord.getBaseQualityHeader(), fastqRecordCopy.getBaseQualityHeader()); } + @Test(expectedExceptions = IllegalArgumentException.class) + public void testNullCopy() { + new FastqRecord(null); + } + @Test public void testNullSeq() { final String seqHeaderPrefix = "header";