From 84f110b611c984c26eaf82aff08b565dd4c89f48 Mon Sep 17 00:00:00 2001 From: "Philip R. Kensche" Date: Thu, 9 Jun 2016 15:34:20 +0200 Subject: [PATCH 1/5] Optionally writing compressed FASTQs per read group. --- src/main/java/picard/sam/SamToFastq.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/main/java/picard/sam/SamToFastq.java b/src/main/java/picard/sam/SamToFastq.java index f577ed0b9..1ac52b7e7 100755 --- a/src/main/java/picard/sam/SamToFastq.java +++ b/src/main/java/picard/sam/SamToFastq.java @@ -133,6 +133,9 @@ "If the original read is shorter than CLIPPING_MIN_LENGTH then the original read length will be maintained.") public int CLIPPING_MIN_LENGTH = 0; + @Option(shortName = "GZIP", doc = "Compress output FASTQ files using gzip and append a .gz extension to the file names.", optional = false) + public Boolean COMPRESS_OUTPUTS = false; + @Option(shortName = "R1_TRIM", doc = "The number of bases to trim from the beginning of read 1.") public int READ1_TRIM = 0; @@ -286,6 +289,9 @@ private File makeReadGroupFile(final SAMReadGroupRecord readGroup, final String fileName = IOUtil.makeFileNameSafe(fileName); if (preExtSuffix != null) fileName += preExtSuffix; fileName += ".fastq"; + if (COMPRESS_OUTPUTS) { + fileName += ".gz"; + } final File result = (OUTPUT_DIR != null) ? new File(OUTPUT_DIR, fileName) From 1b0cdd9058a30ef2222037744349fe7f9ec09e93 Mon Sep 17 00:00:00 2001 From: "Philip R. Kensche" Date: Fri, 22 Jul 2016 15:54:45 +0200 Subject: [PATCH 2/5] Added mutexes to avoid explicit setting of output fastq files and request for read group fastq compression. Added _PER_RG to COMPRESS_OUTPUTS and simplified GZIP[_PER_GZ] to GZRG in analogy to OPRG. 
--- src/main/java/picard/sam/SamToFastq.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/main/java/picard/sam/SamToFastq.java b/src/main/java/picard/sam/SamToFastq.java index 1ac52b7e7..d82601d19 100755 --- a/src/main/java/picard/sam/SamToFastq.java +++ b/src/main/java/picard/sam/SamToFastq.java @@ -84,20 +84,25 @@ public File INPUT; @Option(shortName = "F", doc = "Output FASTQ file (single-end fastq or, if paired, first end of the pair FASTQ).", - mutex = {"OUTPUT_PER_RG"}) + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"}) public File FASTQ; @Option(shortName = "F2", doc = "Output FASTQ file (if paired, second end of the pair FASTQ).", optional = true, - mutex = {"OUTPUT_PER_RG"}) + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"}) public File SECOND_END_FASTQ; - @Option(shortName = "FU", doc = "Output FASTQ file for unpaired reads; may only be provided in paired-FASTQ mode", optional = true, mutex = {"OUTPUT_PER_RG"}) + @Option(shortName = "FU", doc = "Output FASTQ file for unpaired reads; may only be provided in paired-FASTQ mode", optional = true, + mutex = {"OUTPUT_PER_RG", "COMPRESS_OUTPUTS_PER_RG"}) public File UNPAIRED_FASTQ; @Option(shortName = "OPRG", doc = "Output a FASTQ file per read group (two FASTQ files per read group if the group is paired).", optional = true, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) public boolean OUTPUT_PER_RG; + @Option(shortName = "GZRG", doc = "Compress output FASTQ files per read group using gzip and append a .gz extension to the file names.", + optional = false, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) + public Boolean COMPRESS_OUTPUTS_PER_RG = false; + @Option(shortName="RGT", doc = "The read group tag (PU or ID) to be used to output a FASTQ file per read group.") public String RG_TAG = "PU"; @@ -133,9 +138,6 @@ "If the original read is shorter than CLIPPING_MIN_LENGTH then the original read length will be maintained.") public int 
CLIPPING_MIN_LENGTH = 0; - @Option(shortName = "GZIP", doc = "Compress output FASTQ files using gzip and append a .gz extension to the file names.", optional = false) - public Boolean COMPRESS_OUTPUTS = false; - @Option(shortName = "R1_TRIM", doc = "The number of bases to trim from the beginning of read 1.") public int READ1_TRIM = 0; @@ -289,7 +291,7 @@ private File makeReadGroupFile(final SAMReadGroupRecord readGroup, final String fileName = IOUtil.makeFileNameSafe(fileName); if (preExtSuffix != null) fileName += preExtSuffix; fileName += ".fastq"; - if (COMPRESS_OUTPUTS) { + if (COMPRESS_OUTPUTS_PER_RG) { fileName += ".gz"; } From 2f17b2c72e8bc963adfb148969903544cf80cbaf Mon Sep 17 00:00:00 2001 From: "Philip R. Kensche" Date: Fri, 22 Jul 2016 16:01:56 +0200 Subject: [PATCH 3/5] Simplified expression. --- src/main/java/picard/sam/SamToFastq.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/java/picard/sam/SamToFastq.java b/src/main/java/picard/sam/SamToFastq.java index d82601d19..5fe90e92e 100755 --- a/src/main/java/picard/sam/SamToFastq.java +++ b/src/main/java/picard/sam/SamToFastq.java @@ -290,10 +290,7 @@ private File makeReadGroupFile(final SAMReadGroupRecord readGroup, final String } fileName = IOUtil.makeFileNameSafe(fileName); if (preExtSuffix != null) fileName += preExtSuffix; - fileName += ".fastq"; - if (COMPRESS_OUTPUTS_PER_RG) { - fileName += ".gz"; - } + fileName += COMPRESS_OUTPUTS_PER_RG ? ".fastq.gz" : ".fastq"; final File result = (OUTPUT_DIR != null) ? new File(OUTPUT_DIR, fileName) From 81c4d5404fa67a0b336fe7a6a2cfe8b518e84c08 Mon Sep 17 00:00:00 2001 From: "Philip R. Kensche" Date: Fri, 9 Sep 2016 09:45:09 +0200 Subject: [PATCH 4/5] Adapted short argument name. 
--- src/main/java/picard/sam/SamToFastq.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/picard/sam/SamToFastq.java b/src/main/java/picard/sam/SamToFastq.java index 5fe90e92e..9fb0b1798 100755 --- a/src/main/java/picard/sam/SamToFastq.java +++ b/src/main/java/picard/sam/SamToFastq.java @@ -99,7 +99,7 @@ optional = true, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) public boolean OUTPUT_PER_RG; - @Option(shortName = "GZRG", doc = "Compress output FASTQ files per read group using gzip and append a .gz extension to the file names.", + @Option(shortName = "GZOPRG", doc = "Compress output FASTQ files per read group using gzip and append a .gz extension to the file names.", optional = false, mutex = {"FASTQ", "SECOND_END_FASTQ", "UNPAIRED_FASTQ"}) public Boolean COMPRESS_OUTPUTS_PER_RG = false; From d29a267df65671ecf2d01c4884943baadde690fc Mon Sep 17 00:00:00 2001 From: "Philip R. Kensche" Date: Fri, 28 Apr 2017 10:23:39 +0200 Subject: [PATCH 5/5] Added test that output fastq suffixed by .gz will indeed be compressed (via magic number). 
---
 src/test/java/picard/sam/SamToFastqTest.java | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/src/test/java/picard/sam/SamToFastqTest.java b/src/test/java/picard/sam/SamToFastqTest.java
index c0464e961..834f6a665 100644
--- a/src/test/java/picard/sam/SamToFastqTest.java
+++ b/src/test/java/picard/sam/SamToFastqTest.java
@@ -38,6 +38,7 @@
 import picard.PicardException;
 
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -451,10 +452,51 @@ else if (record.getSecondOfPairFlag()) {
         }
     }
 
-    private File newTempFastqFile(final String filename) throws IOException {
+    private File newTempFastqFile(final String filename, final String suffix) throws IOException {
         if(filename == null) return null;
-        final File file = File.createTempFile(filename,".fastq");
+        final File file = File.createTempFile(filename, suffix);
         file.deleteOnExit();
         return file;
     }
+
+    private File newTempFastqFile(final String filename) throws IOException {
+        return newTempFastqFile(filename, ".fastq");
+    }
+
+    /**
+     * Converts a SAM file into two FASTQ outputs named with a .fastq.gz suffix, then checks that
+     * both outputs start with the gzip magic number and pass the regular FASTQ verification.
+     */
+    @Test(dataProvider = "okFiles")
+    public void testFileCompression(final String samFilename) throws IOException {
+        final File samFile = new File(TEST_DATA_DIR,samFilename);
+        final File pair1File = newTempFastqFile("pair1", ".fastq.gz");
+        final File pair2File = newTempFastqFile("pair2", ".fastq.gz");
+
+        convertFile(new String[]{
+                "INPUT=" + samFile.getAbsolutePath(),
+                "FASTQ=" + pair1File.getAbsolutePath(),
+                "SECOND_END_FASTQ=" + pair2File.getAbsolutePath()
+        });
+
+        verifyFileIsGzCompressed(pair1File);
+        verifyFileIsGzCompressed(pair2File);
+        // Content verification. Picard automatically recognizes the compression from the .gz suffix
+        // (htsjdk.samtools.util.IOUtil#openFileForReading(java.nio.file.Path)).
+        verifyFastq(pair1File, pair2File, samFile);
+    }
+
+    /**
+     * Asserts that the file starts with the two-byte gzip magic number (0x1f 0x8b).
+     */
+    private void verifyFileIsGzCompressed(final File file) throws IOException {
+        final byte[] expectedMagicNumber = { (byte) 0x1f, (byte) 0x8b};
+        final byte[] observedMagicNumber = new byte[expectedMagicNumber.length];
+        // try-with-resources closes the stream even when the read throws or the assertion fails.
+        try (FileInputStream fis = new FileInputStream(file)) {
+            final int bytesRead = fis.read(observedMagicNumber, 0, observedMagicNumber.length);
+            Assert.assertEquals(bytesRead, observedMagicNumber.length, "Too few bytes for a gzip header in " + file);
+        }
+        Assert.assertEquals(observedMagicNumber, expectedMagicNumber);
+    }
 }