bclsForOneTile, final int[] outputLengths,
final BclQualityEvaluationStrategy bclQualityEvaluationStrategy, final boolean seekable) {
+ super(outputLengths, bclQualityEvaluationStrategy);
try {
- this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
- this.outputLengths = outputLengths;
-
- int cycles = 0;
- for (final int outputLength : outputLengths) {
- cycles += outputLength;
- }
- this.streams = new InputStream[cycles];
- this.streamFiles = new File[cycles];
- this.numClustersPerCycle = new int[cycles];
final ByteBuffer byteBuffer = ByteBuffer.allocate(HEADER_SIZE);
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
@@ -173,14 +153,8 @@ private static long getNumberOfClusters(final String filePath, final InputStream
public BclReader(final File bclFile, final BclQualityEvaluationStrategy bclQualityEvaluationStrategy, final boolean seekable) {
+ super(new int[]{1}, bclQualityEvaluationStrategy);
try {
-
- this.outputLengths = new int[]{1};
- this.streams = new InputStream[1];
- this.streamFiles = new File[1];
- this.numClustersPerCycle = new int[]{1};
- this.bclQualityEvaluationStrategy = bclQualityEvaluationStrategy;
-
final ByteBuffer byteBuffer = ByteBuffer.allocate(HEADER_SIZE);
final String filePath = bclFile.getName();
final boolean isGzip = filePath.endsWith(".gz");
@@ -195,7 +169,7 @@ public BclReader(final File bclFile, final BclQualityEvaluationStrategy bclQuali
byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
this.numClustersPerCycle[0] = byteBuffer.getInt();
if (!isBgzf && !isGzip) {
- assertProperFileStructure(bclFile, this.numClustersPerCycle[0], stream);
+ assertProperFileStructure(bclFile, this.getNumClusters(), stream);
}
this.streams[0] = stream;
this.streamFiles[0] = bclFile;
@@ -212,38 +186,6 @@ void assertProperFileStructure(final File file, final int numClusters, final Inp
}
}
- InputStream open(final File file, final boolean seekable, final boolean isGzip, final boolean isBgzf) throws IOException {
- final String filePath = file.getAbsolutePath();
-
- try {
- // Open up a buffered stream to read from the file and optionally wrap it in a gzip stream
- // if necessary
- if (isBgzf) {
- // Only BlockCompressedInputStreams can seek, and only if they are fed a SeekableStream.
- return new BlockCompressedInputStream(IOUtil.maybeBufferedSeekableStream(file));
- } else if (isGzip) {
- if (seekable) {
- throw new IllegalArgumentException(
- String.format("Cannot create a seekable reader for gzip bcl: %s.", filePath)
- );
- }
- return (IOUtil.maybeBufferInputStream(new GZIPInputStream(new FileInputStream(file), Defaults.BUFFER_SIZE / 2),
- Defaults.BUFFER_SIZE / 2));
- } else {
- if (seekable) {
- throw new IllegalArgumentException(
- String.format("Cannot create a seekable reader for provided bcl: %s.", filePath)
- );
- }
- return IOUtil.maybeBufferInputStream(new FileInputStream(file));
- }
- } catch (final FileNotFoundException fnfe) {
- throw new PicardException("File not found: (" + filePath + ")", fnfe);
- } catch (final IOException ioe) {
- throw new PicardException("Error reading file: (" + filePath + ")", ioe);
- }
- }
-
public void close() {
for (final InputStream stream : this.streams) {
CloserUtil.close(stream);
@@ -258,14 +200,10 @@ public boolean hasNext() {
return queue != null;
}
- private long getNumClusters() {
- return numClustersPerCycle[0];
- }
-
protected void assertProperFileStructure(final File file) {
final long elementsInFile = file.length() - HEADER_SIZE;
- if (numClustersPerCycle[0] != elementsInFile) {
- throw new PicardException("Expected " + numClustersPerCycle[0] + " in file " + file.getAbsolutePath() + " but found " + elementsInFile);
+ if (getNumClusters() != elementsInFile) {
+ throw new PicardException("Expected " + getNumClusters() + " in file " + file.getAbsolutePath() + " but found " + elementsInFile);
}
}
@@ -306,14 +244,7 @@ void advance() {
return;
}
- if (readByte == 0) {
- //NO CALL, don't confuse with an A call
- data.bases[read][cycle] = (byte) '.';
- data.qualities[read][cycle] = (byte) 2;
- } else {
- data.bases[read][cycle] = BASE_LOOKUP[readByte & BASE_MASK];
- data.qualities[read][cycle] = bclQualityEvaluationStrategy.reviseAndConditionallyLogQuality((byte) (readByte >>> 2));
- }
+ decodeBasecall(data, read, cycle, readByte);
totalCycleCount++;
} catch (final IOException ioe) {
throw new RuntimeIOException(ioe);
diff --git a/src/main/java/picard/illumina/parser/readers/CbclReader.java b/src/main/java/picard/illumina/parser/readers/CbclReader.java
new file mode 100644
index 000000000..8557a394a
--- /dev/null
+++ b/src/main/java/picard/illumina/parser/readers/CbclReader.java
@@ -0,0 +1,305 @@
+package picard.illumina.parser.readers;
+
+import htsjdk.samtools.util.CloseableIterator;
+import htsjdk.samtools.util.CloserUtil;
+import htsjdk.samtools.util.Log;
+import htsjdk.samtools.util.ProgressLogger;
+import htsjdk.samtools.util.RuntimeIOException;
+import picard.illumina.parser.BclData;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * ------------------------------------- CBCL Header -----------------------------------
+ * Bytes 0 - 1 Version number, current version is 1 unsigned 16 bits little endian integer
+ * Bytes 2 - 5 Header size unsigned 32 bits little endian integer
+ * Byte 6 Number of bits per basecall unsigned
+ * Byte 7 Number of bits per q-score unsigned
+ *
+ * q-val mapping info
+ * Bytes 0-3 Number of bins (B), zero indicates no mapping
+ * B pairs of 4-byte values (if B > 0): {from, to}, {from, to}, ... where "from" is the quality score bin and "to" is the mapped quality score
+ *
+ * Number of tile records unsigned 32bits little endian integer
+ *
+ * gzip virtual file offsets, one record per tile
+ * Bytes 0-3: tile number
+ * Bytes 4-7 Number of clusters that were written into the current block (required due to bit-packed q-scores)
+ * unsigned 32 bit integer
+ *
+ * Bytes 8-11 Uncompressed block size of the tile data (useful for sanity check when excluding non-PF clusters)
+ * unsigned 32 bit integer
+ *
+ * Bytes 12-15 Compressed block size of the tile data unsigned 32 bit integer
+ *
+ * non-PF clusters excluded flag 1: non-PF clusters are excluded 0: non-PF clusters are included
+ *
+ * ------------------------------------- CBCL File Content -----------------------------------
+ *
+ * N blocks of gzip files, where N is the number of tiles.
+ *
+ * Each block consists of C number of basecall, quality score pairs where C is the number of clusters for the given tile.
+ *
+ * Each basecall, quality score pair has the following format (assuming 2 bits are used for the basecalls):
+ * Bits 0-1: Basecalls (respectively [A, C, G, T] for [00, 01, 10, 11])
+ * Bits 2 and up: Quality score (unsigned Q bit little endian integer where Q is the number of bits per q-score).
+ * For a two bit quality score, this is two clusters per byte where the bottom 4 bits are the first cluster and the
+ * higher 4 bits are the second cluster.
+ **/
+
+public class CbclReader extends BaseBclReader implements CloseableIterator {
+
+ private static Log log = Log.getInstance(CbclReader.class);
+
+ private ByteBuffer[] cachedTile;
+
+ private int[] currentTile;
+
+ private List queue = new ArrayList<>();
+
+ private final CycleData[] cycleData;
+ private Map filters;
+ private Map cachedFilter = new HashMap<>();
+
+ void addFilters(Map filters) {
+ this.filters = filters;
+ }
+
+ private static final int INITIAL_HEADER_SIZE = 6;
+
+ public CbclReader(final List cbcls, final int[] outputLengths) {
+ super(outputLengths);
+ cycleData = new CycleData[cycles];
+ cachedTile = new ByteBuffer[cycles];
+ currentTile = new int[cycles];
+
+ try {
+ final ByteBuffer byteBuffer = ByteBuffer.allocate(INITIAL_HEADER_SIZE);
+ byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+ for (int i = 0; i < cycles; i++) {
+ currentTile[i] = 0;
+ final File bclFile = cbcls.get(i);
+
+ final InputStream stream = open(bclFile, false, false, false);
+ int read = stream.read(byteBuffer.array());
+
+ //we need to read the first 6 bytes to determine the header size
+ if (read != INITIAL_HEADER_SIZE) {
+ close();
+ throw new RuntimeIOException(String.format("BCL %s has invalid header structure.", bclFile.getAbsoluteFile()));
+ }
+
+ short version = byteBuffer.getShort();
+ int headerSize = byteBuffer.getInt();
+
+ final ByteBuffer headerBuffer = ByteBuffer.allocate(headerSize - INITIAL_HEADER_SIZE);
+ headerBuffer.order(ByteOrder.LITTLE_ENDIAN);
+
+ read = stream.read(headerBuffer.array());
+ if (read != headerSize - INITIAL_HEADER_SIZE) {
+ close();
+ throw new RuntimeIOException(String.format("BCL %s has invalid header structure.", bclFile.getAbsoluteFile()));
+ }
+
+ byte bitsPerBasecall = headerBuffer.get();
+ byte bitsPerQualityScore = headerBuffer.get();
+
+ if (bitsPerBasecall != 2 && bitsPerBasecall != bitsPerQualityScore) {
+ close();
+ throw new RuntimeIOException("CBCL data not encoded in nibbles. (not currently supported)");
+ }
+
+ int numberOfBins = headerBuffer.getInt();
+
+ byte[] qualityBins = new byte[numberOfBins];
+ //each bin has a pair of 4 byte mappings
+ for (int j = 0; j < numberOfBins; j++) {
+ headerBuffer.getInt(); // first int is "from" value, which we don't need
+ int to = headerBuffer.getInt();
+ qualityBins[j] = (byte) to;
+ }
+
+ int numTiles = headerBuffer.getInt();
+ TileData[] tileInfo = new TileData[numTiles];
+ for (int j = 0; j < numTiles; j++) {
+ int tileNum = headerBuffer.getInt();
+ int numClustersInTile = headerBuffer.getInt();
+ int uncompressedBlockSize = headerBuffer.getInt();
+ int compressedBlockSize = headerBuffer.getInt();
+ tileInfo[j] = new TileData(tileNum, numClustersInTile, uncompressedBlockSize, compressedBlockSize);
+ }
+
+ boolean pfExcluded = headerBuffer.get() == 1;
+ cycleData[i] = new CycleData(version, headerSize, bitsPerBasecall, bitsPerQualityScore, numberOfBins, qualityBins, numTiles, tileInfo, pfExcluded);
+ this.streams[i] = stream;
+ this.streamFiles[i] = bclFile;
+ byteBuffer.clear();
+ headerBuffer.clear();
+ }
+
+
+ } catch (final IOException ioe) {
+ throw new RuntimeIOException(ioe);
+ }
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (queue.isEmpty()) {
+ advance();
+ }
+ return !queue.isEmpty();
+ }
+
+ public BclData next() {
+ if (queue.isEmpty()) {
+ advance();
+ }
+
+ final BclData data = queue.get(0);
+ queue.remove(0);
+ return data;
+ }
+
+ @Override
+ public void close() {
+ for (final InputStream stream : this.streams) {
+ CloserUtil.close(stream);
+ }
+ }
+
+ private void advance() {
+ int totalCycleCount = 0;
+ BclData data = new BclData(outputLengths);
+
+ for (int read = 0; read < outputLengths.length; read++) {
+ for (int cycle = 0; cycle < outputLengths[read]; cycle++) {
+ try {
+ CycleData currentCycleData = cycleData[totalCycleCount];
+ TileData currentTileData = currentCycleData.tileInfo[currentTile[totalCycleCount]];
+ try {
+ if (cachedTile[totalCycleCount] == null) {
+ if (!cachedFilter.containsKey(currentTileData.tileNum)) {
+ cacheFilter(currentTileData);
+ }
+ cacheTile(totalCycleCount, currentTileData, currentCycleData);
+ }
+ } catch (IOException e) {
+ // when logging the error, increment cycle by 1, since totalCycleCount is zero-indexed but Illumina directories are 1-indexed.
+ throw new IOException(String.format("Error while reading from BCL file for cycle %d. Offending file on disk is %s",
+ (totalCycleCount + 1), this.streamFiles[totalCycleCount].getAbsolutePath()), e);
+ }
+
+ if (!cachedTile[totalCycleCount].hasRemaining()) {
+ //on to the next tile
+ currentTile[totalCycleCount]++;
+ //if past the last tile then return
+ if (currentTile[totalCycleCount] > currentCycleData.tileInfo.length - 1) {
+ return;
+ }
+ currentTileData = currentCycleData.tileInfo[currentTile[totalCycleCount]];
+ if (!cachedFilter.containsKey(currentTileData.tileNum)) {
+ cacheFilter(currentTileData);
+ }
+ cacheTile(totalCycleCount, currentTileData, currentCycleData);
+ }
+
+ int singleByte = cachedTile[totalCycleCount].get();
+
+ decodeQualityBinnedBasecall(data, read, cycle, singleByte, currentCycleData);
+
+ totalCycleCount++;
+ } catch (final IOException ioe) {
+ throw new RuntimeIOException(ioe);
+ }
+
+ }
+ }
+ this.queue.add(data);
+ }
+
+ private void cacheFilter(TileData currentTileData) {
+ boolean[] filterValues = new boolean[currentTileData.numClustersInTile];
+ FilterFileReader reader = new FilterFileReader(filters.get(currentTileData.tileNum));
+ int count = 0;
+ while (reader.hasNext()) {
+ filterValues[count] = reader.next();
+ count++;
+ }
+ cachedFilter.put(currentTileData.tileNum, filterValues);
+ }
+
+ private void cacheTile(int totalCycleCount, TileData tileData, CycleData currentCycleData) throws IOException {
+ ByteBuffer tileByteBuffer = ByteBuffer.allocate(tileData.compressedBlockSize);
+ //we are going to explode the nibbles in to bytes to make PF filtering easier
+ ByteBuffer tempUncompressedByteBuffer = ByteBuffer.allocate(tileData.uncompressedBlockSize);
+ ByteBuffer uncompressedByteBuffer = ByteBuffer.allocate(tileData.uncompressedBlockSize);
+ ByteBuffer unNibbledByteBuffer = ByteBuffer.allocate(tileData.uncompressedBlockSize * 2);
+
+ tileByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+ uncompressedByteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+
+ // Read the whole compressed block into a buffer, then sanity check the length
+ int readBytes = this.streams[totalCycleCount].read(tileByteBuffer.array());
+ if (readBytes != tileData.compressedBlockSize) {
+ throw new IOException(String.format("Error while reading from BCL file for cycle %d. Offending file on disk is %s",
+ (totalCycleCount + 1), this.streamFiles[totalCycleCount].getAbsolutePath()));
+ }
+
+ // Uncompress the data from the buffer we just wrote - use gzip input stream to write to uncompressed buffer
+ ByteArrayInputStream byteInputStream = new ByteArrayInputStream(tileByteBuffer.array());
+ if (cachedTile[totalCycleCount] != null) {
+ //clear the old cache
+ cachedTile[totalCycleCount] = null;
+ }
+ GZIPInputStream gzipInputStream = new GZIPInputStream(byteInputStream, uncompressedByteBuffer.capacity());
+ int read = 0;
+ int totalRead = 0;
+ while ((read = gzipInputStream.read(tempUncompressedByteBuffer.array(), 0, tempUncompressedByteBuffer.capacity())) != -1) {
+ uncompressedByteBuffer.put(tempUncompressedByteBuffer.array(), 0, read);
+ totalRead += read;
+ }
+ if (totalRead != tileData.uncompressedBlockSize) {
+ throw new IOException(String.format("Error while decompressing from BCL file for cycle %d. Offending file on disk is %s",
+ (totalCycleCount + 1), this.streamFiles[totalCycleCount].getAbsolutePath()));
+ }
+
+ // Read uncompressed data from the buffer and expand each nibble into a full byte for ease of use
+ uncompressedByteBuffer.flip();
+ while (uncompressedByteBuffer.hasRemaining()) {
+ byte singleByte = uncompressedByteBuffer.get();
+ unNibbledByteBuffer.put((byte) (singleByte & 0x0f));
+ unNibbledByteBuffer.put((byte) ((singleByte >> 4) & 0x0f));
+ }
+ gzipInputStream.close();
+
+ // Write buffer contents to cached tile array
+ unNibbledByteBuffer.flip();
+ //if nonPF reads are included we need to strip them out
+ if (!currentCycleData.pfExcluded) {
+ ByteBuffer uncompressedFilteredByteBuffer = ByteBuffer.allocate(tileData.uncompressedBlockSize * 2);
+ FilterFileReader reader = new FilterFileReader(filters.get(tileData.tileNum));
+ while (reader.hasNext()) {
+ byte readByte = unNibbledByteBuffer.get();
+ if (reader.next()) {
+ uncompressedFilteredByteBuffer.put(readByte);
+ }
+ }
+ uncompressedFilteredByteBuffer.flip();
+ cachedTile[totalCycleCount] = uncompressedFilteredByteBuffer.slice();
+ } else {
+ cachedTile[totalCycleCount] = unNibbledByteBuffer;
+ }
+ }
+
+}
diff --git a/src/test/java/picard/illumina/parser/readers/CbclReaderTest.java b/src/test/java/picard/illumina/parser/readers/CbclReaderTest.java
new file mode 100644
index 000000000..80f9956f8
--- /dev/null
+++ b/src/test/java/picard/illumina/parser/readers/CbclReaderTest.java
@@ -0,0 +1,52 @@
+package picard.illumina.parser.readers;
+
+import org.testng.Assert;
+import org.testng.annotations.Test;
+import picard.illumina.parser.BclData;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+public class CbclReaderTest {
+
+ public static final File TestDataDir = new File("testdata/picard/illumina/readerTests");
+ public static final File PASSING_CBCL_C99_1 = new File(TestDataDir, "C99.1.cbcl");
+ public static final File PASSING_CBCL_C100_1 = new File(TestDataDir, "C100.1.cbcl");
+ public static final File TILE_1101_FILTER = new File(TestDataDir, "tile_1101.filter");
+ public static final File TILE_1102_FILTER = new File(TestDataDir, "tile_1102.filter");
+
+ public static final char[] expectedBases = new char[]{
+ 'G','G','C','C','G','A','A','G','T','A','C','C','C','T','G','A'
+ };
+
+ public static final int[] expectedQuals = new int[]{
+ 37,37,37,37,37,37,37,12,37,37,37,37,37,37,12,37
+ };
+
+ @Test
+ public void testReadValidFile() {
+ final CbclReader reader = new CbclReader(Arrays.asList(PASSING_CBCL_C99_1, PASSING_CBCL_C100_1), new int[] {2});
+ Map filters = new HashMap<>();
+ filters.put(1101, TILE_1101_FILTER);
+ filters.put(1102, TILE_1102_FILTER);
+ reader.addFilters(filters);
+
+ int i = 0;
+ while (reader.hasNext()) {
+ final BclData bv = reader.next();
+ for (int cluster = 0; cluster < bv.bases.length; cluster++) {
+ for (int cycle = 0; cycle < bv.bases[cluster].length; cycle++) {
+ String actual = new String(new byte[]{bv.bases[cluster][cycle]});
+ String expected = new String(new char[]{expectedBases[i]});
+ Assert.assertEquals(actual, expected, "For cluster " + cluster + " cycle " + cycle + ",");
+ Assert.assertEquals(bv.qualities[cluster][cycle], expectedQuals[i], "For cluster " + cluster + " cycle " + cycle + ",");
+ i++;
+ }
+ }
+ }
+ Assert.assertEquals(i, expectedBases.length);
+ reader.close();
+ }
+}
diff --git a/testdata/picard/illumina/readerTests/C100.1.cbcl b/testdata/picard/illumina/readerTests/C100.1.cbcl
new file mode 100644
index 000000000..f776a938e
Binary files /dev/null and b/testdata/picard/illumina/readerTests/C100.1.cbcl differ
diff --git a/testdata/picard/illumina/readerTests/C99.1.cbcl b/testdata/picard/illumina/readerTests/C99.1.cbcl
new file mode 100644
index 000000000..002cd52e5
Binary files /dev/null and b/testdata/picard/illumina/readerTests/C99.1.cbcl differ
diff --git a/testdata/picard/illumina/readerTests/tile_1101.filter b/testdata/picard/illumina/readerTests/tile_1101.filter
new file mode 100644
index 000000000..445bc582e
Binary files /dev/null and b/testdata/picard/illumina/readerTests/tile_1101.filter differ
diff --git a/testdata/picard/illumina/readerTests/tile_1102.filter b/testdata/picard/illumina/readerTests/tile_1102.filter
new file mode 100644
index 000000000..445bc582e
Binary files /dev/null and b/testdata/picard/illumina/readerTests/tile_1102.filter differ