diff --git a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java index 9ae9f1db9..3668fe671 100644 --- a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java +++ b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java @@ -31,6 +31,9 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintStream; +import java.nio.file.Files; import java.nio.file.Path; import java.util.Iterator; import java.util.LinkedHashMap; @@ -39,7 +42,7 @@ import java.util.regex.MatchResult; /** - * Reads a fasta index file (.fai), as generated by `samtools faidx`. + * Reads/writes a fasta index file (.fai), as generated by `samtools faidx`. */ public class FastaSequenceIndex implements Iterable { /** @@ -159,6 +162,27 @@ private void parseIndexFile(Path indexFile) { } /** + * Writes this index to the specified path. + * + * @param indexFile index file to output the index in the .fai format + * + * @throws IOException if an IO error occurs. + */ + public void write(final Path indexFile) throws IOException { + try (final PrintStream writer = new PrintStream(Files.newOutputStream(indexFile))) { + sequenceEntries.values().forEach(se -> + writer.println(String.join("\t", + se.getContig(), + String.valueOf(se.getSize()), + String.valueOf(se.getLocation()), + String.valueOf(se.getBasesPerLine()), + String.valueOf(se.getBytesPerLine())) + ) + ); + } + } + + /** * Does the given contig name have a corresponding entry? * @param contigName The contig name for which to search. * @return True if contig name is present; false otherwise. diff --git a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexCreator.java b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexCreator.java new file mode 100644 index 000000000..ee425ffd5 --- /dev/null +++ b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexCreator.java @@ -0,0 +1,180 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Daniel Gomez-Sanchez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package htsjdk.samtools.reference; + +import htsjdk.samtools.SAMException; +import htsjdk.samtools.SAMSequenceRecord; +import htsjdk.samtools.util.IOUtil; +import htsjdk.tribble.readers.AsciiLineReader; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Static methods to create an {@link FastaSequenceIndex}. + * + * @author Daniel Gomez-Sanchez (magicDGS) + */ +public final class FastaSequenceIndexCreator { + + // cannot be instantiated because it is an utility class + private FastaSequenceIndexCreator() {} + + /** + * Creates a FASTA .fai index for the provided FASTA. + * + * @param fastaFile the file to build the index from. + * @param overwrite if the .fai index already exists override it if {@code true}; otherwise, throws a {@link SAMException}. + * + * @throws SAMException if the fai file already exists or the file is malformed. + * @throws IOException if an IO error occurs. + */ + public static void create(final Path fastaFile, final boolean overwrite) throws IOException { + // get the index to write the file in + final Path indexFile = ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile); + if (!overwrite && Files.exists(indexFile)) { + // throw an exception if the file already exists + throw new SAMException("Index file " + indexFile + " already exists for " + fastaFile); + } + // build the index + final FastaSequenceIndex index = buildFromFasta(fastaFile); + index.write(indexFile); + } + + /** + * Builds a FastaSequenceIndex on the fly from a FASTA file. + * + * Note: this also alows to create an index for a compressed file, but does not generate the + * .gzi index required for use it with samtools. + * + * @param fastaFile the FASTA file. + * + * @return a fai index. + * + * @throws SAMException for formatting errors. + * @throws IOException if an IO error occurs. + */ + public static FastaSequenceIndex buildFromFasta(final Path fastaFile) throws IOException { + try(final AsciiLineReader in = new AsciiLineReader(IOUtil.openFileForReading(fastaFile))) { + + // sanity check reference format: + // 1. Non-empty file + // 2. Header name starts with > + String previous = in.readLine(); + if (previous == null) { + throw new SAMException("Cannot index empty file: " + fastaFile); + } else if (previous.charAt(0) != '>') { + throw new SAMException("Wrong sequence header: " + previous); + } + + // initialize the sequence index + int sequenceIndex = -1; + // the location should be kept before iterating over the rest of the lines + long location = in.getPosition(); + + // initialize an empty index and the entry builder to null + final FastaSequenceIndex index = new FastaSequenceIndex(); + FaiEntryBuilder entry = null; + + // read the lines two by two + for (String line = in.readLine(); previous != null; line = in.readLine()) { + // in this case, the previous line contains a header and the current line the first sequence + if (previous.charAt(0) == '>') { + // first entry should be skipped; otherwise it should be added to the index + if (entry != null) index.add(entry.build()); + // creates a new entry (and update sequence index) + entry = new FaiEntryBuilder(sequenceIndex++, previous, line, in.getLineTerminatorLength(), location); + } else if (line != null && line.charAt(0) == '>') { + // update the location, next iteration the sequence will be handled + location = in.getPosition(); + } else if (line != null && !line.isEmpty()) { + // update in case it is not a blank-line + entry.updateWithSequence(line, in.getLineTerminatorLength()); + } + // set the previous to the current line + previous = line; + } + // add the last entry + index.add(entry.build()); + + // and return the index + return index; + } + } + + // utility class for building the FastaSequenceIndexEntry + private static class FaiEntryBuilder { + private final int index; + private final String contig; + private final long location; + // the bytes per line is the bases per line plus the length of the end of the line + private final int basesPerLine; + private final int endOfLineLength; + + // the size is updated for each line in the input using updateWithSequence + private long size; + // flag to check if the supposedly last line was already reached + private boolean lessBasesFound; + + private FaiEntryBuilder(final int index, final String header, final String firstSequenceLine, final int endOfLineLength, final long location) { + if (header == null || header.charAt(0) != '>') { + throw new SAMException("Wrong sequence header: " + header); + } else if (firstSequenceLine == null) { + throw new SAMException("Empty sequences could not be indexed"); + } + this.index = index; + // parse the contig name (without the starting '>' and truncating white-spaces) + this.contig = SAMSequenceRecord.truncateSequenceName(header.substring(1).trim()); + this.location = location; + this.basesPerLine = firstSequenceLine.length(); + this.endOfLineLength = endOfLineLength; + this.size = firstSequenceLine.length(); + this.lessBasesFound = false; + } + + private void updateWithSequence(final String sequence, final int endOfLineLength) { + if (this.endOfLineLength != endOfLineLength) { + throw new SAMException(String.format("Different end of line for the same sequence was found.")); + } + if (sequence.length() > basesPerLine) { + throw new SAMException(String.format("Sequence line for {} was longer than the expected length ({}): {}", + contig, basesPerLine, sequence)); + } else if (sequence.length() < basesPerLine) { + if (lessBasesFound) { + throw new SAMException(String.format("Only last line could have less than {} bases for '{}' sequence, but at least two are different. Last sequence line: {}", + basesPerLine, contig, sequence)); + } + lessBasesFound = true; + } + // update size + this.size += sequence.length(); + } + + private FastaSequenceIndexEntry build() { + return new FastaSequenceIndexEntry(contig, location, size, basesPerLine, basesPerLine + endOfLineLength, index); + } + } +} diff --git a/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java b/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java index 5a8703381..5c318782e 100644 --- a/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java +++ b/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java @@ -136,18 +136,14 @@ public static boolean canCreateIndexedFastaReader(final File fastaFile) { } private static Path findFastaIndex(Path fastaFile) { - Path indexFile = getFastaIndexFileName(fastaFile); + Path indexFile = ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile); if (!Files.exists(indexFile)) return null; return indexFile; } - private static Path getFastaIndexFileName(Path fastaFile) { - return fastaFile.resolveSibling(fastaFile.getFileName() + ".fai"); - } - private static Path findRequiredFastaIndexFile(Path fastaFile) throws FileNotFoundException { Path ret = findFastaIndex(fastaFile); - if (ret == null) throw new FileNotFoundException(getFastaIndexFileName(fastaFile) + " not found."); + if (ret == null) throw new FileNotFoundException(ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile) + " not found."); return ret; } diff --git a/src/main/java/htsjdk/samtools/reference/ReferenceSequenceFileFactory.java b/src/main/java/htsjdk/samtools/reference/ReferenceSequenceFileFactory.java index 2b0b1e7fc..654706819 100644 --- a/src/main/java/htsjdk/samtools/reference/ReferenceSequenceFileFactory.java +++ b/src/main/java/htsjdk/samtools/reference/ReferenceSequenceFileFactory.java @@ -163,4 +163,13 @@ public static String getFastaExtension(final Path path) { .orElseGet(() -> {throw new IllegalArgumentException("File is not a supported reference file type: " + path.toAbsolutePath());}); } + /** + * Returns the index name for a FASTA file. + * + * @param fastaFile the reference sequence file path. + */ + public static Path getFastaIndexFileName(Path fastaFile) { + return fastaFile.resolveSibling(fastaFile.getFileName() + ".fai"); + } + } diff --git a/src/test/java/htsjdk/samtools/reference/FastaSequenceIndexCreatorTest.java b/src/test/java/htsjdk/samtools/reference/FastaSequenceIndexCreatorTest.java new file mode 100644 index 000000000..193770abd --- /dev/null +++ b/src/test/java/htsjdk/samtools/reference/FastaSequenceIndexCreatorTest.java @@ -0,0 +1,90 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2017 Daniel Gomez-Sanchez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package htsjdk.samtools.reference; + +import htsjdk.HtsjdkTest; +import htsjdk.samtools.util.IOUtil; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.nio.file.Files; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * @author Daniel Gomez-Sanchez (magicDGS) + */ +public class FastaSequenceIndexCreatorTest extends HtsjdkTest { + private static File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools/reference"); + + + @DataProvider(name = "indexedSequences") + public Object[][] getIndexedSequences() { + return new Object[][]{ + {new File(TEST_DATA_DIR, "Homo_sapiens_assembly18.trimmed.fasta")}, + {new File(TEST_DATA_DIR, "Homo_sapiens_assembly18.trimmed.fasta.gz")}, + {new File(TEST_DATA_DIR, "header_with_white_space.fasta")}, + {new File(TEST_DATA_DIR, "crlf.fasta")} + }; + } + + @Test(dataProvider = "indexedSequences") + public void testBuildFromFasta(final File indexedFile) throws Exception { + final FastaSequenceIndex original = new FastaSequenceIndex(new File(indexedFile.getAbsolutePath() + ".fai")); + final FastaSequenceIndex build = FastaSequenceIndexCreator.buildFromFasta(indexedFile.toPath()); + Assert.assertEquals(original, build); + } + + @Test(dataProvider = "indexedSequences") + public void testCreate(final File indexedFile) throws Exception { + // copy the file to index + final File tempDir = IOUtil.createTempDir("FastaSequenceIndexCreatorTest", "testCreate"); + final File copied = new File(tempDir, indexedFile.getName()); + copied.deleteOnExit(); + Files.copy(indexedFile.toPath(), copied.toPath()); + + // create the index for the copied file + FastaSequenceIndexCreator.create(copied.toPath(), false); + + // test if the expected .fai and the created one are the same + final File expectedFai = new File(indexedFile.getAbsolutePath() + ".fai"); + final File createdFai = new File(copied.getAbsolutePath() + ".fai"); + + // read all the files and compare line by line + try(final Stream expected = Files.lines(expectedFai.toPath()); + final Stream created = Files.lines(createdFai.toPath())) { + final List expectedLines = expected.filter(String::isEmpty).collect(Collectors.toList()); + final List createdLines = created.filter(String::isEmpty).collect(Collectors.toList()); + Assert.assertEquals(expectedLines, createdLines); + } + + // load the tmp index and check that both are the same + Assert.assertEquals(new FastaSequenceIndex(createdFai), new FastaSequenceIndex(expectedFai)); + } + +} \ No newline at end of file diff --git a/src/test/java/htsjdk/samtools/reference/FastaSequenceIndexTest.java b/src/test/java/htsjdk/samtools/reference/FastaSequenceIndexTest.java index e24e28874..c6fa1384a 100644 --- a/src/test/java/htsjdk/samtools/reference/FastaSequenceIndexTest.java +++ b/src/test/java/htsjdk/samtools/reference/FastaSequenceIndexTest.java @@ -30,9 +30,15 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; +import java.io.FileReader; +import java.nio.file.Files; import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Test the fasta sequence index reader. @@ -254,4 +260,30 @@ public void testSpecialCharacters(FastaSequenceIndex specialCharactersIndex) { Assert.assertEquals(ent.getBasesPerLine(),70,"Contig file:gi|17981852|ref|NC_001807.4| bases per line is not correct"); Assert.assertEquals(ent.getBytesPerLine(),71,"Contig file:gi|17981852|ref|NC_001807.4| bytes per line is not correct"); } + + @Test + public void testWrite() throws Exception { + // gets the original file and index + final File originalFile = new File(TEST_DATA_DIR, "testing.fai"); + final FastaSequenceIndex originalIndex = new FastaSequenceIndex(originalFile); + + // write the index to a temp file and test if files are the same + final File fileToWrite = File.createTempFile("testing.toWrite", "fai"); + fileToWrite.deleteOnExit(); + originalIndex.write(fileToWrite.toPath()); + + // read all the files and compare line by line + try(final Stream original = Files.lines(originalFile.toPath()); + final Stream written = Files.lines(fileToWrite.toPath())) { + final List originalLines = original.filter(s -> ! s.isEmpty()).collect(Collectors.toList()); + final List actualLines = written.filter(s -> !s.isEmpty()).collect(Collectors.toList()); + Assert.assertEquals(actualLines, originalLines); + } + + // load the tmp index and check that both are the same + final FastaSequenceIndex writtenIndex = new FastaSequenceIndex(fileToWrite); + Assert.assertEquals(writtenIndex, originalIndex); + } + + } diff --git a/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz b/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz new file mode 100644 index 000000000..aa8ef591b Binary files /dev/null and b/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz differ diff --git a/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz.fai b/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz.fai new file mode 100644 index 000000000..04a438b94 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz.fai @@ -0,0 +1,2 @@ +chrM 16571 6 60 61 +chr20 1000000 16861 60 61 diff --git a/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz.gzi b/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz.gzi new file mode 100644 index 000000000..a53660233 Binary files /dev/null and b/src/test/resources/htsjdk/samtools/reference/Homo_sapiens_assembly18.trimmed.fasta.gz.gzi differ diff --git a/src/test/resources/htsjdk/samtools/reference/crlf.fasta b/src/test/resources/htsjdk/samtools/reference/crlf.fasta new file mode 100644 index 000000000..70c878536 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/reference/crlf.fasta @@ -0,0 +1,4 @@ +>a test CR+LF +ACTG +>b test CR+LF +ACTG diff --git a/src/test/resources/htsjdk/samtools/reference/crlf.fasta.fai b/src/test/resources/htsjdk/samtools/reference/crlf.fasta.fai new file mode 100644 index 000000000..923386e04 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/reference/crlf.fasta.fai @@ -0,0 +1,2 @@ +a 4 15 4 6 +b 4 36 4 5 diff --git a/src/test/resources/htsjdk/samtools/reference/header_with_white_space.fasta b/src/test/resources/htsjdk/samtools/reference/header_with_white_space.fasta new file mode 100644 index 000000000..24cff02a9 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/reference/header_with_white_space.fasta @@ -0,0 +1,4 @@ +>a test white space +ACTG +>b test whitespace +ACTG diff --git a/src/test/resources/htsjdk/samtools/reference/header_with_white_space.fasta.fai b/src/test/resources/htsjdk/samtools/reference/header_with_white_space.fasta.fai new file mode 100644 index 000000000..bb15aa584 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/reference/header_with_white_space.fasta.fai @@ -0,0 +1,2 @@ +a 4 20 4 5 +b 4 44 4 5