diff --git a/src/main/java/htsjdk/samtools/util/SequenceUtil.java b/src/main/java/htsjdk/samtools/util/SequenceUtil.java index d2fb861a7..3108cee0d 100644 --- a/src/main/java/htsjdk/samtools/util/SequenceUtil.java +++ b/src/main/java/htsjdk/samtools/util/SequenceUtil.java @@ -442,33 +442,6 @@ public static int countMismatches(final SAMRecord read, final byte[] referenceBa } /** - * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather - * than byte[]. This is because GATK needs it this way. - *

- * TODO: Remove this method when GATK map method is changed to take refseq as byte[]. - * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version. - */ - @Deprecated - private static int countMismatches(final SAMRecord read, final char[] referenceBases, final int referenceOffset) { - int mismatches = 0; - - final byte[] readBases = read.getReadBases(); - - for (final AlignmentBlock block : read.getAlignmentBlocks()) { - final int readBlockStart = block.getReadStart() - 1; - final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset; - final int length = block.getLength(); - - for (int i = 0; i < length; ++i) { - if (!basesEqual(readBases[readBlockStart + i], StringUtil.charToByte(referenceBases[referenceBlockStart + i]))) { - ++mismatches; - } - } - } - return mismatches; - } - - /** * Calculates the sum of qualities for mismatched bases in the read. * * @param referenceBases Array of ASCII bytes in which the 0th position in the array corresponds @@ -537,41 +510,6 @@ public static int sumQualitiesOfMismatches(final SAMRecord read, final byte[] re return qualities; } - /** - * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather - * than byte[]. This is because GATK needs it this way. - *

- * TODO: Remove this method when GATK map method is changed to take refseq as byte[]. - * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version. - */ - @Deprecated - public static int sumQualitiesOfMismatches(final SAMRecord read, final char[] referenceBases, - final int referenceOffset) { - int qualities = 0; - - final byte[] readBases = read.getReadBases(); - final byte[] readQualities = read.getBaseQualities(); - - if (read.getAlignmentStart() <= referenceOffset) { - throw new IllegalArgumentException("read.getAlignmentStart(" + read.getAlignmentStart() + - ") <= referenceOffset(" + referenceOffset + ")"); - } - - for (final AlignmentBlock block : read.getAlignmentBlocks()) { - final int readBlockStart = block.getReadStart() - 1; - final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset; - final int length = block.getLength(); - - for (int i = 0; i < length; ++i) { - if (!basesEqual(readBases[readBlockStart + i], StringUtil.charToByte(referenceBases[referenceBlockStart + i]))) { - qualities += readQualities[readBlockStart + i]; - } - } - } - - return qualities; - } - public static int countInsertedBases(final Cigar cigar) { int ret = 0; for (final CigarElement element : cigar.getCigarElements()) { @@ -658,26 +596,6 @@ public static int calculateSamNmTagFromCigar(final SAMRecord record) { return samNm; } - - /** - * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather - * than byte[]. This is because GATK needs it this way. - *

- * TODO: Remove this method when GATK map method is changed to take refseq as byte[]. - * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version. - */ - @Deprecated - public static int calculateSamNmTag(final SAMRecord read, final char[] referenceBases, - final int referenceOffset) { - int samNm = countMismatches(read, referenceBases, referenceOffset); - for (final CigarElement el : read.getCigar().getCigarElements()) { - if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) { - samNm += el.getLength(); - } - } - return samNm; - } - /** Returns the complement of a single byte. */ public static byte complement(final byte b) { switch (b) { @@ -972,10 +890,11 @@ public static String calculateMD5String(final byte[] data, final int offset, fin /** * Calculate MD and NM similarly to Samtools, except that N->N is a match. * - * @param record - * @param ref - * @param calcMD - * @return + * @param record Input record for which to calculate NM and MD. + * The appropriate tags will be added/updated in the record + * @param ref The reference bases for the sequence to which the record is mapped + * @param calcMD A flag indicating whether to update the MD tag in the record + * @param calcNM A flag indicating whether to update the NM tag in the record */ public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref, final boolean calcMD, final boolean calcNM) { @@ -985,66 +904,63 @@ public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref final Cigar cigar = record.getCigar(); final List cigarElements = cigar.getCigarElements(); final byte[] seq = record.getReadBases(); - final int start = record.getAlignmentStart() - 1; - int i, x, y, u = 0; - int nm = 0; - final StringBuilder str = new StringBuilder(); - - final int size = cigarElements.size(); - for (i = y = 0, x = start; i < size; ++i) { - final CigarElement ce = cigarElements.get(i); - int j; - final int length = ce.getLength(); + final int alignmentStart = record.getAlignmentStart() - 1; + int cigarIndex, blockRefPos, blockReadStart, matchCount = 0; + int nmCount = 0; + final StringBuilder mdString = new StringBuilder(); + + final int nElements = cigarElements.size(); + for (cigarIndex = blockReadStart = 0, blockRefPos = alignmentStart; cigarIndex < nElements; ++cigarIndex) { + final CigarElement ce = cigarElements.get(cigarIndex); + int inBlockOffset; + final int blockLength = ce.getLength(); final CigarOperator op = ce.getOperator(); if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ || op == CigarOperator.X) { - for (j = 0; j < length; ++j) { - final int z = y + j; + for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) { + final int readOffset = blockReadStart + inBlockOffset; - if (ref.length <= x + j) break; // out of boundary + if (ref.length <= blockRefPos + inBlockOffset) break; // out of boundary - int c1 = 0; - int c2 = 0; - // try { - c1 = seq[z]; - c2 = ref[x + j]; + final byte readBase = seq[readOffset]; + final byte refBase = ref[blockRefPos + inBlockOffset]; - if ((c1 == c2) || c1 == 0) { + if ((bases[readBase] == bases[refBase]) || readBase == 0) { // a match - ++u; + ++matchCount; } else { - str.append(u); - str.appendCodePoint(ref[x + j]); - u = 0; - ++nm; + mdString.append(matchCount); + mdString.appendCodePoint(refBase); + matchCount = 0; + ++nmCount; } } - if (j < length) break; - x += length; - y += length; + if (inBlockOffset < blockLength) break; + blockRefPos += blockLength; + blockReadStart += blockLength; } else if (op == CigarOperator.DELETION) { - str.append(u); - str.append('^'); - for (j = 0; j < length; ++j) { - if (ref[x + j] == 0) break; - str.appendCodePoint(ref[x + j]); + mdString.append(matchCount); + mdString.append('^'); + for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) { + if (ref[blockRefPos + inBlockOffset] == 0) break; + mdString.appendCodePoint(ref[blockRefPos + inBlockOffset]); } - u = 0; - if (j < length) break; - x += length; - nm += length; + matchCount = 0; + if (inBlockOffset < blockLength) break; + blockRefPos += blockLength; + nmCount += blockLength; } else if (op == CigarOperator.INSERTION || op == CigarOperator.SOFT_CLIP) { - y += length; - if (op == CigarOperator.INSERTION) nm += length; + blockReadStart += blockLength; + if (op == CigarOperator.INSERTION) nmCount += blockLength; } else if (op == CigarOperator.SKIPPED_REGION) { - x += length; + blockRefPos += blockLength; } } - str.append(u); + mdString.append(matchCount); - if (calcMD) record.setAttribute(SAMTag.MD.name(), str.toString()); - if (calcNM) record.setAttribute(SAMTag.NM.name(), nm); + if (calcMD) record.setAttribute(SAMTag.MD.name(), mdString.toString()); + if (calcNM) record.setAttribute(SAMTag.NM.name(), nmCount); } public static byte upperCase(final byte base) { @@ -1059,7 +975,7 @@ public static byte upperCase(final byte base) { /** Generates all possible unambiguous kmers (upper-case) of length and returns them as byte[]s. */ public static List generateAllKmers(final int length) { - final List sofar = new LinkedList(); + final List sofar = new LinkedList<>(); if (sofar.isEmpty()) { sofar.add(new byte[length]); diff --git a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java index c5c797e7b..008cca507 100644 --- a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java +++ b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java @@ -23,16 +23,15 @@ */ package htsjdk.samtools.util; -import htsjdk.samtools.Cigar; -import htsjdk.samtools.SAMRecord; -import htsjdk.samtools.SAMSequenceDictionary; -import htsjdk.samtools.SAMTag; -import htsjdk.samtools.SAMTextHeaderCodec; -import htsjdk.samtools.TextCigarCodec; +import htsjdk.samtools.*; +import htsjdk.samtools.reference.ReferenceSequence; +import htsjdk.samtools.reference.ReferenceSequenceFile; +import htsjdk.samtools.reference.ReferenceSequenceFileFactory; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; import java.util.Arrays; import java.util.HashSet; import java.util.Set; @@ -429,4 +428,26 @@ public void testGetSamReadNameFromFastqHeader(final String fastqHeader, {"Name/1/2", "Name"} }; } + + @Test + public void testCalculateNmTag() { + final File TEST_DIR = new File("src/test/resources/htsjdk/samtools/SequenceUtil"); + final File referenceFile = new File(TEST_DIR, "reference_with_lower_and_uppercase.fasta"); + final File samFile = new File(TEST_DIR, "upper_and_lowercase_read.sam"); + + SamReader reader = SamReaderFactory.makeDefault().open(samFile); + ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile); + + reader.iterator().stream().forEach(r -> { + Integer nm = SequenceUtil.calculateSamNmTag(r, ref.getSequence(r.getContig()).getBases()); + String md = r.getStringAttribute(SAMTag.MD.name()); + Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with NM in read \'" + r.getReadName() + "\':"); + SequenceUtil.calculateMdAndNmTags(r, ref.getSequence(r.getContig()).getBases(), true, true); + + Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with NM in read \'" + r.getReadName() + "\':"); + if (md != null) { + Assert.assertEquals(r.getStringAttribute(SAMTag.MD.name()), md, "problem with MD in read \'" + r.getReadName() + "\':"); + } + }); + } } diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict new file mode 100644 index 000000000..db5b251d0 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict @@ -0,0 +1,3 @@ +@HD VN:1.5 SO:unsorted +@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta +@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta new file mode 100644 index 000000000..0b446caa8 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta @@ -0,0 +1,4 @@ +>chr1 +ACGTACGTacgtacgt +>chr2 +TCGATCGAtcgatcga \ No newline at end of file diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai new file mode 100644 index 000000000..9314c8fe5 --- /dev/null +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai @@ -0,0 +1,2 @@ +chr1 16 6 16 17 +chr2 16 29 16 16 diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam new file mode 100644 index 000000000..82efe858e --- /dev/null +++ b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam @@ -0,0 +1,10 @@ +@HD VN:1.5 SO:coordinate +@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta +@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta +@CO chr1 value is ACGTACGTacgtacgt +@CO chr2 value is TCGATCGAtcgatcga +read1 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0 +read2 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0 +read3 0 chr2 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:8 MD:Z:0T2A0T2A0t2a0t2a0 +read4 0 chr2 1 0 8M * 0 0 TCGATCGA AAAAAAAA NM:i:0 +read5 0 chr2 1 0 4M1D2M1S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2