diff --git a/src/main/java/htsjdk/samtools/util/SequenceUtil.java b/src/main/java/htsjdk/samtools/util/SequenceUtil.java
index d2fb861a7..3108cee0d 100644
--- a/src/main/java/htsjdk/samtools/util/SequenceUtil.java
+++ b/src/main/java/htsjdk/samtools/util/SequenceUtil.java
@@ -442,33 +442,6 @@ public static int countMismatches(final SAMRecord read, final byte[] referenceBa
}
/**
- * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather
- * than byte[]. This is because GATK needs it this way.
- *
- * TODO: Remove this method when GATK map method is changed to take refseq as byte[].
- * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version.
- */
- @Deprecated
- private static int countMismatches(final SAMRecord read, final char[] referenceBases, final int referenceOffset) {
- int mismatches = 0;
-
- final byte[] readBases = read.getReadBases();
-
- for (final AlignmentBlock block : read.getAlignmentBlocks()) {
- final int readBlockStart = block.getReadStart() - 1;
- final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset;
- final int length = block.getLength();
-
- for (int i = 0; i < length; ++i) {
- if (!basesEqual(readBases[readBlockStart + i], StringUtil.charToByte(referenceBases[referenceBlockStart + i]))) {
- ++mismatches;
- }
- }
- }
- return mismatches;
- }
-
- /**
* Calculates the sum of qualities for mismatched bases in the read.
*
* @param referenceBases Array of ASCII bytes in which the 0th position in the array corresponds
@@ -537,41 +510,6 @@ public static int sumQualitiesOfMismatches(final SAMRecord read, final byte[] re
return qualities;
}
- /**
- * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather
- * than byte[]. This is because GATK needs it this way.
- *
- * TODO: Remove this method when GATK map method is changed to take refseq as byte[].
- * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version.
- */
- @Deprecated
- public static int sumQualitiesOfMismatches(final SAMRecord read, final char[] referenceBases,
- final int referenceOffset) {
- int qualities = 0;
-
- final byte[] readBases = read.getReadBases();
- final byte[] readQualities = read.getBaseQualities();
-
- if (read.getAlignmentStart() <= referenceOffset) {
- throw new IllegalArgumentException("read.getAlignmentStart(" + read.getAlignmentStart() +
- ") <= referenceOffset(" + referenceOffset + ")");
- }
-
- for (final AlignmentBlock block : read.getAlignmentBlocks()) {
- final int readBlockStart = block.getReadStart() - 1;
- final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset;
- final int length = block.getLength();
-
- for (int i = 0; i < length; ++i) {
- if (!basesEqual(readBases[readBlockStart + i], StringUtil.charToByte(referenceBases[referenceBlockStart + i]))) {
- qualities += readQualities[readBlockStart + i];
- }
- }
- }
-
- return qualities;
- }
-
public static int countInsertedBases(final Cigar cigar) {
int ret = 0;
for (final CigarElement element : cigar.getCigarElements()) {
@@ -658,26 +596,6 @@ public static int calculateSamNmTagFromCigar(final SAMRecord record) {
return samNm;
}
-
- /**
- * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather
- * than byte[]. This is because GATK needs it this way.
- *
- * TODO: Remove this method when GATK map method is changed to take refseq as byte[].
- * TODO: UPDATE: Seems to be removed from GATK. Deprecated now to be removed in a future version.
- */
- @Deprecated
- public static int calculateSamNmTag(final SAMRecord read, final char[] referenceBases,
- final int referenceOffset) {
- int samNm = countMismatches(read, referenceBases, referenceOffset);
- for (final CigarElement el : read.getCigar().getCigarElements()) {
- if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) {
- samNm += el.getLength();
- }
- }
- return samNm;
- }
-
/** Returns the complement of a single byte. */
public static byte complement(final byte b) {
switch (b) {
@@ -972,10 +890,11 @@ public static String calculateMD5String(final byte[] data, final int offset, fin
/**
* Calculate MD and NM similarly to Samtools, except that N->N is a match.
*
- * @param record
- * @param ref
- * @param calcMD
- * @return
+ * @param record Input record for which to calculate NM and MD.
+ * The appropriate tags will be added/updated in the record
+ * @param ref The reference bases for the sequence to which the record is mapped
+ * @param calcMD A flag indicating whether to update the MD tag in the record
+ * @param calcNM A flag indicating whether to update the NM tag in the record
*/
public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref,
final boolean calcMD, final boolean calcNM) {
@@ -985,66 +904,63 @@ public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref
final Cigar cigar = record.getCigar();
final List cigarElements = cigar.getCigarElements();
final byte[] seq = record.getReadBases();
- final int start = record.getAlignmentStart() - 1;
- int i, x, y, u = 0;
- int nm = 0;
- final StringBuilder str = new StringBuilder();
-
- final int size = cigarElements.size();
- for (i = y = 0, x = start; i < size; ++i) {
- final CigarElement ce = cigarElements.get(i);
- int j;
- final int length = ce.getLength();
+ final int alignmentStart = record.getAlignmentStart() - 1;
+ int cigarIndex, blockRefPos, blockReadStart, matchCount = 0;
+ int nmCount = 0;
+ final StringBuilder mdString = new StringBuilder();
+
+ final int nElements = cigarElements.size();
+ for (cigarIndex = blockReadStart = 0, blockRefPos = alignmentStart; cigarIndex < nElements; ++cigarIndex) {
+ final CigarElement ce = cigarElements.get(cigarIndex);
+ int inBlockOffset;
+ final int blockLength = ce.getLength();
final CigarOperator op = ce.getOperator();
if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ
|| op == CigarOperator.X) {
- for (j = 0; j < length; ++j) {
- final int z = y + j;
+ for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) {
+ final int readOffset = blockReadStart + inBlockOffset;
- if (ref.length <= x + j) break; // out of boundary
+ if (ref.length <= blockRefPos + inBlockOffset) break; // out of boundary
- int c1 = 0;
- int c2 = 0;
- // try {
- c1 = seq[z];
- c2 = ref[x + j];
+ final byte readBase = seq[readOffset];
+ final byte refBase = ref[blockRefPos + inBlockOffset];
- if ((c1 == c2) || c1 == 0) {
+ if ((bases[readBase] == bases[refBase]) || readBase == 0) {
// a match
- ++u;
+ ++matchCount;
} else {
- str.append(u);
- str.appendCodePoint(ref[x + j]);
- u = 0;
- ++nm;
+ mdString.append(matchCount);
+ mdString.appendCodePoint(refBase);
+ matchCount = 0;
+ ++nmCount;
}
}
- if (j < length) break;
- x += length;
- y += length;
+ if (inBlockOffset < blockLength) break;
+ blockRefPos += blockLength;
+ blockReadStart += blockLength;
} else if (op == CigarOperator.DELETION) {
- str.append(u);
- str.append('^');
- for (j = 0; j < length; ++j) {
- if (ref[x + j] == 0) break;
- str.appendCodePoint(ref[x + j]);
+ mdString.append(matchCount);
+ mdString.append('^');
+ for (inBlockOffset = 0; inBlockOffset < blockLength; ++inBlockOffset) {
+ if (ref[blockRefPos + inBlockOffset] == 0) break;
+ mdString.appendCodePoint(ref[blockRefPos + inBlockOffset]);
}
- u = 0;
- if (j < length) break;
- x += length;
- nm += length;
+ matchCount = 0;
+ if (inBlockOffset < blockLength) break;
+ blockRefPos += blockLength;
+ nmCount += blockLength;
} else if (op == CigarOperator.INSERTION
|| op == CigarOperator.SOFT_CLIP) {
- y += length;
- if (op == CigarOperator.INSERTION) nm += length;
+ blockReadStart += blockLength;
+ if (op == CigarOperator.INSERTION) nmCount += blockLength;
} else if (op == CigarOperator.SKIPPED_REGION) {
- x += length;
+ blockRefPos += blockLength;
}
}
- str.append(u);
+ mdString.append(matchCount);
- if (calcMD) record.setAttribute(SAMTag.MD.name(), str.toString());
- if (calcNM) record.setAttribute(SAMTag.NM.name(), nm);
+ if (calcMD) record.setAttribute(SAMTag.MD.name(), mdString.toString());
+ if (calcNM) record.setAttribute(SAMTag.NM.name(), nmCount);
}
public static byte upperCase(final byte base) {
@@ -1059,7 +975,7 @@ public static byte upperCase(final byte base) {
/** Generates all possible unambiguous kmers (upper-case) of length and returns them as byte[]s. */
public static List generateAllKmers(final int length) {
- final List sofar = new LinkedList();
+ final List sofar = new LinkedList<>();
if (sofar.isEmpty()) {
sofar.add(new byte[length]);
diff --git a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java
index c5c797e7b..008cca507 100644
--- a/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java
+++ b/src/test/java/htsjdk/samtools/util/SequenceUtilTest.java
@@ -23,16 +23,15 @@
*/
package htsjdk.samtools.util;
-import htsjdk.samtools.Cigar;
-import htsjdk.samtools.SAMRecord;
-import htsjdk.samtools.SAMSequenceDictionary;
-import htsjdk.samtools.SAMTag;
-import htsjdk.samtools.SAMTextHeaderCodec;
-import htsjdk.samtools.TextCigarCodec;
+import htsjdk.samtools.*;
+import htsjdk.samtools.reference.ReferenceSequence;
+import htsjdk.samtools.reference.ReferenceSequenceFile;
+import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
+import java.io.File;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
@@ -429,4 +428,26 @@ public void testGetSamReadNameFromFastqHeader(final String fastqHeader,
{"Name/1/2", "Name"}
};
}
+
+ @Test
+ public void testCalculateNmTag() {
+ final File TEST_DIR = new File("src/test/resources/htsjdk/samtools/SequenceUtil");
+ final File referenceFile = new File(TEST_DIR, "reference_with_lower_and_uppercase.fasta");
+ final File samFile = new File(TEST_DIR, "upper_and_lowercase_read.sam");
+
+ SamReader reader = SamReaderFactory.makeDefault().open(samFile);
+ ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile);
+
+ reader.iterator().stream().forEach(r -> {
+ Integer nm = SequenceUtil.calculateSamNmTag(r, ref.getSequence(r.getContig()).getBases());
+ String md = r.getStringAttribute(SAMTag.MD.name());
+ Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with NM in read \'" + r.getReadName() + "\':");
+ SequenceUtil.calculateMdAndNmTags(r, ref.getSequence(r.getContig()).getBases(), true, true);
+
+ Assert.assertEquals(r.getIntegerAttribute(SAMTag.NM.name()), nm, "problem with NM in read \'" + r.getReadName() + "\':");
+ if (md != null) {
+ Assert.assertEquals(r.getStringAttribute(SAMTag.MD.name()), md, "problem with MD in read \'" + r.getReadName() + "\':");
+ }
+ });
+ }
}
diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict
new file mode 100644
index 000000000..db5b251d0
--- /dev/null
+++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.dict
@@ -0,0 +1,3 @@
+@HD VN:1.5 SO:unsorted
+@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
+@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
new file mode 100644
index 000000000..0b446caa8
--- /dev/null
+++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
@@ -0,0 +1,4 @@
+>chr1
+ACGTACGTacgtacgt
+>chr2
+TCGATCGAtcgatcga
\ No newline at end of file
diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai
new file mode 100644
index 000000000..9314c8fe5
--- /dev/null
+++ b/src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta.fai
@@ -0,0 +1,2 @@
+chr1 16 6 16 17
+chr2 16 29 16 16
diff --git a/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam
new file mode 100644
index 000000000..82efe858e
--- /dev/null
+++ b/src/test/resources/htsjdk/samtools/SequenceUtil/upper_and_lowercase_read.sam
@@ -0,0 +1,10 @@
+@HD VN:1.5 SO:coordinate
+@SQ SN:chr1 LN:16 M5:56b74a652b3ed2f610263b8bb423167c UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
+@SQ SN:chr2 LN:16 M5:b835d2c026aa66c52a05838dcc0b59d4 UR:file:src/test/resources/htsjdk/samtools/SequenceUtil/reference_with_lower_and_uppercase.fasta
+@CO chr1 value is ACGTACGTacgtacgt
+@CO chr2 value is TCGATCGAtcgatcga
+read1 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0
+read2 0 chr1 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:0
+read3 0 chr2 1 0 16M * 0 0 AcGtAcGTaCGtAcGt AAAAAAAAAAAAAAAA NM:i:8 MD:Z:0T2A0T2A0t2a0t2a0
+read4 0 chr2 1 0 8M * 0 0 TCGATCGA AAAAAAAA NM:i:0
+read5 0 chr2 1 0 4M1D2M1S * 0 0 TCGACGAA AAAAAAAA NM:i:1 MD:Z:4^T2