From 199c6c65bb0b3a7c53f3d920b44d6375cce10f11 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Fri, 18 Nov 2016 23:23:29 -0500 Subject: [PATCH 1/4] Discounting the score for records that do not pass filters. This will mean that failing records are very un-likely to be chosen as the representative read in a set. --- src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java index 1abd51473..656e8f71a 100644 --- a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java +++ b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java @@ -72,7 +72,9 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS Short storedScore = (Short) record.getTransientAttribute(Attr.DuplicateScore); if (storedScore == null) { - short score = 0; + // make sure that filter-failing records are heavily discounted. Dividing by 10 to be far from + // overflow when adding score from multiple fragments + short score = record.getReadFailsVendorQualityCheckFlag() ? (short)(Short.MIN_VALUE/10):0; switch (scoringStrategy) { case SUM_OF_BASE_QUALITIES: @@ -109,6 +111,11 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS public static int compare(final SAMRecord rec1, final SAMRecord rec2, final ScoringStrategy scoringStrategy, final boolean assumeMateCigar) { int cmp; + // always prefer passing filter over non-passing filter + if (rec1.getReadFailsVendorQualityCheckFlag() != rec2.getReadFailsVendorQualityCheckFlag()) { + return rec1.getReadFailsVendorQualityCheckFlag() ? 1 : -1; + } + // always prefer paired over non-paired if (rec1.getReadPairedFlag() != rec2.getReadPairedFlag()) return rec1.getReadPairedFlag() ? 
1 : -1; @@ -125,7 +132,7 @@ public static int compare(final SAMRecord rec1, final SAMRecord rec2, final Scor } /** - * Compare two records based on their duplicate scores. The duplicate scores for each record is assume to be + * Compare two records based on their duplicate scores. The duplicate scores for each record is assumed to be * pre-computed by computeDuplicateScore and stored in the "DS" tag. If the scores are equal, we break * ties based on mapping quality (added to the mate's mapping quality if paired and mapped), then library/read name. * From 302796df894b0efb44355264defcb892e95bcc10 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 19 Nov 2016 14:47:03 -0500 Subject: [PATCH 2/4] - fixup --- src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java index 656e8f71a..9004bcba0 100644 --- a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java +++ b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java @@ -111,11 +111,6 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS public static int compare(final SAMRecord rec1, final SAMRecord rec2, final ScoringStrategy scoringStrategy, final boolean assumeMateCigar) { int cmp; - // always prefer passing filter over non-passing filter - if (rec1.getReadFailsVendorQualityCheckFlag() != rec2.getReadFailsVendorQualityCheckFlag()) { - return rec1.getReadFailsVendorQualityCheckFlag() ? 1 : -1; - } - // always prefer paired over non-paired if (rec1.getReadPairedFlag() != rec2.getReadPairedFlag()) return rec1.getReadPairedFlag() ? 
1 : -1; From badb7af8b184ec49a12a64331aba9902a241b7a2 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 19 Nov 2016 16:10:08 -0500 Subject: [PATCH 3/4] - caping score to guarrantee that discount works --- .../htsjdk/samtools/DuplicateScoringStrategy.java | 34 ++++++++++++++++------ 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java index 9004bcba0..4657e3bee 100644 --- a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java +++ b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java @@ -46,10 +46,11 @@ private static enum Attr { DuplicateScore } /** Calculates a score for the read which is the sum of scores over Q15. */ - private static short getSumOfBaseQualities(final SAMRecord rec) { - short score = 0; + private static int getSumOfBaseQualities(final SAMRecord rec) { + + int score = 0; for (final byte b : rec.getBaseQualities()) { - if (b >= 15) score += b; + if (b >= 15 ) score += b; } return score; @@ -64,6 +65,8 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS /** * Returns the duplicate score computed from the given fragment. + * value should be capped by Short.MAX_VALUE since the score from two reads will be + * added and an overflow will be * * If true is given to assumeMateCigar, then any score that can use the mate cigar to compute the mate's score will return the score * computed on both ends. @@ -72,13 +75,12 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS Short storedScore = (Short) record.getTransientAttribute(Attr.DuplicateScore); if (storedScore == null) { - // make sure that filter-failing records are heavily discounted. Dividing by 10 to be far from - // overflow when adding score from multiple fragments - short score = record.getReadFailsVendorQualityCheckFlag() ? 
(short)(Short.MIN_VALUE/10):0; - + short score=0; switch (scoringStrategy) { case SUM_OF_BASE_QUALITIES: - score += getSumOfBaseQualities(record); + // two (very) long reads worth of high-quality bases can go over Short.MAX_VALUE/2 + // and risk overflow. + score += (short) Math.min(getSumOfBaseQualities(record), Short.MAX_VALUE / 2); break; case TOTAL_MAPPED_REFERENCE_LENGTH: if (!record.getReadUnmappedFlag()) { @@ -89,9 +91,23 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS } break; case RANDOM: - score += (short) (hasher.hashUnencodedChars(record.getReadName()) >> 16); + // start with a random number between Short.MIN_VALUE/4 and Short.MAX_VALUE/4 + score += (short) (hasher.hashUnencodedChars(record.getReadName()) & 0b11_1111_1111_1111); + // subtract Short.MIN_VALUE/4 from it to end up with a number between + // 0 and Short.MAX_VALUE/2. This number can be then discounted in case the read is + // not passing filters. We need to stay far from overflow so that when we add the two + // scores from a read-pair we do not overflow since that could cause us to chose the + // Failing read-pair instead of a passing one. + score -= Short.MIN_VALUE/4; } + // make sure that filter-failing records are heavily discounted. Dividing by 2 to be far from + // overflow when adding score from both fragments of a read-pair + + // a long read with high quality bases may overflow and + score = (short) Math.max(Short.MIN_VALUE, score); + score += record.getReadFailsVendorQualityCheckFlag() ? (short) (Short.MIN_VALUE / 2) : 0; + storedScore = score; record.setTransientAttribute(Attr.DuplicateScore, storedScore); } From 74e138f790102a0e609216d96f99c7ea195612da Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 24 Dec 2016 13:26:12 -0500 Subject: [PATCH 4/4] - responding to review comments. 
--- .../htsjdk/samtools/DuplicateScoringStrategy.java | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java index 4657e3bee..292ab0654 100644 --- a/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java +++ b/src/main/java/htsjdk/samtools/DuplicateScoringStrategy.java @@ -47,10 +47,9 @@ /** Calculates a score for the read which is the sum of scores over Q15. */ private static int getSumOfBaseQualities(final SAMRecord rec) { - int score = 0; for (final byte b : rec.getBaseQualities()) { - if (b >= 15 ) score += b; + if (b >= 15) score += b; } return score; @@ -65,7 +64,7 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS /** * Returns the duplicate score computed from the given fragment. - * value should be capped by Short.MAX_VALUE since the score from two reads will be + * value should be capped by Short.MAX_VALUE/2 since the score from two reads will be * added and an overflow will be * * If true is given to assumeMateCigar, then any score that can use the mate cigar to compute the mate's score will return the score @@ -84,28 +83,29 @@ public static short computeDuplicateScore(final SAMRecord record, final ScoringS break; case TOTAL_MAPPED_REFERENCE_LENGTH: if (!record.getReadUnmappedFlag()) { - score += record.getCigar().getReferenceLength(); + // no need to remember the score since this scoring mechanism is symmetric + score = (short) Math.min(record.getCigar().getReferenceLength(), Short.MAX_VALUE / 2); } if (assumeMateCigar && record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { - score += SAMUtils.getMateCigar(record).getReferenceLength(); + score += (short) Math.min(SAMUtils.getMateCigar(record).getReferenceLength(), Short.MAX_VALUE / 2); } break; + // The RANDOM score gives the same score to both reads so that they get filtered together. 
+ // it's not critical to use the readName since the scores from both ends get added, but it seems + // to be clearer this way.