From 4dbeb6d19e9a41f2cfa0605a917ba5b0e9a2894c Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Wed, 14 Dec 2016 17:52:46 +0300 Subject: [PATCH 1/3] new elc version --- .../ElcHashBasedDuplicatesFinder.java | 114 ++++++++++++--------- .../markduplicates/EstimateLibraryComplexity.java | 42 ++++---- 2 files changed, 85 insertions(+), 71 deletions(-) diff --git a/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java b/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java index b1b017f97..10533d701 100644 --- a/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java +++ b/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java @@ -40,13 +40,17 @@ */ class ElcHashBasedDuplicatesFinder extends ElcDuplicatesFinder { - // We split each read on several parts and each part has length hashLength. - // hashLength = (min read length) / (number of errors + 1) - private int hashLength = Integer.MAX_VALUE; + // We split each read on several parts, each part represent set of nucleotides, + // which behind each other on numberOfHashesInGroup. + private int numberOfHashesInGroup = -1; //initial value + private int minReadLenInGroup = Integer.MAX_VALUE; //initial value + + private Map> readsByHashInGroup; ElcHashBasedDuplicatesFinder(double maxDiffRate, int maxReadLength, int minIdenticalBases, OpticalDuplicateFinder opticalDuplicateFinder) { super(maxDiffRate, maxReadLength, minIdenticalBases, opticalDuplicateFinder); + readsByHashInGroup = new HashMap<>(); } @Override @@ -58,56 +62,56 @@ void searchDuplicates(List sequences, Histogram dup populateDupCandidates(sequences); //Duplicate set to filter the treated PairedReadSequences out - Set dupSet = new HashSet<>(); + final Set dupSet = new HashSet<>(); + for (PairedReadSequence lhs : sequences) { if (dupSet.contains(lhs)) continue; - final List dupes = new ArrayList<>(); - - for (PairedReadSequence rhs : lhs.dupCandidates) { + for (PairedReadSequence rhs : getSimilarReads(lhs)) { if (dupSet.contains(rhs)) continue; - if (isDuplicate(lhs, rhs)) { dupes.add(rhs); } } - dupSet.addAll(dupes); fillHistogram(duplicationHisto, opticalHisto, lhs, dupes); } } + private Set getSimilarReads(PairedReadSequence lhs) { + final Set toCheck = new HashSet<>(); + for (int[] hashesForRead: new int[][]{lhs.hashes1, lhs.hashes2}){ + for (int hash : hashesForRead) { + List readsWithSameHash = readsByHashInGroup.get(hash); + // if size == 1, then this list contains only lhs read + if (readsWithSameHash.size() > 1) { + toCheck.addAll(readsWithSameHash); + } + } + } + return toCheck; + } + /** * Populate PairedReadSequence.dupCandidates with duplicate candidates * based on the same hash on the same position. */ private void populateDupCandidates(List seqs) { // Contains hash value as a key and PairedReadSequences match this key as a value - Map> readsByHash = new HashMap<>(); + readsByHashInGroup.clear(); - // Iterate over all PairedReadSequence and split in sets according to the hash value + // Iterate over all PairedReadSequence and split in sets according to the hash value // in a certain position for (PairedReadSequence prs : seqs) { int[][] readHashValues = {prs.hashes1, prs.hashes2}; for (int[] readHashValue : readHashValues) { for (int key : readHashValue) { - List dupCandidates = readsByHash.get(key); - - if (dupCandidates == null) { - dupCandidates = new ArrayList<>(); - readsByHash.put(key, dupCandidates); - } + List dupCandidates + = readsByHashInGroup.computeIfAbsent(key, k -> new ArrayList<>()); dupCandidates.add(prs); } } } - - // Fill for each PairedReadSequence a set of possible duplicates - for (List dupCandidates : readsByHash.values()) { - for (PairedReadSequence prs : dupCandidates) { - prs.dupCandidates.addAll(dupCandidates); - } - } } /** @@ -115,35 +119,28 @@ private void populateDupCandidates(List seqs) { */ private void fillHashValues(List sequences) { for (PairedReadSequence prs : sequences) { - prs.initHashes(maxReadLength, hashLength, minIdenticalBases); + prs.initHashes(numberOfHashesInGroup, minIdenticalBases, minReadLenInGroup); } } /** - * Calculate hash length based on minReadLength and hashNumber + * Calculate hash length based on minReadLength and numberOfHashesInGroup */ private void initHashLength(List sequences) { - if (sequences == null || sequences.isEmpty()) { - return; - } for (PairedReadSequence prs : sequences) { - int minReadLength = Math.min( - Math.min(prs.read1.length, prs.read2.length) - minIdenticalBases, - maxReadLength - minIdenticalBases - ); - - //if read.length % shingle.length != 0, we have tail of the read which is not included in the hashValues, - // and we need extra hash value to get prs.hashValues.length = (number of errors) + 1 - int hashNumber = (int) (minReadLength * maxDiffRate) + 1; - int currentHashLength = minReadLength / hashNumber; - if (hashLength > currentHashLength) { - hashLength = currentHashLength; + int minReadLength = Math.min(Math.min(prs.read1.length, prs.read2.length), maxReadLength); + + //search maximum number of hashes (if length of reads is similar number of hashes will be similar too) + int numberOfHashes = (int) ((minReadLength - minIdenticalBases) * maxDiffRate) + 1; + if (numberOfHashes > numberOfHashesInGroup) { + numberOfHashesInGroup = numberOfHashes; } - } - } - private int getReadOffset(int k) { - return k * hashLength + minIdenticalBases; + //search minimum length of read for calculating hashes value + if (minReadLenInGroup > minReadLength) { + minReadLenInGroup = minReadLength; + } + } } private boolean isDuplicate(final PairedReadSequence lhs, final PairedReadSequence rhs) { @@ -174,26 +171,24 @@ private boolean isDuplicate(final PairedReadSequence lhs, final PairedReadSequen return errors <= maxErrors; } - /** * Compare hashes and if they are similar we compare bases corresponding to the hashes. */ private int compareReadToRead(byte[] read1, int[] hashes1, byte[] read2, int[] hashes2, int maxErrors) { int errors = 0; final int minReadLength = minLength(read1, read2); - int minHashesLength = Math.min(hashes1.length, hashes2.length); - for (int k = 0; k < minHashesLength; ++k) { + for (int k = 0; k < numberOfHashesInGroup; ++k) { if (hashes1[k] != hashes2[k]) { - errors += compareHashes(read1, read2, getReadOffset(k), getReadOffset((k + 1))); + errors += compareHashes(read1, read2, k); if (errors > maxErrors) { return errors; } } } - if (minReadLength > getReadOffset(minHashesLength)) { - errors += compareHashes(read1, read2, getReadOffset(minHashesLength), minReadLength); + if (minReadLength > minReadLenInGroup) { + errors += compareTails(read1, read2, minReadLenInGroup, minReadLength); } return errors; @@ -204,7 +199,24 @@ private int compareReadToRead(byte[] read1, int[] hashes1, byte[] read2, int[] h * * @return errors number for current part of the read */ - private int compareHashes(byte[] read1, byte[] read2, int start, int stop) { + private int compareHashes(byte[] read1, byte[] read2, int k) { + int errors = 0; + int position = minIdenticalBases + k; + while (position < minReadLenInGroup) { + if (read1[position] != read2[position]) { + errors++; + } + position += numberOfHashesInGroup; + } + return errors; + } + + /** + * Compare bases corresponding to the hashes. + * + * @return errors number for current part of the read + */ + private int compareTails(byte[] read1, byte[] read2, int start, int stop) { int errors = 0; for (int i = start; i < stop; ++i) { if (read1[i] != read2[i]) { diff --git a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java index 58ca76c9c..345855300 100644 --- a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java +++ b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java @@ -190,9 +190,6 @@ int[] hashes1; int[] hashes2; - // Possible candidates for this PairedReadSequence - Set dupCandidates; - public static int getSizeInBytes() { // rough guess at memory footprint, summary size of all fields return 16 + 4 + (2 * 4) + 1 + 2 * (24 + 8 + NUMBER_BASES_IN_READ) + 2 + (2 * (24 + 8)) + 8 + 4; @@ -210,26 +207,31 @@ public static int getSizeInBytes() { return new PairedReadCodec(); } - void initHashes(int maxReadLength, int hashLength, int skippedBases) { - dupCandidates = new HashSet<>(); - hashes1 = getHashes(read1, maxReadLength, hashLength, skippedBases); - hashes2 = getHashes(read2, maxReadLength, hashLength, skippedBases); + void initHashes(int numberOfHashes, int skippedBases, int minReadLength) { + hashes1 = getHashes(read1, numberOfHashes, skippedBases, minReadLength); + hashes2 = getHashes(read2, numberOfHashes, skippedBases, minReadLength); } - // Split read by (MAX_DIFF_RATE * read.length + 1) parts and hash each part - private int[] getHashes(byte[] read, int maxReadLength, int hashLength, int skippedBases) { - int maxLengthWithoutHead = maxReadLength - skippedBases; - int hashNum = (Math.min(read.length - skippedBases, maxLengthWithoutHead)) / - hashLength; - int[] hashValues = new int[hashNum]; - for (int i = 0; i < hashNum; ++i) { - int st = skippedBases + i * hashLength; - int end = st + hashLength; - - // use custom hash to avoid using System.arraycopy + // Split read by numberOfHashes parts and hash each part + // For instance: + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 + // read = A C G T A C G T A C G T A C G T A C G T + // numberOfHashes = 5 + // skippedBases = 1 + // minReadLength = 15 + // So, method returns hashValues with 5 hash value + // first value calculated from read[0], read[5], read[10] + // second value calculated from read[1], read[6], read[11] + // etc. + // chars from 15 to 19 position is a tail, see compareTails() in ElcHashBasedDuplicatesFinder + private int[] getHashes(byte[] read, int numberOfHashes, int skippedBases, int minReadLength) { + int[] hashValues = new int[numberOfHashes]; + for (int i = 0; i < numberOfHashes; ++i) { hashValues[i] = 1; - for (int j = st; j < end; ++j) { - hashValues[i] = 31 * hashValues[i] + read[j]; + int position = skippedBases + i; + while (position < minReadLength) { + hashValues[i] = 31 * hashValues[i] + read[position]; + position += numberOfHashes; } } return hashValues; From 975f7b57db66f14bf0653396d129ea057bee66e4 Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Thu, 5 Jan 2017 17:31:07 +0300 Subject: [PATCH 2/3] correction for comments --- .../sam/markduplicates/ElcHashBasedDuplicatesFinder.java | 11 +++++++---- .../picard/sam/markduplicates/EstimateLibraryComplexity.java | 8 ++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java b/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java index 10533d701..3fc05c54b 100644 --- a/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java +++ b/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java @@ -42,6 +42,9 @@ // We split each read on several parts, each part represent set of nucleotides, // which behind each other on numberOfHashesInGroup. + // f. i.: + // 1 2 3 4 1 2 3 4 1 2 3 4 + // A C A T T A C G G A T T private int numberOfHashesInGroup = -1; //initial value private int minReadLenInGroup = Integer.MAX_VALUE; //initial value @@ -78,12 +81,12 @@ void searchDuplicates(List sequences, Histogram dup } } - private Set getSimilarReads(PairedReadSequence lhs) { + private Set getSimilarReads(final PairedReadSequence pattern) { final Set toCheck = new HashSet<>(); - for (int[] hashesForRead: new int[][]{lhs.hashes1, lhs.hashes2}){ + for (int[] hashesForRead: new int[][]{pattern.hashes1, pattern.hashes2}) { for (int hash : hashesForRead) { List readsWithSameHash = readsByHashInGroup.get(hash); - // if size == 1, then this list contains only lhs read + // if size == 1, then this list contains only pattern read if (readsWithSameHash.size() > 1) { toCheck.addAll(readsWithSameHash); } @@ -106,7 +109,7 @@ private void populateDupCandidates(List seqs) { int[][] readHashValues = {prs.hashes1, prs.hashes2}; for (int[] readHashValue : readHashValues) { for (int key : readHashValue) { - List dupCandidates + final List dupCandidates = readsByHashInGroup.computeIfAbsent(key, k -> new ArrayList<>()); dupCandidates.add(prs); } diff --git a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java index 345855300..dd96838c8 100644 --- a/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java +++ b/src/main/java/picard/sam/markduplicates/EstimateLibraryComplexity.java @@ -220,12 +220,12 @@ void initHashes(int numberOfHashes, int skippedBases, int minReadLength) { // skippedBases = 1 // minReadLength = 15 // So, method returns hashValues with 5 hash value - // first value calculated from read[0], read[5], read[10] - // second value calculated from read[1], read[6], read[11] + // first value calculated from read[1], read[6], read[11] + // second value calculated from read[2], read[7], read[12] // etc. - // chars from 15 to 19 position is a tail, see compareTails() in ElcHashBasedDuplicatesFinder + // chars from 16 to 19 position is a tail, see compareTails() in ElcHashBasedDuplicatesFinder private int[] getHashes(byte[] read, int numberOfHashes, int skippedBases, int minReadLength) { - int[] hashValues = new int[numberOfHashes]; + final int[] hashValues = new int[numberOfHashes]; for (int i = 0; i < numberOfHashes; ++i) { hashValues[i] = 1; int position = skippedBases + i; From b398e218033c31654e9adcfa66e92a864e5de74f Mon Sep 17 00:00:00 2001 From: Pavel Silin Date: Wed, 15 Feb 2017 11:14:06 +0300 Subject: [PATCH 3/3] correction to review: comments were expanded --- .../ElcHashBasedDuplicatesFinder.java | 31 +++++++++++++++------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java b/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java index 3fc05c54b..a6026fc68 100644 --- a/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java +++ b/src/main/java/picard/sam/markduplicates/ElcHashBasedDuplicatesFinder.java @@ -40,15 +40,24 @@ */ class ElcHashBasedDuplicatesFinder extends ElcDuplicatesFinder { - // We split each read on several parts, each part represent set of nucleotides, - // which behind each other on numberOfHashesInGroup. + // We split the bases of each read into interspersed parts, so that each part spans the entire length of the read. + // e.g. breaking a read of length 12 into 4 parts would be as follows: // f. i.: // 1 2 3 4 1 2 3 4 1 2 3 4 // A C A T T A C G G A T T - private int numberOfHashesInGroup = -1; //initial value - private int minReadLenInGroup = Integer.MAX_VALUE; //initial value - private Map> readsByHashInGroup; + // number of parts we split reads in + // -1 because we want to find biggest number of hashes in this group + // calculated formula : ((minReadLength - minIdenticalBases) * maxDiffRate) + 1 + // when minReadLength = Math.min(Math.min(prs.read1.length, prs.read2.length), maxReadLength) + private int numberOfHashesInGroup = -1; + + // minimal read length in group, this variable necessary for splitting reads on part, + // see EstimateLibraryComplexity.PairedReadSequence.getHashes() + // initial value Integer.MAX_VALUE + private int minReadLenInGroup = Integer.MAX_VALUE; + + private final Map> readsByHashInGroup; ElcHashBasedDuplicatesFinder(double maxDiffRate, int maxReadLength, int minIdenticalBases, OpticalDuplicateFinder opticalDuplicateFinder) { @@ -181,9 +190,9 @@ private int compareReadToRead(byte[] read1, int[] hashes1, byte[] read2, int[] h int errors = 0; final int minReadLength = minLength(read1, read2); - for (int k = 0; k < numberOfHashesInGroup; ++k) { - if (hashes1[k] != hashes2[k]) { - errors += compareHashes(read1, read2, k); + for (int hashNumber = 0; hashNumber < numberOfHashesInGroup; ++hashNumber) { + if (hashes1[hashNumber] != hashes2[hashNumber]) { + errors += compareHashes(read1, read2, hashNumber); if (errors > maxErrors) { return errors; } @@ -202,13 +211,15 @@ private int compareReadToRead(byte[] read1, int[] hashes1, byte[] read2, int[] h * * @return errors number for current part of the read */ - private int compareHashes(byte[] read1, byte[] read2, int k) { + private int compareHashes(byte[] read1, byte[] read2, int hashNumber) { int errors = 0; - int position = minIdenticalBases + k; + //shift position corresponding to hash we inspect + int position = minIdenticalBases + hashNumber; while (position < minReadLenInGroup) { if (read1[position] != read2[position]) { errors++; } + // shift position to next base in the hash position += numberOfHashesInGroup; } return errors;