From 8041c8bc74f3766b0e61fc8dd0b44a0b1a9dff2e Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 20 Aug 2016 15:27:33 -0400 Subject: [PATCH 1/4] Add a mergeDictionary utility to SAMSequenceDictionary to be used by picard in MergeBamAlignment --- .../htsjdk/samtools/SAMSequenceDictionary.java | 114 ++++++++++++++++++--- .../htsjdk/samtools/SAMSequenceDictionaryTest.java | 41 ++++++++ 2 files changed, 143 insertions(+), 12 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index 2b8d18a31..f19598fdd 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -23,20 +23,20 @@ */ package htsjdk.samtools; +import htsjdk.samtools.util.Log; + import java.io.Serializable; import java.math.BigInteger; import java.security.MessageDigest; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; +import java.util.*; +import java.util.stream.Collectors; import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlTransient; +import static htsjdk.samtools.SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; + /** * Collection of SAMSequenceRecords. */ @@ -64,6 +64,8 @@ public SAMSequenceDictionary(final List list) { return Collections.unmodifiableList(mSequences); } + private static Log log = Log.getInstance(SAMSequenceDictionary.class); + public SAMSequenceRecord getSequence(final String name) { return mSequenceMap.get(name); } @@ -135,7 +137,7 @@ public long getReferenceLength() { } return len; } - + /** * @return true is the dictionary is empty */ @@ -146,7 +148,7 @@ public boolean isEmpty() { private static String DICT_MISMATCH_TEMPLATE = "SAM dictionaries are not the same: %s."; /** * Non-comprehensive {@link #equals(Object)}-assertion: instead of calling {@link SAMSequenceRecord#equals(Object)} on constituent - * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call + * {@link SAMSequenceRecord}s in this dictionary against its pair in the target dictionary, in order, call * {@link SAMSequenceRecord#isSameSequence(SAMSequenceRecord)}. * Aliases are ignored. * @@ -154,7 +156,7 @@ public boolean isEmpty() { */ public void assertSameDictionary(final SAMSequenceDictionary that) { if (this == that) return; - + final Iterator thatSequences = that.mSequences.iterator(); for (final SAMSequenceRecord thisSequence : mSequences) { if (!thatSequences.hasNext()) @@ -189,7 +191,7 @@ public boolean equals(Object o) { * alternate names fo a given contig. e.g: * 1,chr1,chr01,01,CM000663,NC_000001.10 e.g: * MT,chrM - * + * * @param originalName * existing contig name * @param altName @@ -219,11 +221,11 @@ public SAMSequenceRecord addSequenceAlias(final String originalName, /** * return a MD5 sum for ths dictionary, the checksum is re-computed each * time this method is called. - * + * *
      * md5( (seq1.md5_if_available) + ' '+(seq2.name+seq2.length) + ' '+...)
      * 
- * + * * @return a MD5 checksum for this dictionary or the empty string if it is * empty */ @@ -266,5 +268,93 @@ public String toString() { " length:"+ getReferenceLength()+" "+ " md5:"+md5()+")"; } + + public static final List DEFAULT_DICTIONARY_EQUAL_TAG = Arrays.asList( + SAMSequenceRecord.URI_TAG, + SAMSequenceRecord.MD5_TAG, + SAMSequenceRecord.SEQUENCE_LENGTH_TAG); + + /** + * Will merge dictionaryTags from two dictionaries into one focusing on merging the tags rather than the sequences. + * + * Requires that dictionaries have the same SAMSequence records in the same order. + * For each sequenceIndex, the union of the tags from both sequences will be added to the new sequence, mismatching + * values (for tags that are in both) will generate a warning, and the value from dict1 will be used. + * For tags that are in tagsToEquate. an unequal value will generate an error (an IllegalArgumentException will + * be thrown.) + * + * @param dict1 first dictionary + * @param dict2 first dictionary + * @param tagsToMatch list of tags that must be equal if present in both sequence. Typical values will be MD, LN, and MD + * @return dictionary consisting of the same sequences as the two inputs with the merged values of tags. + */ + static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2, final List tagsToMatch) { + + final SAMSequenceDictionary finalDict = new SAMSequenceDictionary(dict1.getSequences()); + for (final SAMSequenceRecord sequence : finalDict.getSequences()) { + final int sequenceIndex = sequence.getSequenceIndex(); + final Set allTags = new HashSet<>(); + + for (SAMSequenceRecord seq : Arrays.asList( + dict1.getSequence(sequenceIndex), + dict2.getSequence(sequenceIndex))) { + + final Set dictTags = seq + .getAttributes().parallelStream() + .map(Map.Entry::getKey) + .collect(Collectors.toSet()); + allTags.addAll(dictTags); + } + + for (final String tag : allTags) { + final String value1 = dict1.getSequence(sequenceIndex).getAttribute(tag); + final String value2 = dict2.getSequence(sequenceIndex).getAttribute(tag); + + if (value1 != null && value2 != null && !value1.equals(value2)) { + if (tagsToMatch.contains(tag)) { + final String error = String.format("Cannot merge the two dictionaries. Found sequence entry for which " + + "tags differ: %s and tag %s has the two values: %s and %s", + dict1.getSequence(sequenceIndex).getSequenceName(), + tag, + value1, + value2); + + log.error(error); + throw new IllegalArgumentException(error); + } else { + log.warn("Found sequence entry for which " + + "tags differ: %s and tag %s has the two values: %s and %s. using Value %s", + dict1.getSequence(sequenceIndex).getSequenceName(), + tag, + value1, + value2, value1); + } + } + sequence.setAttribute(tag, value1 == null ? value2 : value1); + } + + final int length1 = dict1.getSequence(sequenceIndex).getSequenceLength(); + final int length2 = dict2.getSequence(sequenceIndex).getSequenceLength(); + + if (length1 != UNKNOWN_SEQUENCE_LENGTH && length2 != UNKNOWN_SEQUENCE_LENGTH && length1 != length2) { + if (tagsToMatch.contains(SAMSequenceRecord.SEQUENCE_LENGTH_TAG)) { + final String error = String.format("Cannot merge the two dictionaries. Found sequence entry for which " + + "lengths differ: %s has lengths %s and %s", + dict1.getSequence(sequenceIndex).getSequenceName(), + length1, length2); + + log.error(error); + throw new IllegalArgumentException(error); + } else { + log.warn("Found sequence entry for which " + + "lengths differ:%s has lengths %s and %s. using Value %s", + dict1.getSequence(sequenceIndex).getSequenceName(), + length1, length2, length1); + } + } + sequence.setSequenceLength(length1 == UNKNOWN_SEQUENCE_LENGTH ? length2 : length1); + } + return finalDict; + } } diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java index 89e1b35fb..44eb54925 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java @@ -27,11 +27,13 @@ package htsjdk.samtools; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.StringReader; import java.io.StringWriter; import java.util.Arrays; +import java.util.Collections; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; @@ -89,4 +91,43 @@ public void testXmlSeralization() throws JAXBException { Assert.assertEquals(dict1, dict2); } + @DataProvider(name="testMergeDictionariesData") + Object[][] testMergeDictionariesData(){ + + final SAMSequenceRecord rec1, rec2, rec3, rec4; + rec1 = new SAMSequenceRecord("chr1", 100); + rec2 = new SAMSequenceRecord("chr1", 101); + rec2.setMd5("dummy"); + rec3 = new SAMSequenceRecord("chr1", SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH); + rec3.setMd5("dummy2"); + + rec4 = new SAMSequenceRecord("chr1", 100); + rec4.setAttribute(SAMSequenceRecord.URI_TAG,"file://some/file/name.ok"); + + + return new Object[][]{ + new Object[]{rec1,rec2,false} + }; + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMergeDictionaries(final SAMSequenceRecord rec1 ,final SAMSequenceRecord rec2, boolean canMerge) throws Exception { + final SAMSequenceDictionary dict1 = new SAMSequenceDictionary(Collections.singletonList(rec1)); + final SAMSequenceDictionary dict2 = new SAMSequenceDictionary(Collections.singletonList(rec2)); + + try { + SAMSequenceDictionary.mergeDictionaries(dict1, dict2, SAMSequenceDictionary.DEFAULT_DICTIONARY_EQUAL_TAG); + } catch (final IllegalArgumentException e) { + if (!canMerge){ + throw e; + } else{ + throw new Exception("Expected to be able to merge dictionaries, but wasn't"); + } + } + if (canMerge){ + throw new Exception("Expected to be able to merge dictionaries, but wasn't"); + } else { + throw new Exception("Expected to not be able to merge dictionaries, but was able"); + } + } } From d240b2f682b830b0a18cf5f92e1c9126080c1cb0 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 20 Aug 2016 20:47:31 -0400 Subject: [PATCH 2/4] tests (found a bug!) --- .../htsjdk/samtools/SAMSequenceDictionary.java | 35 +++++++++++++++------- .../htsjdk/samtools/SAMSequenceDictionaryTest.java | 24 +++++++++++---- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index f19598fdd..551500e03 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -285,14 +285,29 @@ public String toString() { * * @param dict1 first dictionary * @param dict2 first dictionary - * @param tagsToMatch list of tags that must be equal if present in both sequence. Typical values will be MD, LN, and MD + * @param tagsToMatch list of tags that must be equal if present in both sequence. Typical values will be MD, and LN * @return dictionary consisting of the same sequences as the two inputs with the merged values of tags. */ - static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2, final List tagsToMatch) { - - final SAMSequenceDictionary finalDict = new SAMSequenceDictionary(dict1.getSequences()); - for (final SAMSequenceRecord sequence : finalDict.getSequences()) { + static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionary dict1, + final SAMSequenceDictionary dict2, + final List tagsToMatch) { + + if(dict1.getSequences().size()!=dict2.getSequences().size()) { + throw new IllegalArgumentException(String.format("Do not use this function to merge dictionaries with " + + "different number of sequences in them. Found %d and %d sequences", + dict1.getSequences().size(), dict2.getSequences().size())); + } + final SAMSequenceDictionary finalDict = new SAMSequenceDictionary(); + for (final SAMSequenceRecord sequence : dict1.getSequences()) { + final SAMSequenceRecord mergedSequence = new SAMSequenceRecord(sequence.getSequenceName(), + UNKNOWN_SEQUENCE_LENGTH); + finalDict.addSequence(mergedSequence); final int sequenceIndex = sequence.getSequenceIndex(); + if (!dict2.getSequence(sequenceIndex).getSequenceName().equals(sequence.getSequenceName())) { + throw new IllegalArgumentException(String.format("Non-equal sequence names (%s and %s) at index %d in " + + "the dictionaries.", + sequence.getSequenceName(), dict2.getSequence(sequenceIndex).getSequenceName(), sequenceIndex)); + } final Set allTags = new HashSet<>(); for (SAMSequenceRecord seq : Arrays.asList( @@ -313,7 +328,7 @@ static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionar if (value1 != null && value2 != null && !value1.equals(value2)) { if (tagsToMatch.contains(tag)) { final String error = String.format("Cannot merge the two dictionaries. Found sequence entry for which " + - "tags differ: %s and tag %s has the two values: %s and %s", + "tag values differ: Sequence %s at tag %s has the values: %s and %s", dict1.getSequence(sequenceIndex).getSequenceName(), tag, value1, @@ -323,14 +338,14 @@ static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionar throw new IllegalArgumentException(error); } else { log.warn("Found sequence entry for which " + - "tags differ: %s and tag %s has the two values: %s and %s. using Value %s", + "tags differ: %s and tag %s has the two values: %s and %s. Using Value %s", dict1.getSequence(sequenceIndex).getSequenceName(), tag, value1, value2, value1); } } - sequence.setAttribute(tag, value1 == null ? value2 : value1); + mergedSequence.setAttribute(tag, value1 == null ? value2 : value1); } final int length1 = dict1.getSequence(sequenceIndex).getSequenceLength(); @@ -347,12 +362,12 @@ static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionar throw new IllegalArgumentException(error); } else { log.warn("Found sequence entry for which " + - "lengths differ:%s has lengths %s and %s. using Value %s", + "lengths differ: Sequence %s has lengths %s and %s. Using Value %s", dict1.getSequence(sequenceIndex).getSequenceName(), length1, length2, length1); } } - sequence.setSequenceLength(length1 == UNKNOWN_SEQUENCE_LENGTH ? length2 : length1); + mergedSequence.setSequenceLength(length1 == UNKNOWN_SEQUENCE_LENGTH ? length2 : length1); } return finalDict; } diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java index 44eb54925..a7d369d48 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java @@ -92,9 +92,9 @@ public void testXmlSeralization() throws JAXBException { } @DataProvider(name="testMergeDictionariesData") - Object[][] testMergeDictionariesData(){ + public Object[][] testMergeDictionariesData(){ - final SAMSequenceRecord rec1, rec2, rec3, rec4; + final SAMSequenceRecord rec1, rec2, rec3, rec4, rec5; rec1 = new SAMSequenceRecord("chr1", 100); rec2 = new SAMSequenceRecord("chr1", 101); rec2.setMd5("dummy"); @@ -104,14 +104,26 @@ public void testXmlSeralization() throws JAXBException { rec4 = new SAMSequenceRecord("chr1", 100); rec4.setAttribute(SAMSequenceRecord.URI_TAG,"file://some/file/name.ok"); + rec5 = new SAMSequenceRecord("chr2", 200); + rec4.setAttribute(SAMSequenceRecord.URI_TAG,"file://some/file/name.ok"); return new Object[][]{ - new Object[]{rec1,rec2,false} + new Object[]{rec1, rec1, true}, + new Object[]{rec2, rec2, true}, + new Object[]{rec3, rec3, true}, + new Object[]{rec4, rec4, true}, + new Object[]{rec1, rec2, false},//since 100 != 101 in Length + new Object[]{rec1, rec3, true}, + new Object[]{rec1, rec4, true}, + new Object[]{rec2, rec3, false}, // since MD5 is not equal + new Object[]{rec2, rec4, false}, //length differs + new Object[]{rec3, rec4, true}, + new Object[]{rec4, rec5, false}, // different name }; } - @Test(expectedExceptions = IllegalArgumentException.class) - public void testMergeDictionaries(final SAMSequenceRecord rec1 ,final SAMSequenceRecord rec2, boolean canMerge) throws Exception { + @Test(dataProvider = "testMergeDictionariesData", expectedExceptions = IllegalArgumentException.class) + public void testMergeDictionaries(final SAMSequenceRecord rec1, final SAMSequenceRecord rec2, boolean canMerge) throws Exception { final SAMSequenceDictionary dict1 = new SAMSequenceDictionary(Collections.singletonList(rec1)); final SAMSequenceDictionary dict2 = new SAMSequenceDictionary(Collections.singletonList(rec2)); @@ -125,7 +137,7 @@ public void testMergeDictionaries(final SAMSequenceRecord rec1 ,final SAMSequenc } } if (canMerge){ - throw new Exception("Expected to be able to merge dictionaries, but wasn't"); + throw new IllegalArgumentException("Expected to be able to merge dictionaries, and was indeed able to do so."); } else { throw new Exception("Expected to not be able to merge dictionaries, but was able"); } From ec52360effd3f77492ee55907495774b72e7f027 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Mon, 22 Aug 2016 07:30:38 -0400 Subject: [PATCH 3/4] more reviewer comments now incorporated --- src/main/java/htsjdk/samtools/SAMSequenceDictionary.java | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index 551500e03..14cc52321 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -270,7 +270,6 @@ public String toString() { } public static final List DEFAULT_DICTIONARY_EQUAL_TAG = Arrays.asList( - SAMSequenceRecord.URI_TAG, SAMSequenceRecord.MD5_TAG, SAMSequenceRecord.SEQUENCE_LENGTH_TAG); @@ -308,18 +307,11 @@ static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionar "the dictionaries.", sequence.getSequenceName(), dict2.getSequence(sequenceIndex).getSequenceName(), sequenceIndex)); } - final Set allTags = new HashSet<>(); - for (SAMSequenceRecord seq : Arrays.asList( - dict1.getSequence(sequenceIndex), - dict2.getSequence(sequenceIndex))) { - - final Set dictTags = seq - .getAttributes().parallelStream() - .map(Map.Entry::getKey) - .collect(Collectors.toSet()); - allTags.addAll(dictTags); - } + final Set allTags = new HashSet<>(dict1.getSequence(sequenceIndex).getAttributes() + .stream().map(Map.Entry::getKey).collect(Collectors.toSet())); + allTags.addAll(dict2.getSequence(sequenceIndex).getAttributes() + .stream().map(Map.Entry::getKey).collect(Collectors.toSet())); for (final String tag : allTags) { final String value1 = dict1.getSequence(sequenceIndex).getAttribute(tag); From 316c085acd5a1e55d2fd28554cb32e6a7ed38b28 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Thu, 22 Sep 2016 10:42:56 -0400 Subject: [PATCH 4/4] - responding to review comments --- .../htsjdk/samtools/SAMSequenceDictionary.java | 100 +++++++++------------ .../htsjdk/samtools/SAMSequenceDictionaryTest.java | 8 +- 2 files changed, 49 insertions(+), 59 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index 14cc52321..b7744d796 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -29,13 +29,15 @@ import java.math.BigInteger; import java.security.MessageDigest; import java.util.*; +import java.util.stream.Collector; import java.util.stream.Collectors; import javax.xml.bind.annotation.XmlElement; import javax.xml.bind.annotation.XmlRootElement; import javax.xml.bind.annotation.XmlTransient; -import static htsjdk.samtools.SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH; +import static htsjdk.samtools.SAMSequenceRecord.*; +import static java.util.stream.Collectors.toList; /** * Collection of SAMSequenceRecords. @@ -279,87 +281,73 @@ public String toString() { * Requires that dictionaries have the same SAMSequence records in the same order. * For each sequenceIndex, the union of the tags from both sequences will be added to the new sequence, mismatching * values (for tags that are in both) will generate a warning, and the value from dict1 will be used. - * For tags that are in tagsToEquate. an unequal value will generate an error (an IllegalArgumentException will - * be thrown.) + * For tags that are in tagsToEquate an unequal value will generate an error (an IllegalArgumentException will + * be thrown.) tagsToEquate must include LN and MD. * * @param dict1 first dictionary * @param dict2 first dictionary - * @param tagsToMatch list of tags that must be equal if present in both sequence. Typical values will be MD, and LN + * @param tagsToMatch list of tags that must be equal if present in both sequence. Must contain MD, and LN * @return dictionary consisting of the same sequences as the two inputs with the merged values of tags. */ static public SAMSequenceDictionary mergeDictionaries(final SAMSequenceDictionary dict1, final SAMSequenceDictionary dict2, final List tagsToMatch) { - if(dict1.getSequences().size()!=dict2.getSequences().size()) { + // We require MD and LN to match. + if (!tagsToMatch.contains(MD5_TAG) || !tagsToMatch.contains(SEQUENCE_LENGTH_TAG)) { + throw new IllegalArgumentException("Both " + MD5_TAG + " and " + SEQUENCE_LENGTH_TAG + " must be matched " + + "when merging dictionaries. Found: " + String.join(",", tagsToMatch)); + } + + if (!dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()).equals( + dict2.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(Collectors.toList()))) { + throw new IllegalArgumentException(String.format("Do not use this function to merge dictionaries with " + - "different number of sequences in them. Found %d and %d sequences", - dict1.getSequences().size(), dict2.getSequences().size())); + "different sequences in them. Sequences must be in the same order as well. Found [%s] and [%s].", + String.join(", ", dict1.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(toList())), + String.join(", ", dict2.getSequences().stream().map(SAMSequenceRecord::getSequenceName).collect(toList())))); } + final SAMSequenceDictionary finalDict = new SAMSequenceDictionary(); - for (final SAMSequenceRecord sequence : dict1.getSequences()) { - final SAMSequenceRecord mergedSequence = new SAMSequenceRecord(sequence.getSequenceName(), - UNKNOWN_SEQUENCE_LENGTH); - finalDict.addSequence(mergedSequence); - final int sequenceIndex = sequence.getSequenceIndex(); - if (!dict2.getSequence(sequenceIndex).getSequenceName().equals(sequence.getSequenceName())) { - throw new IllegalArgumentException(String.format("Non-equal sequence names (%s and %s) at index %d in " + - "the dictionaries.", - sequence.getSequenceName(), dict2.getSequence(sequenceIndex).getSequenceName(), sequenceIndex)); - } + for (int sequenceIndex = 0; sequenceIndex < dict1.getSequences().size(); sequenceIndex++) { + final SAMSequenceRecord s1 = dict1.getSequence(sequenceIndex); + final SAMSequenceRecord s2 = dict2.getSequence(sequenceIndex); - final Set allTags = new HashSet<>(dict1.getSequence(sequenceIndex).getAttributes() - .stream().map(Map.Entry::getKey).collect(Collectors.toSet())); - allTags.addAll(dict2.getSequence(sequenceIndex).getAttributes() - .stream().map(Map.Entry::getKey).collect(Collectors.toSet())); + final String sName = s1.getSequenceName(); + final SAMSequenceRecord sMerged = new SAMSequenceRecord(sName, UNKNOWN_SEQUENCE_LENGTH); + finalDict.addSequence(sMerged); + + final Set allTags = new HashSet<>(); + s1.getAttributes().stream().forEach(a -> allTags.add(a.getKey())); + s2.getAttributes().stream().forEach(a -> allTags.add(a.getKey())); for (final String tag : allTags) { - final String value1 = dict1.getSequence(sequenceIndex).getAttribute(tag); - final String value2 = dict2.getSequence(sequenceIndex).getAttribute(tag); + final String value1 = s1.getAttribute(tag); + final String value2 = s2.getAttribute(tag); if (value1 != null && value2 != null && !value1.equals(value2)) { + String baseMessage = String.format("Found sequence entry for which " + + "tags differ: %s and tag %s has the two values: %s and %s.", + sName, tag, value1, value2); + if (tagsToMatch.contains(tag)) { - final String error = String.format("Cannot merge the two dictionaries. Found sequence entry for which " + - "tag values differ: Sequence %s at tag %s has the values: %s and %s", - dict1.getSequence(sequenceIndex).getSequenceName(), - tag, - value1, - value2); - - log.error(error); - throw new IllegalArgumentException(error); + log.error("Cannot merge dictionaries. ", baseMessage); + throw new IllegalArgumentException("Cannot merge dictionaries. " + baseMessage); } else { - log.warn("Found sequence entry for which " + - "tags differ: %s and tag %s has the two values: %s and %s. Using Value %s", - dict1.getSequence(sequenceIndex).getSequenceName(), - tag, - value1, - value2, value1); + log.warn(baseMessage, " Using ", value1); } } - mergedSequence.setAttribute(tag, value1 == null ? value2 : value1); + sMerged.setAttribute(tag, value1 == null ? value2 : value1); } - final int length1 = dict1.getSequence(sequenceIndex).getSequenceLength(); - final int length2 = dict2.getSequence(sequenceIndex).getSequenceLength(); + final int length1 = s1.getSequenceLength(); + final int length2 = s2.getSequenceLength(); if (length1 != UNKNOWN_SEQUENCE_LENGTH && length2 != UNKNOWN_SEQUENCE_LENGTH && length1 != length2) { - if (tagsToMatch.contains(SAMSequenceRecord.SEQUENCE_LENGTH_TAG)) { - final String error = String.format("Cannot merge the two dictionaries. Found sequence entry for which " + - "lengths differ: %s has lengths %s and %s", - dict1.getSequence(sequenceIndex).getSequenceName(), - length1, length2); - - log.error(error); - throw new IllegalArgumentException(error); - } else { - log.warn("Found sequence entry for which " + - "lengths differ: Sequence %s has lengths %s and %s. Using Value %s", - dict1.getSequence(sequenceIndex).getSequenceName(), - length1, length2, length1); - } + throw new IllegalArgumentException(String.format("Cannot merge the two dictionaries. " + + "Found sequence entry for which " + "lengths differ: %s has lengths %s and %s", sName, length1, length2)); } - mergedSequence.setSequenceLength(length1 == UNKNOWN_SEQUENCE_LENGTH ? length2 : length1); + sMerged.setSequenceLength(length1 == UNKNOWN_SEQUENCE_LENGTH ? length2 : length1); } return finalDict; } diff --git a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java index a7d369d48..0b1a50780 100644 --- a/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java +++ b/src/test/java/htsjdk/samtools/SAMSequenceDictionaryTest.java @@ -32,8 +32,10 @@ import java.io.StringReader; import java.io.StringWriter; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.List; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBException; @@ -130,10 +132,10 @@ public void testMergeDictionaries(final SAMSequenceRecord rec1, final SAMSequenc try { SAMSequenceDictionary.mergeDictionaries(dict1, dict2, SAMSequenceDictionary.DEFAULT_DICTIONARY_EQUAL_TAG); } catch (final IllegalArgumentException e) { - if (!canMerge){ + if (canMerge) { + throw new Exception("Expected to be able to merge dictionaries, but wasn't:" , e); + } else { throw e; - } else{ - throw new Exception("Expected to be able to merge dictionaries, but wasn't"); } } if (canMerge){