From 31b71952737d453608e94c37e7a0e96027956fad Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Tue, 16 Aug 2016 16:36:08 +0200 Subject: [PATCH 1/5] extract suppl alignments --- src/main/java/htsjdk/samtools/SAMUtils.java | 69 +++++++++++++++++++++++++ src/test/java/htsjdk/samtools/SAMUtilsTest.java | 55 +++++++++++++++++++- 2 files changed, 123 insertions(+), 1 deletion(-) diff --git a/src/main/java/htsjdk/samtools/SAMUtils.java b/src/main/java/htsjdk/samtools/SAMUtils.java index b31b7718c..7c9fc162b 100644 --- a/src/main/java/htsjdk/samtools/SAMUtils.java +++ b/src/main/java/htsjdk/samtools/SAMUtils.java @@ -41,6 +41,7 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; +import java.util.regex.Pattern; /** @@ -1096,4 +1097,72 @@ public static SAMRecord clipOverlappingAlignedBases(final SAMRecord record, fina public static boolean isValidUnsignedIntegerAttribute(long value) { return value >= 0 && value <= BinaryCodec.MAX_UINT; } + + /** + * Extract a List of 'other canonical alignments' from a SAM record. Those alignments are stored as a string in the 'SA' tag as defined + * in the SAM specification. + * Each record in the List is a non-paired read. + * The name, sequence and qualities are copied from the original recordabsent. + * The SAM flag is set to SUPPLEMENTARY_ALIGNMENT (+ READ_REVERSE_STRAND ) + * @param record must be non null and must have a non-null associated header. + * @return a list of 'other canonical alignments' SAMRecords. The list is empty if the 'SA' attribute is missing. + */ + public static List getOtherCanonicalAlignments(final SAMRecord record) { + final Pattern semiColonPattern = Pattern.compile("[;]"); + final Pattern commaPattern = Pattern.compile("[,]"); + if( record == null ) throw new IllegalArgumentException("record is null"); + if( record.getHeader() == null ) throw new IllegalArgumentException("record.getHeader() is null"); + /* extract value of SA tag */ + final Object saValue = record.getAttribute( SAMTagUtil.getSingleton().SA ); + if( saValue == null ) return Collections.emptyList(); + if( ! (saValue instanceof String) ) throw new SAMException( + "Expected a String for attribute 'SA' but got " + saValue.getClass() ); + + final SAMRecordFactory srf = new DefaultSAMRecordFactory(); + + /* the spec says: "Other canonical alignments in a chimeric alignment, formatted as a + * semicolon-delimited list: (rname,pos,strand,CIGAR,mapQ,NM;)+. + * Each element in the list represents a part of the chimeric alignment. + * Conventionally, at a supplementary line, the rst element points to the primary line. + */ + + /* break string using semicolon */ + final String semiColonStrs[] = semiColonPattern.split((String)saValue); + + /* the result list */ + final List alignments = new ArrayList<>( semiColonStrs.length ); + + for( final String semiColonStr : semiColonStrs ) { + /* ignore empty string */ + if( semiColonStr.isEmpty() ) continue; + + /* break string using comma */ + final String commaStrs[] = commaPattern.split(semiColonStr); + if( commaStrs.length < 5 ) throw new SAMException("Bad 'SA' attribute in " + semiColonStr); + + /* create the new record */ + final SAMRecord otherRec = srf.createSAMRecord( record.getHeader() ); + + /* copy fields from the original record */ + otherRec.setReadName( record.getReadName() ); + otherRec.setReadBases( record.getReadBases() ); + otherRec.setBaseQualities( record.getBaseQualities() ); + + /* get reference sequence */ + final int tid = record.getHeader().getSequenceIndex( commaStrs[0] ); + if( tid == -1 ) throw new SAMException("Unknown contig in " + semiColonStr); + otherRec.setReferenceIndex( tid ); + + /* fill other fields */ + otherRec.setAlignmentStart( Integer.parseInt(commaStrs[1]) ); + otherRec.setFlags( + SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag + + (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag) ); + otherRec.setCigar( TextCigarCodec.decode( commaStrs[3] ) ); + otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) ); + otherRec.setAttribute(SAMTagUtil.getSingleton().NM, Integer.parseInt(commaStrs[5]) ); + alignments.add( otherRec ); + } + return alignments; + } } diff --git a/src/test/java/htsjdk/samtools/SAMUtilsTest.java b/src/test/java/htsjdk/samtools/SAMUtilsTest.java index 48baf448d..13f3c3cee 100644 --- a/src/test/java/htsjdk/samtools/SAMUtilsTest.java +++ b/src/test/java/htsjdk/samtools/SAMUtilsTest.java @@ -26,7 +26,7 @@ import org.testng.Assert; import org.testng.annotations.Test; -import java.util.Arrays; +import java.util.List; public class SAMUtilsTest { @Test @@ -173,4 +173,57 @@ public void testClippingOfRecordWithMateAtSamePosition() { record.setSecondOfPairFlag(true); Assert.assertEquals(SAMUtils.getNumOverlappingAlignedBasesToClip(record), 10); } + + @Test + public void testOtherCanonicalAlignments() { + // setup the record + final SAMFileHeader header = new SAMFileHeader(); + header.addSequence(new SAMSequenceRecord("1", 1000)); + header.addSequence(new SAMSequenceRecord("2", 1000)); + final SAMRecord record = new SAMRecord(header); + record.setReadPairedFlag(true); + record.setFirstOfPairFlag(true); + record.setCigar(TextCigarCodec.decode("10M")); + record.setReferenceIndex(0); + record.setAlignmentStart(1); + record.setMateReferenceIndex(0); + record.setMateAlignmentStart(1); + record.setReadBases("AAAAAAAAAA".getBytes()); + record.setBaseQualities("##########".getBytes()); + record.setAttribute(SAMTagUtil.getSingleton().SA, + "2,500,+,3S2=1X2=2S,60,1;" + + "1,191,-,8M2S,60,0;"); + + // extract suppl alignments + final List suppl = SAMUtils.getOtherCanonicalAlignments(record); + Assert.assertNotNull(suppl); + Assert.assertEquals(suppl.size(), 2); + + for(final SAMRecord other: suppl) { + Assert.assertFalse(other.getReadUnmappedFlag()); + Assert.assertFalse(other.getReadPairedFlag()); + Assert.assertTrue(other.getSupplementaryAlignmentFlag()); + Assert.assertEquals(other.getReadName(),record.getReadName()); + Assert.assertEquals(other.getReadString(),record.getReadString()); + Assert.assertEquals(other.getBaseQualityString(),record.getBaseQualityString()); + } + + SAMRecord other = suppl.get(0); + Assert.assertEquals(other.getReferenceName(),"2"); + Assert.assertEquals(other.getAlignmentStart(),500); + Assert.assertFalse(other.getReadNegativeStrandFlag()); + Assert.assertEquals(other.getMappingQuality(), 60); + Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),1); + Assert.assertEquals(other.getCigarString(),"3S2=1X2=2S"); + + other = suppl.get(1); + Assert.assertEquals(other.getReferenceName(),"1"); + Assert.assertEquals(other.getAlignmentStart(),191); + Assert.assertTrue(other.getReadNegativeStrandFlag()); + Assert.assertEquals(other.getMappingQuality(), 60); + Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),0); + Assert.assertEquals(other.getCigarString(),"8M2S"); + + } + } From c100ed8a64b6d003b5c256d6eba675ff17b56e6b Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Tue, 16 Aug 2016 16:49:46 +0200 Subject: [PATCH 2/5] typo --- src/main/java/htsjdk/samtools/SAMUtils.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMUtils.java b/src/main/java/htsjdk/samtools/SAMUtils.java index 7c9fc162b..583c72eb3 100644 --- a/src/main/java/htsjdk/samtools/SAMUtils.java +++ b/src/main/java/htsjdk/samtools/SAMUtils.java @@ -1102,7 +1102,7 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { * Extract a List of 'other canonical alignments' from a SAM record. Those alignments are stored as a string in the 'SA' tag as defined * in the SAM specification. * Each record in the List is a non-paired read. - * The name, sequence and qualities are copied from the original recordabsent. + * The name, sequence and qualities are copied from the original record. * The SAM flag is set to SUPPLEMENTARY_ALIGNMENT (+ READ_REVERSE_STRAND ) * @param record must be non null and must have a non-null associated header. * @return a list of 'other canonical alignments' SAMRecords. The list is empty if the 'SA' attribute is missing. @@ -1123,7 +1123,7 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { /* the spec says: "Other canonical alignments in a chimeric alignment, formatted as a * semicolon-delimited list: (rname,pos,strand,CIGAR,mapQ,NM;)+. * Each element in the list represents a part of the chimeric alignment. - * Conventionally, at a supplementary line, the rst element points to the primary line. + * Conventionally, at a supplementary line, the 1rst element points to the primary line. */ /* break string using semicolon */ From 1a8c36a0ba45e55f43bff3d86f76c49626ce745c Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Tue, 16 Aug 2016 16:58:26 +0200 Subject: [PATCH 3/5] variable name --- src/main/java/htsjdk/samtools/SAMUtils.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMUtils.java b/src/main/java/htsjdk/samtools/SAMUtils.java index 583c72eb3..3a0ee2305 100644 --- a/src/main/java/htsjdk/samtools/SAMUtils.java +++ b/src/main/java/htsjdk/samtools/SAMUtils.java @@ -1118,7 +1118,7 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { if( ! (saValue instanceof String) ) throw new SAMException( "Expected a String for attribute 'SA' but got " + saValue.getClass() ); - final SAMRecordFactory srf = new DefaultSAMRecordFactory(); + final SAMRecordFactory samReaderFactory = new DefaultSAMRecordFactory(); /* the spec says: "Other canonical alignments in a chimeric alignment, formatted as a * semicolon-delimited list: (rname,pos,strand,CIGAR,mapQ,NM;)+. @@ -1138,10 +1138,10 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { /* break string using comma */ final String commaStrs[] = commaPattern.split(semiColonStr); - if( commaStrs.length < 5 ) throw new SAMException("Bad 'SA' attribute in " + semiColonStr); + if( commaStrs.length != 6 ) throw new SAMException("Bad 'SA' attribute in " + semiColonStr); /* create the new record */ - final SAMRecord otherRec = srf.createSAMRecord( record.getHeader() ); + final SAMRecord otherRec = samReaderFactory.createSAMRecord( record.getHeader() ); /* copy fields from the original record */ otherRec.setReadName( record.getReadName() ); @@ -1160,7 +1160,9 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag) ); otherRec.setCigar( TextCigarCodec.decode( commaStrs[3] ) ); otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) ); - otherRec.setAttribute(SAMTagUtil.getSingleton().NM, Integer.parseInt(commaStrs[5]) ); + otherRec.setAttribute( SAMTagUtil.getSingleton().NM , Integer.parseInt(commaStrs[5]) ); + + /* add the alignment */ alignments.add( otherRec ); } return alignments; From de3873f108394db43a231fda412fb93c78b98e83 Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Wed, 17 Aug 2016 10:57:53 +0200 Subject: [PATCH 4/5] strand --- src/main/java/htsjdk/samtools/SAMUtils.java | 5 +++++ src/test/java/htsjdk/samtools/SAMUtilsTest.java | 6 ++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMUtils.java b/src/main/java/htsjdk/samtools/SAMUtils.java index 3a0ee2305..e366d30f3 100644 --- a/src/main/java/htsjdk/samtools/SAMUtils.java +++ b/src/main/java/htsjdk/samtools/SAMUtils.java @@ -1162,6 +1162,11 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) ); otherRec.setAttribute( SAMTagUtil.getSingleton().NM , Integer.parseInt(commaStrs[5]) ); + /* if strand is not the same: reverse-complement */ + if( otherRec.getReadNegativeStrandFlag() != record.getReadNegativeStrandFlag() ) { + SAMRecordUtil.reverseComplement(otherRec); + } + /* add the alignment */ alignments.add( otherRec ); } diff --git a/src/test/java/htsjdk/samtools/SAMUtilsTest.java b/src/test/java/htsjdk/samtools/SAMUtilsTest.java index 13f3c3cee..01b38beb3 100644 --- a/src/test/java/htsjdk/samtools/SAMUtilsTest.java +++ b/src/test/java/htsjdk/samtools/SAMUtilsTest.java @@ -204,8 +204,10 @@ public void testOtherCanonicalAlignments() { Assert.assertFalse(other.getReadPairedFlag()); Assert.assertTrue(other.getSupplementaryAlignmentFlag()); Assert.assertEquals(other.getReadName(),record.getReadName()); - Assert.assertEquals(other.getReadString(),record.getReadString()); - Assert.assertEquals(other.getBaseQualityString(),record.getBaseQualityString()); + if( other.getReadNegativeStrandFlag()==record.getReadNegativeStrandFlag()) { + Assert.assertEquals(other.getReadString(),record.getReadString()); + Assert.assertEquals(other.getBaseQualityString(),record.getBaseQualityString()); + } } SAMRecord other = suppl.get(0); From 899227ba6ddca812474f880ffc39c6a784b912e5 Mon Sep 17 00:00:00 2001 From: Pierre Lindenbaum Date: Wed, 17 Aug 2016 22:04:37 +0200 Subject: [PATCH 5/5] fix PR --- src/main/java/htsjdk/samtools/SAMUtils.java | 80 ++++++++++++++++++++----- src/test/java/htsjdk/samtools/SAMUtilsTest.java | 24 +++++++- 2 files changed, 86 insertions(+), 18 deletions(-) diff --git a/src/main/java/htsjdk/samtools/SAMUtils.java b/src/main/java/htsjdk/samtools/SAMUtils.java index e366d30f3..c0432acc8 100644 --- a/src/main/java/htsjdk/samtools/SAMUtils.java +++ b/src/main/java/htsjdk/samtools/SAMUtils.java @@ -48,6 +48,11 @@ * Utilty methods. */ public final class SAMUtils { + /** regex for semicolon, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */ + private static final Pattern SEMICOLON_PAT = Pattern.compile("[;]"); + /** regex for comma, used in {@link SAMUtils#getOtherCanonicalAlignments(SAMRecord)} */ + private static final Pattern COMMA_PAT = Pattern.compile("[,]"); + // Representation of bases, one for when in low-order nybble, one for when in high-order nybble. private static final byte COMPRESSED_EQUAL_LOW = 0; private static final byte COMPRESSED_A_LOW = 1; @@ -1101,15 +1106,11 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { /** * Extract a List of 'other canonical alignments' from a SAM record. Those alignments are stored as a string in the 'SA' tag as defined * in the SAM specification. - * Each record in the List is a non-paired read. - * The name, sequence and qualities are copied from the original record. - * The SAM flag is set to SUPPLEMENTARY_ALIGNMENT (+ READ_REVERSE_STRAND ) + * The name, sequence and qualities, mate data are copied from the original record. * @param record must be non null and must have a non-null associated header. * @return a list of 'other canonical alignments' SAMRecords. The list is empty if the 'SA' attribute is missing. */ public static List getOtherCanonicalAlignments(final SAMRecord record) { - final Pattern semiColonPattern = Pattern.compile("[;]"); - final Pattern commaPattern = Pattern.compile("[,]"); if( record == null ) throw new IllegalArgumentException("record is null"); if( record.getHeader() == null ) throw new IllegalArgumentException("record.getHeader() is null"); /* extract value of SA tag */ @@ -1127,17 +1128,25 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { */ /* break string using semicolon */ - final String semiColonStrs[] = semiColonPattern.split((String)saValue); + final String semiColonStrs[] = SEMICOLON_PAT.split((String)saValue); /* the result list */ final List alignments = new ArrayList<>( semiColonStrs.length ); + + /* base SAM flag */ + int record_flag = record.getFlags() ; + record_flag &= ~SAMFlag.PROPER_PAIR.flag; + record_flag &= ~SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; + record_flag &= ~SAMFlag.READ_REVERSE_STRAND.flag; + - for( final String semiColonStr : semiColonStrs ) { + for(int i=0; i< semiColonStrs.length;++i ) { + final String semiColonStr = semiColonStrs[i]; /* ignore empty string */ if( semiColonStr.isEmpty() ) continue; /* break string using comma */ - final String commaStrs[] = commaPattern.split(semiColonStr); + final String commaStrs[] = COMMA_PAT.split(semiColonStr); if( commaStrs.length != 6 ) throw new SAMException("Bad 'SA' attribute in " + semiColonStr); /* create the new record */ @@ -1147,20 +1156,59 @@ public static boolean isValidUnsignedIntegerAttribute(long value) { otherRec.setReadName( record.getReadName() ); otherRec.setReadBases( record.getReadBases() ); otherRec.setBaseQualities( record.getBaseQualities() ); + if( record.getReadPairedFlag() && !record.getMateUnmappedFlag()) { + otherRec.setMateReferenceIndex( record.getMateReferenceIndex() ); + otherRec.setMateAlignmentStart( record.getMateAlignmentStart() ); + } + /* get reference sequence */ final int tid = record.getHeader().getSequenceIndex( commaStrs[0] ); if( tid == -1 ) throw new SAMException("Unknown contig in " + semiColonStr); otherRec.setReferenceIndex( tid ); - /* fill other fields */ - otherRec.setAlignmentStart( Integer.parseInt(commaStrs[1]) ); - otherRec.setFlags( - SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag + - (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag) ); - otherRec.setCigar( TextCigarCodec.decode( commaStrs[3] ) ); - otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) ); - otherRec.setAttribute( SAMTagUtil.getSingleton().NM , Integer.parseInt(commaStrs[5]) ); + /* fill POS */ + final int alignStart; + try { + alignStart = Integer.parseInt(commaStrs[1]); + } catch( final NumberFormatException err ) { + throw new SAMException("bad POS in "+semiColonStr, err); + } + + otherRec.setAlignmentStart( alignStart ); + + /* set TLEN */ + if( record.getReadPairedFlag() && + !record.getMateUnmappedFlag() && + record.getMateReferenceIndex() == tid ) { + otherRec.setInferredInsertSize( record.getMateAlignmentStart() - alignStart ); + } + + /* set FLAG */ + int other_flag = record_flag; + other_flag |= (commaStrs[2].equals("+") ? 0 : SAMFlag.READ_REVERSE_STRAND.flag) ; + /* spec: Conventionally, at a supplementary line, the 1st element points to the primary line */ + if( !( record.getSupplementaryAlignmentFlag() && i==0 ) ) { + other_flag |= SAMFlag.SUPPLEMENTARY_ALIGNMENT.flag; + } + otherRec.setFlags(other_flag); + + /* set CIGAR */ + otherRec.setCigar( TextCigarCodec.decode( commaStrs[3] ) ); + + /* set MAPQ */ + try { + otherRec.setMappingQuality( Integer.parseInt(commaStrs[4]) ); + } catch (final NumberFormatException err) { + throw new SAMException("bad MAPQ in "+semiColonStr, err); + } + + /* fill NM */ + try { + otherRec.setAttribute( SAMTagUtil.getSingleton().NM , Integer.parseInt(commaStrs[5]) ); + } catch (final NumberFormatException err) { + throw new SAMException("bad NM in "+semiColonStr, err); + } /* if strand is not the same: reverse-complement */ if( otherRec.getReadNegativeStrandFlag() != record.getReadNegativeStrandFlag() ) { diff --git a/src/test/java/htsjdk/samtools/SAMUtilsTest.java b/src/test/java/htsjdk/samtools/SAMUtilsTest.java index 01b38beb3..0fa6b4a69 100644 --- a/src/test/java/htsjdk/samtools/SAMUtilsTest.java +++ b/src/test/java/htsjdk/samtools/SAMUtilsTest.java @@ -188,8 +188,19 @@ public void testOtherCanonicalAlignments() { record.setAlignmentStart(1); record.setMateReferenceIndex(0); record.setMateAlignmentStart(1); + record.setReadPairedFlag(true); + record.setSupplementaryAlignmentFlag(true);//spec says first 'SA' record will be the primary record + + record.setMateReferenceIndex(0); + record.setMateAlignmentStart(100); + record.setInferredInsertSize(99); + record.setReadBases("AAAAAAAAAA".getBytes()); record.setBaseQualities("##########".getBytes()); + // check no alignments if no SA tag */ + Assert.assertEquals(SAMUtils.getOtherCanonicalAlignments(record).size(),0); + + record.setAttribute(SAMTagUtil.getSingleton().SA, "2,500,+,3S2=1X2=2S,60,1;" + "1,191,-,8M2S,60,0;"); @@ -201,8 +212,11 @@ public void testOtherCanonicalAlignments() { for(final SAMRecord other: suppl) { Assert.assertFalse(other.getReadUnmappedFlag()); - Assert.assertFalse(other.getReadPairedFlag()); - Assert.assertTrue(other.getSupplementaryAlignmentFlag()); + Assert.assertTrue(other.getReadPairedFlag()); + Assert.assertFalse(other.getMateUnmappedFlag()); + Assert.assertEquals(other.getMateAlignmentStart(),record.getMateAlignmentStart()); + Assert.assertEquals(other.getMateReferenceName(),record.getMateReferenceName()); + Assert.assertEquals(other.getReadName(),record.getReadName()); if( other.getReadNegativeStrandFlag()==record.getReadNegativeStrandFlag()) { Assert.assertEquals(other.getReadString(),record.getReadString()); @@ -211,20 +225,26 @@ public void testOtherCanonicalAlignments() { } SAMRecord other = suppl.get(0); + Assert.assertFalse(other.getSupplementaryAlignmentFlag());//1st of suppl and 'record' is supplementary Assert.assertEquals(other.getReferenceName(),"2"); Assert.assertEquals(other.getAlignmentStart(),500); Assert.assertFalse(other.getReadNegativeStrandFlag()); Assert.assertEquals(other.getMappingQuality(), 60); Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),1); Assert.assertEquals(other.getCigarString(),"3S2=1X2=2S"); + Assert.assertEquals(other.getInferredInsertSize(),0); + other = suppl.get(1); + Assert.assertTrue(other.getSupplementaryAlignmentFlag()); Assert.assertEquals(other.getReferenceName(),"1"); Assert.assertEquals(other.getAlignmentStart(),191); Assert.assertTrue(other.getReadNegativeStrandFlag()); Assert.assertEquals(other.getMappingQuality(), 60); Assert.assertEquals(other.getAttribute(SAMTagUtil.getSingleton().NM),0); Assert.assertEquals(other.getCigarString(),"8M2S"); + Assert.assertEquals(other.getInferredInsertSize(),-91);//100(mate) - 191(other) + }