From 71f06db720b81e14076e1af5bf97708f16c11a2b Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Jul 2017 11:20:10 -0400 Subject: [PATCH 1/5] support CIGARs with >64k operations See also samtools/hts-spec#40. --- sam.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/sam.c b/sam.c index 5e9c20de2..f1cd8800c 100644 --- a/sam.c +++ b/sam.c @@ -352,6 +352,33 @@ int32_t bam_endpos(const bam1_t *b) return b->core.pos + 1; } +static void bam_tag2cigar(bam1_t *b) +{ + bam1_core_t *c = &b->core; + uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data; + uint8_t *CG; + if (c->n_cigar > 0 || !(c->flag & BAM_FUNMAP) || c->tid < 0 || c->pos < 0) return; + if ((CG = bam_aux_get(b, "CG")) == 0) return; + if (CG[0] != 'B' || CG[1] != 'I') return; + cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; + c->n_cigar = *(uint32_t*)(CG + 2); + n_cigar4 = c->n_cigar * 4; + CG_st = CG - b->data - 2; + CG_en = CG_st + 8 + n_cigar4; + b->l_data += n_cigar4; + if (b->m_data < b->l_data) { + b->m_data = b->l_data; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st, ori_len - cigar_st); // make room for the real CIGAR + memcpy(b->data + cigar_st, b->data + n_cigar4 + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place + if (ori_len > CG_en) + memmove(b->data + CG_st + n_cigar4, b->data + CG_en + n_cigar4, ori_len - CG_en); + b->l_data -= n_cigar4 + 8; + c->flag &= ~BAM_FUNMAP; +} + static inline int aux_type2size(uint8_t type) { switch (type) { @@ -421,6 +448,7 @@ int bam_read1(BGZF *fp, bam1_t *b) bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); + bam_tag2cigar(b); return 4 + block_len; } @@ -429,15 +457,12 @@ int bam_write1(BGZF *fp, const bam1_t *b) const bam1_core_t *c = &b->core; uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; - if (c->n_cigar >= 65536) { - hts_log_error("Too many CIGAR operations (%d >= 64K for QNAME \"%s\")", c->n_cigar, bam_get_qname(b)); - errno = EOVERFLOW; - return -1; - } + if (c->n_cigar > 0xffff) block_len += 8; // "8" for "CGBI" plus 4-byte tag length x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); - x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + if (c->n_cigar > 0xffff) x[3] = (uint32_t)(c->flag | BAM_FUNMAP) << 16; + else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); x[4] = c->l_qseq; x[5] = c->mtid; x[6] = c->mpos; @@ -453,7 +478,18 @@ int bam_write1(BGZF *fp, const bam1_t *b) } if (ok) ok = (bgzf_write(fp, x, 32) >= 0); if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); - if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + if (c->n_cigar <= 0xffff) { + if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + } else { + uint32_t cigar_st, cigar_en; + cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; + cigar_en = cigar_st + c->n_cigar * 4; + if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR + memcpy(&x[0], "CGBI", 4); + x[1] = c->n_cigar; + if (ok) ok = (bgzf_write(fp, x, 8) >= 0); // write CG:B:I and length + if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR + } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); return ok? 4 + block_len : -1; } @@ -1143,6 +1179,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; + bam_tag2cigar(b); return 0; #undef _parse_warn From 860ebdd796c0c19bb7bf96f476b99a292f796480 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Jul 2017 11:49:36 -0400 Subject: [PATCH 2/5] fixed wrong byte-order on big endian --- sam.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sam.c b/sam.c index f1cd8800c..d649b2e62 100644 --- a/sam.c +++ b/sam.c @@ -485,9 +485,13 @@ int bam_write1(BGZF *fp, const bam1_t *b) cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; cigar_en = cigar_st + c->n_cigar * 4; if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR - memcpy(&x[0], "CGBI", 4); - x[1] = c->n_cigar; - if (ok) ok = (bgzf_write(fp, x, 8) >= 0); // write CG:B:I and length + if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I + if (fp->is_be) { + y = c->n_cigar; + if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); + } else { + if (ok) ok = (bgzf_write(fp, &c->n_cigar, 4) >= 0); + } if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); From 02274067ab222c4197869fd2bfaf8162e8b0195f Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Jul 2017 11:51:00 -0400 Subject: [PATCH 3/5] changed TAB to 4 spaces --- sam.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sam.c b/sam.c index d649b2e62..fa97e9357 100644 --- a/sam.c +++ b/sam.c @@ -485,13 +485,13 @@ int bam_write1(BGZF *fp, const bam1_t *b) cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; cigar_en = cigar_st + c->n_cigar * 4; if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR - if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I - if (fp->is_be) { - y = c->n_cigar; - if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); - } else { - if (ok) ok = (bgzf_write(fp, &c->n_cigar, 4) >= 0); - } + if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I + if (fp->is_be) { + y = c->n_cigar; + if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); + } else { + if (ok) ok = (bgzf_write(fp, &c->n_cigar, 4) >= 0); + } if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); From 6cb3dcd48144553c613e065a2b41423f7511fb13 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 6 Jul 2017 12:28:02 -0400 Subject: [PATCH 4/5] check realloc return; fixed an endianness issue --- sam.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/sam.c b/sam.c index fa97e9357..fdef51aa3 100644 --- a/sam.c +++ b/sam.c @@ -352,24 +352,27 @@ int32_t bam_endpos(const bam1_t *b) return b->core.pos + 1; } -static void bam_tag2cigar(bam1_t *b) +static int bam_tag2cigar(bam1_t *b) { bam1_core_t *c = &b->core; uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data; uint8_t *CG; - if (c->n_cigar > 0 || !(c->flag & BAM_FUNMAP) || c->tid < 0 || c->pos < 0) return; - if ((CG = bam_aux_get(b, "CG")) == 0) return; - if (CG[0] != 'B' || CG[1] != 'I') return; + if (c->n_cigar > 0 || !(c->flag & BAM_FUNMAP) || c->tid < 0 || c->pos < 0) return 0; + if ((CG = bam_aux_get(b, "CG")) == 0) return 0; + if (CG[0] != 'B' || CG[1] != 'I') return 0; cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; - c->n_cigar = *(uint32_t*)(CG + 2); + c->n_cigar = le_to_u32(CG + 2); n_cigar4 = c->n_cigar * 4; CG_st = CG - b->data - 2; CG_en = CG_st + 8 + n_cigar4; b->l_data += n_cigar4; if (b->m_data < b->l_data) { - b->m_data = b->l_data; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); + uint8_t *new_data; + uint32_t new_max = b->l_data; + kroundup32(new_max); + new_data = (uint8_t*)realloc(b->data, new_max); + if (new_data == 0) return -1; + b->m_data = new_max, b->data = new_data; } memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st, ori_len - cigar_st); // make room for the real CIGAR memcpy(b->data + cigar_st, b->data + n_cigar4 + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place @@ -377,6 +380,7 @@ static void bam_tag2cigar(bam1_t *b) memmove(b->data + CG_st + n_cigar4, b->data + CG_en + n_cigar4, ori_len - CG_en); b->l_data -= n_cigar4 + 8; c->flag &= ~BAM_FUNMAP; + return 1; } static inline int aux_type2size(uint8_t type) @@ -448,7 +452,7 @@ int bam_read1(BGZF *fp, bam1_t *b) bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); - bam_tag2cigar(b); + if (bam_tag2cigar(b) < 0) return -4; return 4 + block_len; } @@ -1183,7 +1187,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; - bam_tag2cigar(b); + if (bam_tag2cigar(b) < 0) return -2; return 0; #undef _parse_warn From 9d2d1a83c3a9ced1c091aecae809a94cd82eaf33 Mon Sep 17 00:00:00 2001 From: Heng Li Date: Thu, 13 Jul 2017 17:48:26 -0400 Subject: [PATCH 5/5] Long-cigar solution with soft-clipping --- sam.c | 55 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/sam.c b/sam.c index fdef51aa3..68d27b81d 100644 --- a/sam.c +++ b/sam.c @@ -355,17 +355,25 @@ int32_t bam_endpos(const bam1_t *b) static int bam_tag2cigar(bam1_t *b) { bam1_core_t *c = &b->core; - uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data; + uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len; uint8_t *CG; - if (c->n_cigar > 0 || !(c->flag & BAM_FUNMAP) || c->tid < 0 || c->pos < 0) return 0; - if ((CG = bam_aux_get(b, "CG")) == 0) return 0; - if (CG[0] != 'B' || CG[1] != 'I') return 0; - cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; - c->n_cigar = le_to_u32(CG + 2); + + // test where there is a real CIGAR in the CG tag to move + if (c->n_cigar != 1 || c->tid < 0 || c->pos < 0) return 0; + cigar0 = bam_get_cigar(b); + if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; + if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag + if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I + CG_len = le_to_u32(CG + 2); + if (CG_len == 0) return 0; // nothing to move + + // move from the CG tag to the right position + cigar_st = (uint8_t*)cigar0 - b->data; + c->n_cigar = CG_len; n_cigar4 = c->n_cigar * 4; CG_st = CG - b->data - 2; CG_en = CG_st + 8 + n_cigar4; - b->l_data += n_cigar4; + b->l_data += n_cigar4 - 4; // we need (c->n_cigar-1)*4 bytes to swap CIGAR to the right place if (b->m_data < b->l_data) { uint8_t *new_data; uint32_t new_max = b->l_data; @@ -374,12 +382,11 @@ static int bam_tag2cigar(bam1_t *b) if (new_data == 0) return -1; b->m_data = new_max, b->data = new_data; } - memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st, ori_len - cigar_st); // make room for the real CIGAR - memcpy(b->data + cigar_st, b->data + n_cigar4 + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place - if (ori_len > CG_en) - memmove(b->data + CG_st + n_cigar4, b->data + CG_en + n_cigar4, ori_len - CG_en); - b->l_data -= n_cigar4 + 8; - c->flag &= ~BAM_FUNMAP; + memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + 4, ori_len - (cigar_st + 4)); // insert 4*(c->n_cigar-1) empty space to make room + memcpy(b->data + cigar_st, b->data + (n_cigar4 - 4) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -4 for the fake CIGAR + if (ori_len > CG_en) // move data after the CG tag + memmove(b->data + CG_st + n_cigar4 - 4, b->data + CG_en + n_cigar4 - 4, ori_len - CG_en); + b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) return 1; } @@ -461,11 +468,11 @@ int bam_write1(BGZF *fp, const bam1_t *b) const bam1_core_t *c = &b->core; uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; - if (c->n_cigar > 0xffff) block_len += 8; // "8" for "CGBI" plus 4-byte tag length + if (c->n_cigar > 0xffff) block_len += 12; // "12" for "CGBI", 4-byte tag length and 4-byte fake CIGAR x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); - if (c->n_cigar > 0xffff) x[3] = (uint32_t)(c->flag | BAM_FUNMAP) << 16; + if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 1; else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); x[4] = c->l_qseq; x[5] = c->mtid; @@ -482,20 +489,20 @@ int bam_write1(BGZF *fp, const bam1_t *b) } if (ok) ok = (bgzf_write(fp, x, 32) >= 0); if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); - if (c->n_cigar <= 0xffff) { + if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); - } else { - uint32_t cigar_st, cigar_en; + } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag + uint8_t buf[4]; + uint32_t cigar_st, cigar_en, cigar1; cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; cigar_en = cigar_st + c->n_cigar * 4; + cigar1 = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; + u32_to_le(cigar1, buf); + if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write cigar: S if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I - if (fp->is_be) { - y = c->n_cigar; - if (ok) ok = (bgzf_write(fp, ed_swap_4p(&y), 4) >= 0); - } else { - if (ok) ok = (bgzf_write(fp, &c->n_cigar, 4) >= 0); - } + u32_to_le(c->n_cigar, buf); + if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR } if (fp->is_be) swap_data(c, b->l_data, b->data, 0);