diff --git a/sam.c b/sam.c index 5e9c20de2..68d27b81d 100644 --- a/sam.c +++ b/sam.c @@ -352,6 +352,44 @@ int32_t bam_endpos(const bam1_t *b) return b->core.pos + 1; } +static int bam_tag2cigar(bam1_t *b) +{ + bam1_core_t *c = &b->core; + uint32_t cigar_st, n_cigar4, CG_st, CG_en, ori_len = b->l_data, *cigar0, CG_len; + uint8_t *CG; + + // test where there is a real CIGAR in the CG tag to move + if (c->n_cigar != 1 || c->tid < 0 || c->pos < 0) return 0; + cigar0 = bam_get_cigar(b); + if (bam_cigar_op(cigar0[0]) != BAM_CSOFT_CLIP || bam_cigar_oplen(cigar0[0]) != c->l_qseq) return 0; + if ((CG = bam_aux_get(b, "CG")) == 0) return 0; // no CG tag + if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I + CG_len = le_to_u32(CG + 2); + if (CG_len == 0) return 0; // nothing to move + + // move from the CG tag to the right position + cigar_st = (uint8_t*)cigar0 - b->data; + c->n_cigar = CG_len; + n_cigar4 = c->n_cigar * 4; + CG_st = CG - b->data - 2; + CG_en = CG_st + 8 + n_cigar4; + b->l_data += n_cigar4 - 4; // we need (c->n_cigar-1)*4 bytes to swap CIGAR to the right place + if (b->m_data < b->l_data) { + uint8_t *new_data; + uint32_t new_max = b->l_data; + kroundup32(new_max); + new_data = (uint8_t*)realloc(b->data, new_max); + if (new_data == 0) return -1; + b->m_data = new_max, b->data = new_data; + } + memmove(b->data + cigar_st + n_cigar4, b->data + cigar_st + 4, ori_len - (cigar_st + 4)); // insert 4*(c->n_cigar-1) empty space to make room + memcpy(b->data + cigar_st, b->data + (n_cigar4 - 4) + CG_st + 8, n_cigar4); // copy the real CIGAR to the right place; -4 for the fake CIGAR + if (ori_len > CG_en) // move data after the CG tag + memmove(b->data + CG_st + n_cigar4 - 4, b->data + CG_en + n_cigar4 - 4, ori_len - CG_en); + b->l_data -= n_cigar4 + 8; // 8: CGBI (4 bytes) and CGBI length (4) + return 1; +} + static inline int aux_type2size(uint8_t type) { switch (type) { @@ -421,6 +459,7 @@ int bam_read1(BGZF *fp, bam1_t *b) bgzf_read(fp, b->data + c->l_qname, b->l_data - c->l_qname) != b->l_data - c->l_qname) return -4; if (fp->is_be) swap_data(c, b->l_data, b->data, 0); + if (bam_tag2cigar(b) < 0) return -4; return 4 + block_len; } @@ -429,15 +468,12 @@ int bam_write1(BGZF *fp, const bam1_t *b) const bam1_core_t *c = &b->core; uint32_t x[8], block_len = b->l_data - c->l_extranul + 32, y; int i, ok; - if (c->n_cigar >= 65536) { - hts_log_error("Too many CIGAR operations (%d >= 64K for QNAME \"%s\")", c->n_cigar, bam_get_qname(b)); - errno = EOVERFLOW; - return -1; - } + if (c->n_cigar > 0xffff) block_len += 12; // "12" for "CGBI", 4-byte tag length and 4-byte fake CIGAR x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); - x[3] = (uint32_t)c->flag<<16 | c->n_cigar; + if (c->n_cigar > 0xffff) x[3] = (uint32_t)c->flag << 16 | 1; + else x[3] = (uint32_t)c->flag << 16 | (c->n_cigar & 0xffff); x[4] = c->l_qseq; x[5] = c->mtid; x[6] = c->mpos; @@ -453,7 +489,22 @@ int bam_write1(BGZF *fp, const bam1_t *b) } if (ok) ok = (bgzf_write(fp, x, 32) >= 0); if (ok) ok = (bgzf_write(fp, b->data, c->l_qname - c->l_extranul) >= 0); - if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + if (c->n_cigar <= 0xffff) { // no long CIGAR; write normally + if (ok) ok = (bgzf_write(fp, b->data + c->l_qname, b->l_data - c->l_qname) >= 0); + } else { // with long CIGAR, insert a fake CIGAR record and move the real CIGAR to the CG:B,I tag + uint8_t buf[4]; + uint32_t cigar_st, cigar_en, cigar1; + cigar_st = (uint8_t*)bam_get_cigar(b) - b->data; + cigar_en = cigar_st + c->n_cigar * 4; + cigar1 = (uint32_t)c->l_qseq << 4 | BAM_CSOFT_CLIP; + u32_to_le(cigar1, buf); + if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write cigar: S + if (ok) ok = (bgzf_write(fp, &b->data[cigar_en], b->l_data - cigar_en) >= 0); // write data after CIGAR + if (ok) ok = (bgzf_write(fp, "CGBI", 4) >= 0); // write CG:B,I + u32_to_le(c->n_cigar, buf); + if (ok) ok = (bgzf_write(fp, buf, 4) >= 0); // write the true CIGAR length + if (ok) ok = (bgzf_write(fp, &b->data[cigar_st], c->n_cigar * 4) >= 0); // write the real CIGAR + } if (fp->is_be) swap_data(c, b->l_data, b->data, 0); return ok? 4 + block_len : -1; } @@ -1143,6 +1194,7 @@ int sam_parse1(kstring_t *s, bam_hdr_t *h, bam1_t *b) } else _parse_err_param(1, "unrecognized type %c", type); } b->data = (uint8_t*)str.s; b->l_data = str.l; b->m_data = str.m; + if (bam_tag2cigar(b) < 0) return -2; return 0; #undef _parse_warn