From 7fbcb902f4430443f12b200949a4f251cdd4d964 Mon Sep 17 00:00:00 2001 From: jklughammer Date: Tue, 9 Aug 2016 18:45:52 +0200 Subject: [PATCH 1/5] add mismatch filter --- bwameth.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/bwameth.py b/bwameth.py index 5b61d67..79178dd 100755 --- a/bwameth.py +++ b/bwameth.py @@ -22,7 +22,7 @@ from subprocess import check_call from operator import itemgetter from itertools import groupby, repeat, chain -import re +import reF try: from itertools import izip @@ -37,6 +37,7 @@ def checkX(cmd): for p in os.environ['PATH'].split(":"): + print(p) if os.access(os.path.join(p, cmd), os.X_OK): break else: @@ -254,7 +255,7 @@ def name(f): def bwa_mem(fa, mfq, extra_args, prefix='bwa-meth', threads=1, rg=None, - calmd=False, paired=True, set_as_failed=None): + calmd=False, paired=True, set_as_failed=None, mismatch_ratio=0.08): conv_fa = convert_fasta(fa, just_name=True) if not is_newer_b(conv_fa, (conv_fa + '.amb', conv_fa + '.sa')): raise BWAMethException("first run bwameth.py index %s" % fa) @@ -270,10 +271,10 @@ def bwa_mem(fa, mfq, extra_args, prefix='bwa-meth', threads=1, rg=None, cmd += "-R '{rg}' -t {threads} {extra_args} {conv_fa} {mfq}" cmd = cmd.format(**locals()) sys.stderr.write("running: %s\n" % cmd.lstrip("|")) - as_bam(cmd, fa, prefix, calmd, set_as_failed) + as_bam(cmd, fa, prefix, calmd, set_as_failed, mismatch_ratio) -def as_bam(pfile, fa, prefix, calmd=False, set_as_failed=None): +def as_bam(pfile, fa, prefix, calmd=False, set_as_failed=None, mismatch_ratio=0.08): """ pfile: either a file or a |process to generate sam output fa: the reference fasta @@ -309,7 +310,7 @@ def as_bam(pfile, fa, prefix, calmd=False, set_as_failed=None): for read_name, pair_list in groupby(bam_iter2, itemgetter(0)): pair_list = [Bam(toks) for toks in pair_list] - for aln in handle_reads(pair_list, set_as_failed): + for aln in handle_reads(pair_list, set_as_failed, mismatch_ratio): out.write(str(aln) + '\n') p.stdin.flush() @@ -336,7 +337,7 @@ def handle_header(toks, out): out.write("\t".join(toks) + "\n") -def handle_reads(alns, set_as_failed): +def handle_reads(alns, set_as_failed, mismatch_ratio=0.08): for aln in alns: orig_seq = aln.original_seq @@ -355,6 +356,16 @@ def handle_reads(alns, set_as_failed): assert direction in 'fr', (direction, aln) aln.other.append('YD:Z:' + direction) + mismatches = int(next((s for s in aln.other if "NM:i:" in s), None).split(":")[2]) + + if mismatches > (aln.longest_match() * mismatch_ratio): + aln.flag |= 0x200 + aln.other.append('MC:Z:' + aln.chrom) + aln.other.append('MP:Z:' + str(aln.pos)) + aln.chrom = "*" + aln.pos = 0 + + if set_as_failed == direction: aln.flag |= 0x200 @@ -579,6 +590,9 @@ def main(args=sys.argv[1:]): p = argparse.ArgumentParser(__doc__) p.add_argument("--reference", help="reference fasta", required=True) p.add_argument("-t", "--threads", type=int, default=6) + p.add_argument("-m", "--mismatch_ratio", type=float, default=0.08, help="Maximum ratio of mismatches to " + "alignment length. Hits with more mismatches" + "will be reported as qc-failed (0x200)") p.add_argument("-p", "--prefix", default="bwa-meth") p.add_argument("--calmd", default=False, action="store_true") p.add_argument("--read-group", help="read-group to add to bam in same" @@ -603,7 +617,8 @@ def main(args=sys.argv[1:]): threads=args.threads, rg=args.read_group or rname(*args.fastqs), calmd=args.calmd, paired=len(args.fastqs) == 2, - set_as_failed=args.set_as_failed) + set_as_failed=args.set_as_failed, + mismatch_ratio=args.mismatch_ratio) def test(): From 8b0d38508e7ba533ce82bad450916b54e0bead21 Mon Sep 17 00:00:00 2001 From: jklughammer Date: Tue, 9 Aug 2016 18:54:51 +0200 Subject: [PATCH 2/5] fix typo --- bwameth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bwameth.py b/bwameth.py index 79178dd..8a5b990 100755 --- a/bwameth.py +++ b/bwameth.py @@ -22,7 +22,7 @@ from subprocess import check_call from operator import itemgetter from itertools import groupby, repeat, chain -import reF +import re try: from itertools import izip From 0d2fe2905c6d6132e04ca090fb1b5a031442faef Mon Sep 17 00:00:00 2001 From: jklughammer Date: Tue, 9 Aug 2016 18:58:19 +0200 Subject: [PATCH 3/5] remove debug-print --- bwameth.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bwameth.py b/bwameth.py index 8a5b990..814a882 100755 --- a/bwameth.py +++ b/bwameth.py @@ -37,7 +37,6 @@ def checkX(cmd): for p in os.environ['PATH'].split(":"): - print(p) if os.access(os.path.join(p, cmd), os.X_OK): break else: From c3216561d0f733aff28426f1926522d0bd67d595 Mon Sep 17 00:00:00 2001 From: jklughammer Date: Tue, 9 Aug 2016 21:06:35 +0200 Subject: [PATCH 4/5] set default -m to 1 --- bwameth.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bwameth.py b/bwameth.py index 814a882..f6d8777 100755 --- a/bwameth.py +++ b/bwameth.py @@ -254,7 +254,7 @@ def name(f): def bwa_mem(fa, mfq, extra_args, prefix='bwa-meth', threads=1, rg=None, - calmd=False, paired=True, set_as_failed=None, mismatch_ratio=0.08): + calmd=False, paired=True, set_as_failed=None, mismatch_ratio=1): conv_fa = convert_fasta(fa, just_name=True) if not is_newer_b(conv_fa, (conv_fa + '.amb', conv_fa + '.sa')): raise BWAMethException("first run bwameth.py index %s" % fa) @@ -273,7 +273,7 @@ def bwa_mem(fa, mfq, extra_args, prefix='bwa-meth', threads=1, rg=None, as_bam(cmd, fa, prefix, calmd, set_as_failed, mismatch_ratio) -def as_bam(pfile, fa, prefix, calmd=False, set_as_failed=None, mismatch_ratio=0.08): +def as_bam(pfile, fa, prefix, calmd=False, set_as_failed=None, mismatch_ratio=1): """ pfile: either a file or a |process to generate sam output fa: the reference fasta @@ -336,7 +336,7 @@ def handle_header(toks, out): out.write("\t".join(toks) + "\n") -def handle_reads(alns, set_as_failed, mismatch_ratio=0.08): +def handle_reads(alns, set_as_failed, mismatch_ratio=1): for aln in alns: orig_seq = aln.original_seq @@ -589,7 +589,7 @@ def main(args=sys.argv[1:]): p = argparse.ArgumentParser(__doc__) p.add_argument("--reference", help="reference fasta", required=True) p.add_argument("-t", "--threads", type=int, default=6) - p.add_argument("-m", "--mismatch_ratio", type=float, default=0.08, help="Maximum ratio of mismatches to " + p.add_argument("-m", "--mismatch-ratio", type=float, default=1, help="Maximum ratio of mismatches to " "alignment length. Hits with more mismatches" "will be reported as qc-failed (0x200)") p.add_argument("-p", "--prefix", default="bwa-meth") From 0eda876edd2ff9a26aa436e767ad1a1b3bec0b6c Mon Sep 17 00:00:00 2001 From: jklughammer Date: Tue, 9 Aug 2016 21:18:36 +0200 Subject: [PATCH 5/5] no extra work if -m = 1 --- bwameth.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bwameth.py b/bwameth.py index f6d8777..b432eb9 100755 --- a/bwameth.py +++ b/bwameth.py @@ -355,14 +355,14 @@ def handle_reads(alns, set_as_failed, mismatch_ratio=1): assert direction in 'fr', (direction, aln) aln.other.append('YD:Z:' + direction) - mismatches = int(next((s for s in aln.other if "NM:i:" in s), None).split(":")[2]) - - if mismatches > (aln.longest_match() * mismatch_ratio): - aln.flag |= 0x200 - aln.other.append('MC:Z:' + aln.chrom) - aln.other.append('MP:Z:' + str(aln.pos)) - aln.chrom = "*" - aln.pos = 0 + if mismatch_ratio < 1: + mismatches = int(next((s for s in aln.other if "NM:i:" in s), None).split(":")[2]) + if mismatches > (aln.longest_match() * mismatch_ratio): + aln.flag |= 0x200 + aln.other.append('MC:Z:' + aln.chrom) + aln.other.append('MP:Z:' + str(aln.pos)) + aln.chrom = "*" + aln.pos = 0 if set_as_failed == direction: