From c4b84f5f86ba8d2524d5ae77fd05e30926490ce5 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Mon, 6 Mar 2017 01:52:59 -0500 Subject: [PATCH 1/2] Updated revert script to split RGs up front for scalability --- ...RevertBamToUnmappedRGBamsWf_170306.inputs.json} | 0 ....wdl => RevertBamToUnmappedRGBamsWf_170306.wdl} | 67 +++++++++++++++++----- 2 files changed, 52 insertions(+), 15 deletions(-) rename scripts/broad_dsde_workflows/{RevertBamToUnmappedRGBamsWf_170107.inputs.json => RevertBamToUnmappedRGBamsWf_170306.inputs.json} (100%) rename scripts/broad_dsde_workflows/{RevertBamToUnmappedRGBamsWf_170107.wdl => RevertBamToUnmappedRGBamsWf_170306.wdl} (55%) diff --git a/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.inputs.json b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.inputs.json similarity index 100% rename from scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.inputs.json rename to scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.inputs.json diff --git a/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.wdl b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.wdl similarity index 55% rename from scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.wdl rename to scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.wdl index b75682b..48327d8 100644 --- a/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170107.wdl +++ b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.wdl @@ -26,20 +26,46 @@ # TASK DEFINITIONS -# Revert a BAM to uBAMs, one per readgroup -task RevertBamToUnmappedRGBams { +# Split sample BAM into per-readgroup BAMs +task SplitReadsByRG { File input_bam - String output_dir + File input_bam_index + Int disk_size + String mem_size + + command { + java -Xmx4000m -jar /usr/gitc/GATK4.jar \ + SplitReads \ + -I ${input_bam} \ + -O . 
\ + -RG + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" + disks: "local-disk " + disk_size + " HDD" + memory: mem_size + } + output { + Array[File] readgroup_bams = glob("*.bam") + } +} + +# Revert a BAM to uBAM +task RevertBamToUnmapped { + File input_bam + String output_basename Float? max_discard_pct Int disk_size String mem_size + String output_name = "${output_basename}.bam" + command { - java -Xmx1000m -jar /usr/gitc/picard.jar \ + java -Xmx4000m -jar /usr/gitc/picard.jar \ RevertSam \ INPUT=${input_bam} \ - O=${output_dir} \ - OUTPUT_BY_READGROUP=true \ + O=${output_name} \ + OUTPUT_BY_READGROUP=false \ VALIDATION_STRINGENCY=LENIENT \ SANITIZE=TRUE \ MAX_DISCARD_FRACTION=${max_discard_pct} \ @@ -47,12 +73,12 @@ task RevertBamToUnmappedRGBams { SORT_ORDER=queryname } runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.3-1469027018" + docker: "broadinstitute/genomes-in-the-cloud:2.2.5-1486412288" disks: "local-disk " + disk_size + " HDD" memory: mem_size } output { - Array[File] unmapped_bams = glob("*.bam") + File unmapped_bam = "${output_name}" } } @@ -61,17 +87,28 @@ workflow RevertBamToUnmappedRGBamsWf { File input_bam File ref_fasta File ref_fasta_index - String output_dir - # Revert inputs to unmapped - call RevertBamToUnmappedRGBams { - input: - input_bam = input_bam, - output_dir = output_dir + # Split input BAM by readgroup + call SplitReadsByRG { + input: + input_bam = input_bam + } + + scatter (readgroup_bam in SplitReadsByRG.readgroup_bams) { + + String sub_strip_path = "gs://.*/" + String sub_strip_suffix = ".bam$" + + # Revert readgroup BAMs to unmapped + call RevertBamToUnmapped { + input: + input_bam = readgroup_bam, + output_basename = sub(sub(readgroup_bam, sub_strip_path, ""), sub_strip_suffix, "") + ".unmapped" + } } # Outputs that will be retained when execution is complete output { - Array[File] unmapped_bams_output=RevertBamToUnmappedRGBams.unmapped_bams + Array[File] 
unmapped_bams_output=RevertBamToUnmapped.unmapped_bam } } From 0ce40d3412e657bfc9fe8f586ef7d8c36a54a184 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Mon, 6 Mar 2017 02:20:55 -0500 Subject: [PATCH 2/2] fixup --- .../RevertBamToUnmappedRGBamsWf_170306.inputs.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.inputs.json b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.inputs.json index 5511207..51d8fc1 100644 --- a/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.inputs.json +++ b/scripts/broad_dsde_workflows/RevertBamToUnmappedRGBamsWf_170306.inputs.json @@ -4,13 +4,13 @@ "RevertBamToUnmappedRGBamsWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", "RevertBamToUnmappedRGBamsWf.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bam", + "RevertBamToUnmappedRGBamsWf.SplitReadsByRG.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bai", + + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmapped.max_discard_pct": 0.01, - "RevertBamToUnmappedRGBamsWf.output_dir": ".", - - "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.max_discard_pct": 0.01, + "RevertBamToUnmappedRGBamsWf.SplitReadsByRG.disk_size": 10, + "RevertBamToUnmappedRGBamsWf.SplitReadsByRG.mem_size": "1 GB", - "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.disk_size": 10, - "RevertBamToUnmappedRGBamsWf.RevertBamToUnmappedRGBams.mem_size": "1 GB", - "RevertBamToUnmappedRGBamsWf.SortBamByQueryname.disk_size": 10, - "RevertBamToUnmappedRGBamsWf.SortBamByQueryname.mem_size": "3500 MB" + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmapped.disk_size": 10, + "RevertBamToUnmappedRGBamsWf.RevertBamToUnmapped.mem_size": "1 GB" }