From 0e2e8528e89fdbda0c36583968f422cfa419262e Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Thu, 14 Jul 2016 13:51:12 -0400 Subject: [PATCH] Update PairedSingleSampleWf and add JSON --- .../PublicPairedSingleSampleWf_160714.json | 45 ++ .../PublicPairedSingleSampleWf_160714.wdl | 772 +++++++++++++++++++++ scripts/broad_pipelines/README.md | 12 +- .../PublicPairedSingleSampleWf_160624.wdl} | 17 +- 4 files changed, 836 insertions(+), 10 deletions(-) create mode 100644 scripts/broad_pipelines/PublicPairedSingleSampleWf_160714.json create mode 100644 scripts/broad_pipelines/PublicPairedSingleSampleWf_160714.wdl rename scripts/broad_pipelines/{PublicPairedSingleSampleWf-160624.wdl => archive/PublicPairedSingleSampleWf_160624.wdl} (97%) diff --git a/scripts/broad_pipelines/PublicPairedSingleSampleWf_160714.json b/scripts/broad_pipelines/PublicPairedSingleSampleWf_160714.json new file mode 100644 index 0000000..1307903 --- /dev/null +++ b/scripts/broad_pipelines/PublicPairedSingleSampleWf_160714.json @@ -0,0 +1,45 @@ +{ + "##_COMMENT1": "SAMPLE NAME AND UNMAPPED BAMS", + "PairedEndSingleSampleWorkflow.sample_name": "NA12878", + "PairedEndSingleSampleWorkflow.flowcell_unmapped_bams": [ + "/PATH/TO/NA12878.1.20k_reads.bam", + "/PATH/TO/NA12878.2.20k_reads.bam" + ], + "PairedEndSingleSampleWorkflow.final_gvcf_name": "NA12878.g.vcf.gz", + "PairedEndSingleSampleWorkflow.unmapped_bam_suffix": ".bam", + + "##_COMMENT2": "INTERVALS", + "PairedEndSingleSampleWorkflow.scattered_calling_intervals": ["/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0001_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0002_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0003_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0004_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0005_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0006_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0007_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0008_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0009_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0010_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0011_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0012_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0013_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0014_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0015_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0016_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0017_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0018_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0019_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0020_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0021_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0022_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0023_of_50/scattered.interval_list", 
"/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0024_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0025_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0026_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0027_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0028_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0029_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0030_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0031_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0032_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0033_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0034_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0035_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0036_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0037_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0038_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0039_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0040_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0041_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0042_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0043_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0044_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0045_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0046_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0047_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0048_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0049_of_50/scattered.interval_list", "/PATH/TO/hg38/v0/scattered_calling_intervals/temp_0050_of_50/scattered.interval_list"], + "PairedEndSingleSampleWorkflow.wgs_calling_interval_list": "/PATH/TO/hg38/v0/wgs_calling_regions.hg38.interval_list", + + "##_COMMENT2": "OPTIONAL ARGUMENTS", + "PairedEndSingleSampleWorkflow.HaplotypeCaller.contamination": 0, + + "##_COMMENT3": "REFERENCE FILES", + "PairedEndSingleSampleWorkflow.ref_dict": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.dict", + "PairedEndSingleSampleWorkflow.ref_fasta": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.fasta", + "PairedEndSingleSampleWorkflow.ref_fasta_index": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "PairedEndSingleSampleWorkflow.ref_alt": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.fasta.64.alt", + "PairedEndSingleSampleWorkflow.ref_sa": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.fasta.64.sa", + "PairedEndSingleSampleWorkflow.ref_amb": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.fasta.64.amb", + "PairedEndSingleSampleWorkflow.ref_bwt": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.fasta.64.bwt", + "PairedEndSingleSampleWorkflow.ref_ann": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.fasta.64.ann", + "PairedEndSingleSampleWorkflow.ref_pac": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.fasta.64.pac", + + "##_COMMENT4": 
"KNOWN SITES RESOURCES", + "PairedEndSingleSampleWorkflow.dbSNP_vcf": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "PairedEndSingleSampleWorkflow.dbSNP_vcf_index": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "PairedEndSingleSampleWorkflow.known_snps_sites_vcf": "/PATH/TO/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "PairedEndSingleSampleWorkflow.known_snps_sites_vcf_index": "/PATH/TO/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "PairedEndSingleSampleWorkflow.known_indels_sites_vcf": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz", + "PairedEndSingleSampleWorkflow.known_indels_sites_vcf_index": "/PATH/TO/hg38/v0/Homo_sapiens_assembly38.known_indels.vcf.gz.tbi", + + "##_COMMENT5": "DISK SIZES + PREEMPTIBLES", + "PairedEndSingleSampleWorkflow.agg_small_disk": 200, + "PairedEndSingleSampleWorkflow.agg_medium_disk": 300, + "PairedEndSingleSampleWorkflow.agg_large_disk": 400, + "PairedEndSingleSampleWorkflow.agg_preemptible_tries": 3, + "PairedEndSingleSampleWorkflow.flowcell_small_disk": 200, + "PairedEndSingleSampleWorkflow.flowcell_medium_disk": 300, + "PairedEndSingleSampleWorkflow.preemptible_tries": 3, +} diff --git a/scripts/broad_pipelines/PublicPairedSingleSampleWf_160714.wdl b/scripts/broad_pipelines/PublicPairedSingleSampleWf_160714.wdl new file mode 100644 index 0000000..686e43f --- /dev/null +++ b/scripts/broad_pipelines/PublicPairedSingleSampleWf_160714.wdl @@ -0,0 +1,772 @@ +## Copyright Broad Institute, 2016 +## +## This WDL pipeline implements data pre-processing and initial variant calling (GVCF +## generation) according to the GATK Best Practices (June 2016) for germline SNP and +## Indel discovery in human whole-genome sequencing (WGS) data. +## +## Requirements/expectations : +## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format +## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) +## - Input uBAM files must additionally comply with the following requirements: +## - - filenames all have the same suffix (we use ".unmapped.bam") +## - - files must pass validation by ValidateSamFile +## - - reads are provided in query-sorted order +## - - all reads must have an RG tag +## - Reference genome must be Hg38 with ALT contigs +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+ +# TASK DEFINITIONS + +# Get version of BWA +task GetBwaVersion { + command { + /usr/gitc/bwa 2>&1 | \ + grep -e '^Version' | \ + sed 's/Version: //' + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "1 GB" + } + output { + String version = read_string(stdout()) + } +} + +# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment +task SamToFastqAndBwaMem { + File input_bam + String bwa_commandline + String output_bam_basename + File ref_fasta + File ref_fasta_index + File ref_dict + + # This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit), + # listing the reference contigs that are "alternative". + File ref_alt + + File ref_amb + File ref_ann + File ref_bwt + File ref_pac + File ref_sa + Int disk_size + Int preemptible_tries + + command <<< + set -o pipefail + # set the bash variable needed for the command-line + bash_ref_fasta=${ref_fasta} + # if ref_alt has data in it, + if [ -s ${ref_alt} ]; then + java -Xmx3000m -jar /usr/gitc/picard.jar \ + SamToFastq \ + INPUT=${input_bam} \ + FASTQ=/dev/stdout \ + INTERLEAVE=true \ + CLIPPING_ATTRIBUTE=XT \ + CLIPPING_ACTION=2 \ + NON_PF=true | \ + /usr/gitc/${bwa_commandline} /dev/stdin - 2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \ + samtools view -1 - > ${output_bam_basename}.bam && \ + grep -m1 "read .* ALT contigs" ${output_bam_basename}.bwa.stderr.log | \ + grep -v "read 0 ALT contigs" + + # else ref_alt is empty or could not be found + else + exit 1; + fi + >>> + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "14 GB" + cpu: "16" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_bam = "${output_bam_basename}.bam" + File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log" + } +} + +# Merge original input uBAM file with BWA-aligned BAM file +task MergeBamAlignment { + File unmapped_bam + String bwa_commandline + String bwa_version + File aligned_bam + String output_bam_basename + File ref_fasta + File ref_fasta_index + File ref_dict + Int disk_size + Int preemptible_tries + + command { + # set the bash variable needed for the command-line + bash_ref_fasta=${ref_fasta} + java -Xmx3000m -jar /usr/gitc/picard.jar \ + MergeBamAlignment \ + VALIDATION_STRINGENCY=SILENT \ + EXPECTED_ORIENTATIONS=FR \ + ATTRIBUTES_TO_RETAIN=X0 \ + ALIGNED_BAM=${aligned_bam} \ + UNMAPPED_BAM=${unmapped_bam} \ + OUTPUT=${output_bam_basename}.bam \ + REFERENCE_SEQUENCE=${ref_fasta} \ + PAIRED_RUN=true \ + SORT_ORDER="unsorted" \ + IS_BISULFITE_SEQUENCE=false \ + ALIGNED_READS_ONLY=false \ + CLIP_ADAPTERS=false \ + MAX_RECORDS_IN_RAM=2000000 \ + ADD_MATE_CIGAR=true \ + MAX_INSERTIONS_OR_DELETIONS=-1 \ + PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ + PROGRAM_RECORD_ID="bwamem" \ + PROGRAM_GROUP_VERSION="${bwa_version}" \ + PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \ + PROGRAM_GROUP_NAME="bwamem" \ + UNMAP_CONTAMINANT_READS=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "3500 MB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_bam = "${output_bam_basename}.bam" + } +} + +# Sort BAM file by coordinate order and fix tag values for NM and UQ +task SortAndFixTags { + File input_bam + String output_bam_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + Int preemptible_tries + + command { + java -Xmx4000m -jar 
/usr/gitc/picard.jar \ + SortSam \ + INPUT=${input_bam} \ + OUTPUT=/dev/stdout \ + SORT_ORDER="coordinate" \ + CREATE_INDEX=false \ + CREATE_MD5_FILE=false | \ + java -Xmx500m -jar /usr/gitc/picard.jar \ + SetNmAndUqTags \ + INPUT=/dev/stdin \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true \ + REFERENCE_SEQUENCE=${ref_fasta} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + disks: "local-disk " + disk_size + " HDD" + cpu: "1" + memory: "5000 MB" + preemptible: preemptible_tries + } + output { + File output_bam = "${output_bam_basename}.bam" + File output_bam_index = "${output_bam_basename}.bai" + File output_bam_md5 = "${output_bam_basename}.bam.md5" + } +} + +# Mark duplicate reads to avoid counting non-independent observations +task MarkDuplicates { + Array[File] input_bams + String output_bam_basename + String metrics_filename + Int disk_size + + # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly + # This works because the output of BWA is query-grouped, and thus so is the output of MergeBamAlignment. + # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" + command { + java -Xmx4000m -jar /usr/gitc/picard.jar \ + MarkDuplicates \ + INPUT=${sep=' INPUT=' input_bams} \ + OUTPUT=${output_bam_basename}.bam \ + METRICS_FILE=${metrics_filename} \ + VALIDATION_STRINGENCY=SILENT \ + OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ + ASSUME_SORT_ORDER="queryname" \ + CREATE_MD5_FILE=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "7 GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_bam = "${output_bam_basename}.bam" + File duplicate_metrics = "${metrics_filename}" + } +} + +# Generate sets of intervals for scatter-gathering over chromosomes +task CreateSequenceGroupingTSV { + File ref_dict + Int preemptible_tries + + # Use python to create the Sequence Groupings used for the BQSR and ApplyBQSR scatters. It outputs to stdout + # where it is parsed into a wdl Array[Array[String]] + # e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]]
[["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] + command <<< + python <>> + runtime { + docker: "python:2.7" + memory: "2 GB" + preemptible: preemptible_tries + } + output { + Array[Array[String]] sequence_grouping = read_tsv(stdout()) + } +} + +# Generate Base Quality Score Recalibration (BQSR) model +task BaseRecalibrator { + File input_bam + File input_bam_index + String recalibration_report_filename + Array[String] sequence_group_interval + File dbSNP_vcf + File dbSNP_vcf_index + File known_snps_sites_vcf + File known_snps_sites_vcf_index + File known_indels_sites_vcf + File known_indels_sites_vcf_index + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + Int preemptible_tries + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ + -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ + -Xloggc:gc_log.log -Dsamjdk.use_async_io=false -Xmx4000m \ + -jar /usr/gitc/GATK4.jar \ + BaseRecalibrator \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --useOriginalQualities \ + -O ${recalibration_report_filename} \ + -knownSites ${dbSNP_vcf} \ + -knownSites ${known_snps_sites_vcf} \ + -knownSites ${known_indels_sites_vcf} \ + -L ${sep=" -L " sequence_group_interval} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "6 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File recalibration_report = "${recalibration_report_filename}" + #this output is only for GOTC STAGING to give some GC statistics to the GATK4 team + #File gc_logs = "gc_log.log" + } +} + +# Apply Base Quality Score Recalibration (BQSR) model +task ApplyBQSR { + File input_bam + File input_bam_index + String output_bam_basename + File recalibration_report + Array[String] sequence_group_interval + File ref_dict + File ref_fasta + File ref_fasta_index + Int disk_size + Int preemptible_tries + + command { + java -XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ + -XX:+PrintGCDetails -Xloggc:gc_log.log -Dsamjdk.use_async_io=false \ + -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx3000m \ + -jar /usr/gitc/GATK4.jar \ + ApplyBQSR \ + --createOutputBamMD5 \ + --addOutputSAMProgramRecord \ + -R ${ref_fasta} \ + -I ${input_bam} \ + --useOriginalQualities \ + -O ${output_bam_basename}.bam \ + -bqsr ${recalibration_report} \ + -SQQ 10 -SQQ 20 -SQQ 30 -SQQ 40 \ + --emit_original_quals \ + -L ${sep=" -L " sequence_group_interval} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "3500 MB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File recalibrated_bam = "${output_bam_basename}.bam" + File recalibrated_bam_checksum = "${output_bam_basename}.bam.md5" + #this output is only for GOTC STAGING to give some GC statistics to the GATK4 team + #File gc_logs = "gc_log.log" + } +} + +# Combine multiple recalibration tables from scattered BaseRecalibrator runs +task GatherBqsrReports { + Array[File] input_bqsr_reports + String output_report_filename + Int disk_size + Int preemptible_tries + + command { + java -Xmx3000m -jar /usr/gitc/GATK4.jar \ + GatherBQSRReports \ + -I ${sep=' -I ' input_bqsr_reports} \ + -O ${output_report_filename} + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "3500 MB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_bqsr_report = "${output_report_filename}" + } +} + +# 
+# Combine multiple recalibrated BAM files from scattered ApplyBQSR runs +task GatherBamFiles { + Array[File] input_bams + File input_unmapped_reads_bam + String output_bam_basename + Int disk_size + Int preemptible_tries + + command { + java -Xmx2000m -jar /usr/gitc/picard.jar \ + GatherBamFiles \ + INPUT=${sep=' INPUT=' input_bams} \ + INPUT=${input_unmapped_reads_bam} \ + OUTPUT=${output_bam_basename}.bam \ + CREATE_INDEX=true \ + CREATE_MD5_FILE=true + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_bam = "${output_bam_basename}.bam" + File output_bam_index = "${output_bam_basename}.bai" + File output_bam_md5 = "${output_bam_basename}.bam.md5" + } +} + +# Call variants on a single sample with HaplotypeCaller to produce a GVCF +task HaplotypeCaller { + File input_bam + File input_bam_index + File interval_list + String gvcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Int disk_size + Int preemptible_tries + + # Tried to find the lowest memory value where it would still work; might change once tested on JES + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK35.jar \ + -T HaplotypeCaller \ + -R ${ref_fasta} \ + -o ${gvcf_basename}.vcf.gz \ + -I ${input_bam} \ + -L ${interval_list} \ + -ERC GVCF \ + --max_alternate_alleles 3 \ + -variant_index_parameter 128000 \ + -variant_index_type LINEAR \ + -contamination ${default=0 contamination} \ + --read_filter OverclippedRead + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "10 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_gvcf = "${gvcf_basename}.vcf.gz" + File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi" + } +} + +# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs +task GatherVCFs { + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_vcf_name + Int disk_size + Int preemptible_tries + + # using MergeVcfs instead of GatherVcfs so we can create indices + # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. 
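+  # For reference, a direct GatherVcfs invocation would look something like
+  #   java -jar picard.jar GatherVcfs INPUT=shard_1.g.vcf.gz INPUT=shard_2.g.vcf.gz OUTPUT=merged.g.vcf.gz
+  # (illustrative only; the filenames are placeholders), but it would leave the
+  # block-compressed output without a .tbi index, hence MergeVcfs below.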
+ command { + java -Xmx2g -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${output_vcf_name} + } + output { + File output_vcf = "${output_vcf_name}" + File output_vcf_index = "${output_vcf_name}.tbi" + } + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "3 GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } +} + +# Convert BAM file to CRAM format +task ConvertToCram { + File input_bam + File ref_fasta + File ref_fasta_index + String output_basename + Int disk_size + + # Note that we are not activating pre-emptible instances for this step yet, + # but we should if it ends up being fairly quick + command <<< + samtools view -C -T ${ref_fasta} ${input_bam} | \ + tee ${output_basename}.cram | \ + md5sum > ${output_basename}.cram.md5 && \ + samtools index ${output_basename}.cram && \ + mv ${output_basename}.cram.crai ${output_basename}.crai + >>> + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" + memory: "3 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } + output { + File output_cram = "${output_basename}.cram" + File output_cram_index = "${output_basename}.crai" + File output_cram_md5 = "${output_basename}.cram.md5" + } +} + +# WORKFLOW DEFINITION +workflow PairedEndSingleSampleWorkflow { + + String sample_name + String final_gvcf_name + Array[File] flowcell_unmapped_bams + String unmapped_bam_suffix + + Array[File] scattered_calling_intervals + File wgs_calling_interval_list + + File ref_fasta + File ref_fasta_index + File ref_dict + File ref_alt + File ref_bwt + File ref_sa + File ref_amb + File ref_ann + File ref_pac + + File dbSNP_vcf + File dbSNP_vcf_index + File known_snps_sites_vcf + File known_snps_sites_vcf_index + File known_indels_sites_vcf + File known_indels_sites_vcf_index + + Int flowcell_small_disk + Int flowcell_medium_disk + Int agg_small_disk + Int agg_medium_disk + Int agg_large_disk + Int preemptible_tries + Int agg_preemptible_tries + + String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 $bash_ref_fasta" + + String recalibrated_bam_basename = sample_name + ".aligned.duplicates_marked.recalibrated" + + # Get the version of BWA to include in the PG record in the header of the BAM produced + # by MergeBamAlignment. 
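+  # For illustration, the @PG record that MergeBamAlignment assembles from these values
+  # looks roughly like this (the VN value shown is an example, not a pinned version;
+  # the real one comes from the GetBwaVersion call below) :
+  #   @PG  ID:bwamem  PN:bwamem  VN:0.7.12-r1039  CL:bwa mem -K 100000000 -p -v 3 -t 16 $bash_ref_fasta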
+ call GetBwaVersion + + # Align flowcell-level unmapped input bams in parallel + scatter (unmapped_bam in flowcell_unmapped_bams) { + + # Because of a wdl/cromwell bug this is not currently valid so we have to sub(sub()) in each task + # String base_name = sub(sub(unmapped_bam, "gs://.*/", ""), unmapped_bam_suffix + "$", "") + + String sub_strip_path = "gs://.*/" + String sub_strip_unmapped = unmapped_bam_suffix + "$" + + # Map reads to reference + call SamToFastqAndBwaMem { + input: + input_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".unmerged", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + ref_alt = ref_alt, + ref_bwt = ref_bwt, + ref_amb = ref_amb, + ref_ann = ref_ann, + ref_pac = ref_pac, + ref_sa = ref_sa, + disk_size = flowcell_medium_disk, + preemptible_tries = preemptible_tries + + } + + # Merge original uBAM and BWA-aligned BAM + call MergeBamAlignment { + input: + unmapped_bam = unmapped_bam, + bwa_commandline = bwa_commandline, + bwa_version = GetBwaVersion.version, + aligned_bam = SamToFastqAndBwaMem.output_bam, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".aligned.unsorted", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = flowcell_medium_disk, + preemptible_tries = preemptible_tries + } + + # Sort and fix tags in the merged BAM + call SortAndFixTags as SortAndFixReadGroupBam { + input: + input_bam = MergeBamAlignment.output_bam, + output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".sorted", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = flowcell_medium_disk, + preemptible_tries = preemptible_tries + } + + } + + # Aggregate aligned+merged flowcell BAM files and mark duplicates + call MarkDuplicates { + input: + input_bams = MergeBamAlignment.output_bam, + output_bam_basename = sample_name + ".aligned.unsorted.duplicates_marked", + metrics_filename = sample_name + ".duplicate_metrics", + disk_size = agg_large_disk + } + + # Sort aggregated+deduped BAM file and fix tags + call SortAndFixTags as SortAndFixSampleBam { + input: + input_bam = MarkDuplicates.output_bam, + output_bam_basename = sample_name + ".aligned.duplicate_marked.sorted", + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_large_disk, + preemptible_tries = 0 + } + + # Create list of sequences for scatter-gather parallelization + call CreateSequenceGroupingTSV { + input: + ref_dict = ref_dict, + preemptible_tries = preemptible_tries + } + + # Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel + scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { + # Generate the recalibration model by interval + call BaseRecalibrator { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + recalibration_report_filename = sample_name + ".recal_data.csv", + sequence_group_interval = subgroup, + dbSNP_vcf = dbSNP_vcf, + dbSNP_vcf_index = dbSNP_vcf_index, + known_snps_sites_vcf = known_snps_sites_vcf, + known_snps_sites_vcf_index = known_snps_sites_vcf_index, + known_indels_sites_vcf = known_indels_sites_vcf, + known_indels_sites_vcf_index = known_indels_sites_vcf_index, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = 
ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + # Apply the recalibration model by interval + call ApplyBQSR { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + output_bam_basename = recalibrated_bam_basename, + recalibration_report = GatherBqsrReports.output_bqsr_report, + sequence_group_interval = subgroup, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + } + + # Merge the recalibration reports resulting from by-interval recalibration + call GatherBqsrReports{ + input: + input_bqsr_reports = BaseRecalibrator.recalibration_report, + output_report_filename = sample_name + ".recal_data.csv", + disk_size = flowcell_small_disk, + preemptible_tries = preemptible_tries + } + + # Do an additional round of recalibration on the unmapped reads (which would otherwise + # be left behind because they're not accounted for in the scatter intervals). This is + # done by running ApplyBQSR with "-L unmapped". + Array[String] unmapped_group_interval = ["unmapped"] + call ApplyBQSR as ApplyBQSRToUnmappedReads { + input: + input_bam = SortAndFixSampleBam.output_bam, + input_bam_index = SortAndFixSampleBam.output_bam_index, + output_bam_basename = recalibrated_bam_basename, + recalibration_report = GatherBqsrReports.output_bqsr_report, + sequence_group_interval = unmapped_group_interval, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + + # Merge the recalibrated BAM files resulting from by-interval recalibration + # TODO: when we have capability of adding elements to arrays, can just have one array + # as an input and add the output of the above task to the scattered printreads bams + call GatherBamFiles { + input: + input_bams = ApplyBQSR.recalibrated_bam, + input_unmapped_reads_bam = ApplyBQSRToUnmappedReads.recalibrated_bam, + output_bam_basename = sample_name, + disk_size = agg_large_disk, + preemptible_tries = agg_preemptible_tries + } + + # Convert the final merged recalibrated BAM file to CRAM format + call ConvertToCram { + input: + input_bam = GatherBamFiles.output_bam, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + output_basename = sample_name, + disk_size = agg_medium_disk + } + + # Call variants in parallel over WGS calling intervals + scatter (subInterval in scattered_calling_intervals) { + + # Generate GVCF by interval + call HaplotypeCaller { + input: + input_bam = GatherBamFiles.output_bam, + input_bam_index = GatherBamFiles.output_bam_index, + interval_list = subInterval, + gvcf_basename = sample_name, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + } + + # Combine by-interval GVCFs into a single sample GVCF file + call GatherVCFs { + input: + input_vcfs = HaplotypeCaller.output_gvcf, + input_vcfs_indexes = HaplotypeCaller.output_gvcf_index, + output_vcf_name = final_gvcf_name, + disk_size = agg_small_disk, + preemptible_tries = agg_preemptible_tries + } + + # Outputs that will be retained when execution is complete + output { + MarkDuplicates.duplicate_metrics + GatherBqsrReports.* + ConvertToCram.* + GatherVCFs.* + } + +} diff --git a/scripts/broad_pipelines/README.md b/scripts/broad_pipelines/README.md index 
759d538..705bf51 100644 --- a/scripts/broad_pipelines/README.md +++ b/scripts/broad_pipelines/README.md @@ -1,10 +1,12 @@ This directory contains the following WDL scripts used in production at -the Broad Institute. Versions are distinguished with a datestamp -(format: YYMMDD). +the Broad Institute. Old versions are stored in the `archive` directory. +Versions are distinguished with a datestamp (format: YYMMDD). Each WDL +script is accompanied by a JSON file of example inputs bearing the same +name, with the `.json` extension instead of `.wdl`. ####`PublicPairedSingleSampleWf-<datestamp>.wdl` This WDL pipeline implements data pre-processing and initial variant -calling (GVCF generation) according to the GATK Best Practices (June -2016) for germline SNP and Indel discovery in human whole-genome -sequencing (WGS) data. +calling (GVCF generation) according to the GATK Best Practices for +germline SNP and Indel discovery in human whole-genome sequencing (WGS) +data. diff --git a/scripts/broad_pipelines/PublicPairedSingleSampleWf-160624.wdl b/scripts/broad_pipelines/archive/PublicPairedSingleSampleWf_160624.wdl similarity index 97% rename from scripts/broad_pipelines/PublicPairedSingleSampleWf-160624.wdl rename to scripts/broad_pipelines/archive/PublicPairedSingleSampleWf_160624.wdl index eea15d0..f6a76ec 100644 --- a/scripts/broad_pipelines/PublicPairedSingleSampleWf-160624.wdl +++ b/scripts/broad_pipelines/archive/PublicPairedSingleSampleWf_160624.wdl @@ -16,6 +16,14 @@ ## ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. ## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. # TASK DEFINITIONS @@ -258,7 +266,6 @@ task CreateSequenceGroupingTSV { } } - # Generate Base Quality Score Recalibration (BQSR) model task BaseRecalibrator { File input_bam @@ -390,7 +397,7 @@ task GatherBamFiles { CREATE_INDEX=true \ CREATE_MD5_FILE=true - } + } runtime { docker: "broadinstitute/genomes-in-the-cloud:2.2.2-1466113830" memory: "3 GB" @@ -572,8 +579,7 @@ workflow PairedEndSingleSampleWorkflow { ref_sa = ref_sa, disk_size = flowcell_medium_disk, preemptible_tries = preemptible_tries - - } + } # Merge original uBAM and BWA-aligned BAM call MergeBamAlignment { @@ -601,7 +607,8 @@ workflow PairedEndSingleSampleWorkflow { disk_size = flowcell_medium_disk, preemptible_tries = preemptible_tries } - + } + # Aggregate aligned+merged flowcell BAM files and mark duplicates call MarkDuplicates { input: