From 2c5665804e6bb74365caa489e1142fb1bdb5c524 Mon Sep 17 00:00:00 2001 From: Geraldine Van der Auwera Date: Sun, 22 Jan 2017 15:35:50 -0500 Subject: [PATCH] WDLs for segments of the germline short variant calling pipeline - HaplotypeCaller GVCF scatter-gathered across intervals - GenotypeGVCFs scatter-gathered across intervals - VQSR (no scatter) - Joint discovery (GGVCFs through VQSR, no genotype refinement) --- .../GenotypeGVCFsScatterWf_170204.inputs.json | 28 ++ .../GenotypeGVCFsScatterWf_170204.wdl | 175 ++++++++++ ...HaplotypeCallerGvcfScatterWf_170204.inputs.json | 17 + .../HaplotypeCallerGvcfScatterWf_170204.wdl | 147 +++++++++ .../JointDiscoveryWf_170305.inputs.json | 71 ++++ .../JointDiscoveryWf_170305.wdl | 364 +++++++++++++++++++++ .../VariantRecalibrationWf_170305.inputs.json | 56 ++++ .../VariantRecalibrationWf_170305.wdl | 229 +++++++++++++ 8 files changed, 1087 insertions(+) create mode 100644 scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.inputs.json create mode 100644 scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.wdl create mode 100644 scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.inputs.json create mode 100644 scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.wdl create mode 100644 scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json create mode 100644 scripts/broad_dsde_workflows/JointDiscoveryWf_170305.wdl create mode 100644 scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.inputs.json create mode 100644 scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.wdl diff --git a/scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.inputs.json b/scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.inputs.json new file mode 100644 index 0000000..6fbc1c5 --- /dev/null +++ b/scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.inputs.json @@ -0,0 +1,28 @@ +{ + "GenotypeGVCFsScatterWf.cohort_vcf_name": "PlatinumTrio_b37", + + 
"GenotypeGVCFsScatterWf.input_gvcfs": [ + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12877.g.vcf.gz", + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12878.g.vcf.gz", + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12882.g.vcf.gz" + ], + + "GenotypeGVCFsScatterWf.input_gvcf_indices": [ + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12877.g.vcf.gz.tbi", + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12878.g.vcf.gz.tbi", + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12882.g.vcf.gz.tbi" + ], + + "GenotypeGVCFsScatterWf.ref_dict": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict", + "GenotypeGVCFsScatterWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", + "GenotypeGVCFsScatterWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", + + "GenotypeGVCFsScatterWf.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/b37_wgs_scattered_calling_intervals.txt", + + "GenotypeGVCFsScatterWf.UnzipGVCF.disk_size": 100, + "GenotypeGVCFsScatterWf.UnzipGVCF.mem_size": "10 GB", + "GenotypeGVCFsScatterWf.GenotypeGVCFs.disk_size": 100, + "GenotypeGVCFsScatterWf.GenotypeGVCFs.mem_size": "10 GB", + "GenotypeGVCFsScatterWf.MergeVCFs.disk_size": 100, + "GenotypeGVCFsScatterWf.MergeVCFs.mem_size": "10 GB" +} \ No newline at end of file diff --git a/scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.wdl b/scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.wdl new file mode 100644 index 0000000..36eb11b --- /dev/null +++ b/scripts/broad_dsde_workflows/GenotypeGVCFsScatterWf_170204.wdl @@ -0,0 +1,175 @@ +## Copyright Broad Institute, 2017 +## +## This WDL workflow runs GenotypeGVCFs on a set of GVCFs to joint-call multiple +## samples, scattered across intervals. +## +## Requirements/expectations : +## - One or more GVCFs produced by HaplotypeCaller in GVCF mode +## +## Outputs : +## - A VCF file and its index, with genotypes for all samples represented in the +## GVCF inputs. 
+## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Unzip GVCFs +task UnzipGVCF { + File gzipped_gvcf + String unzipped_basename + Int disk_size + String mem_size + + # HACK ALERT! Using .gvcf extension here to force IndexFeatureFile to make the right + # kind of index, but afterward we need to change to .g.vcf which is the correct + # extension for GVCFs. 
+ command <<< + gunzip -c ${gzipped_gvcf} > ${unzipped_basename}.gvcf + java -Xmx2g -jar /usr/gitc/GATK4.jar IndexFeatureFile -F ${unzipped_basename}.gvcf + mv ${unzipped_basename}.gvcf ${unzipped_basename}.g.vcf + mv ${unzipped_basename}.gvcf.idx ${unzipped_basename}.g.vcf.idx + >>> + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + disks: "local-disk " + disk_size + " HDD" + } + + output { + File unzipped_gvcf = "${unzipped_basename}.g.vcf" + File gvcf_index = "${unzipped_basename}.g.vcf.idx" + } +} + +# Perform joint-genotyping +task GenotypeGVCFs { + Array[File] gvcfs + Array[File] gvcf_indices + String vcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + File interval_list + Int disk_size + String mem_size + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK36.jar \ + -T GenotypeGVCFs \ + -R ${ref_fasta} \ + --variant ${sep=' --variant ' gvcfs} \ + -L ${interval_list} \ + -o ${vcf_basename}.vcf.gz + } + + output { + File genotyped_vcf = "${vcf_basename}.vcf.gz" + File genotyped_index = "${vcf_basename}.vcf.gz.tbi" + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } +} + +# Combine multiple VCFs +task MergeVCFs { + Array [File] input_vcfs + Array [File] input_vcfs_indices + String vcf_name + String vcf_index + Int disk_size + String mem_size + + command { + java -Xmx2g -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${vcf_name} + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + disks: "local-disk " + disk_size + " HDD" + } + + output { + File output_vcf = "${vcf_name}" + File output_vcf_index = "${vcf_index}" + } +} + +workflow GenotypeGVCFsScatterWf { + File ref_fasta + File ref_fasta_index + File ref_dict + Array[File] input_gvcfs + Array[File] 
input_gvcf_indices + String cohort_vcf_name + File scattered_calling_intervals_list + + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + + # Unzip GVCFs in parallel + scatter (input_gvcf in input_gvcfs) { + + # Unzip block-compressed VCFs with .gz extension because GenotypeGVCFs + # currently does not handle compressed files. + call UnzipGVCF { + input: + gzipped_gvcf = input_gvcf, + unzipped_basename = "temp_unzipped" + } + } + + # Joint-call variants in parallel over WGS calling intervals + scatter (subInterval in scattered_calling_intervals) { + + # Perform joint genotyping per interval + call GenotypeGVCFs { + input: + gvcfs = UnzipGVCF.unzipped_gvcf, + gvcf_indices = UnzipGVCF.gvcf_index, + vcf_basename = cohort_vcf_name, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + interval_list = subInterval + } + } + + # Merge per-interval VCFs into a single cohort VCF file + call MergeVCFs { + input: + input_vcfs = GenotypeGVCFs.genotyped_vcf, + input_vcfs_indices = GenotypeGVCFs.genotyped_index, + vcf_name = cohort_vcf_name + ".vcf.gz", + vcf_index = cohort_vcf_name + ".vcf.gz.tbi" + } + + # Outputs that will be retained when execution is complete + output { + File output_merged_vcf = MergeVCFs.output_vcf + File output_merged_vcf_index = MergeVCFs.output_vcf_index + } +} diff --git a/scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.inputs.json b/scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.inputs.json new file mode 100644 index 0000000..45ab8b5 --- /dev/null +++ b/scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.inputs.json @@ -0,0 +1,17 @@ +{ + "HaplotypeCallerGvcfScatterWf.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bam", + "HaplotypeCallerGvcfScatterWf.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_20k_b37/NA12878.bai", + + "HaplotypeCallerGvcfScatterWf.ref_dict": 
"gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict", + "HaplotypeCallerGvcfScatterWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", + "HaplotypeCallerGvcfScatterWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", + + "HaplotypeCallerGvcfScatterWf.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/b37_wgs_scattered_calling_intervals.txt", + + "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.interval_padding": 100, + + "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.disk_size": 100, + "HaplotypeCallerGvcfScatterWf.HaplotypeCaller.mem_size": "10 GB", + "HaplotypeCallerGvcfScatterWf.MergeVCFs.disk_size": 100, + "HaplotypeCallerGvcfScatterWf.MergeVCFs.mem_size": "3 GB" +} \ No newline at end of file diff --git a/scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.wdl b/scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.wdl new file mode 100644 index 0000000..5acb19d --- /dev/null +++ b/scripts/broad_dsde_workflows/HaplotypeCallerGvcfScatterWf_170204.wdl @@ -0,0 +1,147 @@ +## Copyright Broad Institute, 2017 +## +## This WDL workflow runs HaplotypeCaller in GVCF mode on a single sample, scattered +## across intervals. +## +## Requirements/expectations : +## - One analysis-ready BAM file for a single sample (as identified in RG:SM) +## - Set of variant calling intervals lists for the scatter, provided in a file +## +## Outputs : +## - One GVCF file and its index +## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. 
Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# HaplotypeCaller per-sample in GVCF mode (JVM heap is fixed at 8000m; mem_size must be larger) +task HaplotypeCaller { + File input_bam + File input_bam_index + String gvcf_name + String gvcf_index + File ref_dict + File ref_fasta + File ref_fasta_index + File interval_list + Int? interval_padding + Int disk_size + String mem_size + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK36.jar \ + -T HaplotypeCaller \ + -R ${ref_fasta} \ + -I ${input_bam} \ + -o ${gvcf_name} \ + -L ${interval_list} \ + -ip ${default=100 interval_padding} \ + -ERC GVCF + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + cpu: "1" + memory: mem_size + disks: "local-disk " + disk_size + " HDD" + } + + output { + File output_gvcf = "${gvcf_name}" + File output_gvcf_index = "${gvcf_index}" + } +} + +# Merge GVCFs generated per-interval for the same sample +task MergeVCFs { + Array [File] input_vcfs + Array [File] input_vcfs_indices + String vcf_name + String vcf_index + Int disk_size + String mem_size + + command { + java -Xmx2g -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${vcf_name} + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + disks: "local-disk " + disk_size + " HDD" + } + + output { + File output_vcf = "${vcf_name}" + File output_vcf_index = "${vcf_index}" + } +} + +# WORKFLOW DEFINITION +workflow HaplotypeCallerGvcfScatterWf { + File input_bam + File input_bam_index + File ref_dict + File ref_fasta + File ref_fasta_index + File scattered_calling_intervals_list + + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + + String 
sub_strip_path = "gs://.*/" + String sub_strip_suffix = ".bam$" + + String sample_basename = sub(sub(input_bam, sub_strip_path, ""), sub_strip_suffix, "") + + String gvcf_name = sample_basename + ".g.vcf.gz" + String gvcf_index = sample_basename + ".g.vcf.gz.tbi" + + # Call variants in parallel over grouped calling intervals + scatter (interval_file in scattered_calling_intervals) { + + # Generate GVCF by interval + call HaplotypeCaller { + input: + input_bam = input_bam, + input_bam_index = input_bam_index, + interval_list = interval_file, + gvcf_name = gvcf_name, + gvcf_index = gvcf_index, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index + } + } + + # Merge per-interval GVCFs + call MergeVCFs { + input: + input_vcfs = HaplotypeCaller.output_gvcf, + input_vcfs_indices = HaplotypeCaller.output_gvcf_index, + vcf_name = gvcf_name, + vcf_index = gvcf_index + } + + # Outputs that will be retained when execution is complete + output { + File output_merged_gvcf = MergeVCFs.output_vcf + File output_merged_gvcf_index = MergeVCFs.output_vcf_index + } +} \ No newline at end of file diff --git a/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json b/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json new file mode 100644 index 0000000..3553eec --- /dev/null +++ b/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.inputs.json @@ -0,0 +1,71 @@ +{ + "JointDiscoveryWf.ref_dict": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict", + "JointDiscoveryWf.ref_fasta": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", + "JointDiscoveryWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", + + "JointDiscoveryWf.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/b37_wgs_scattered_calling_intervals.txt", + + "JointDiscoveryWf.cohort_vcf_name": "PlatinumTrio_b37", + + "JointDiscoveryWf.input_gvcfs": [ + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12877.g.vcf.gz", + 
"gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12878.g.vcf.gz", + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12882.g.vcf.gz" + ], + + "JointDiscoveryWf.input_gvcf_indices": [ + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12877.g.vcf.gz.tbi", + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12878.g.vcf.gz.tbi", + "gs://gatk-test-data/wgs_gvcf/PlatinumTrio_b37/NA12882.g.vcf.gz.tbi" + ], + + "JointDiscoveryWf.resource_files": [ + "gs://gatk-legacy-bundles/b37/hapmap_3.3.b37.vcf", + "gs://gatk-legacy-bundles/b37/1000G_omni2.5.b37.vcf", + "gs://gatk-legacy-bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf", + "gs://gatk-legacy-bundles/b37/dbsnp_138.b37.vcf", + "gs://gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" + ], + "JointDiscoveryWf.resource_indices": [ + "gs://gatk-legacy-bundles/b37/hapmap_3.3.b37.vcf.idx", + "gs://gatk-legacy-bundles/b37/1000G_omni2.5.b37.vcf.idx", + "gs://gatk-legacy-bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf.idx", + "gs://gatk-legacy-bundles/b37/dbsnp_138.b37.vcf.idx", + "gs://gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.idx" + ], + + "JointDiscoveryWf.SNP_resources": [ + "hapmap,known=false,training=true,truth=true,prior=15.0 gatk-legacy-bundles/b37/hapmap_3.3.b37.vcf", + "omni,known=false,training=true,truth=true,prior=12.0 gatk-legacy-bundles/b37/1000G_omni2.5.b37.vcf", + "1000G,known=false,training=true,truth=false,prior=10.0 gatk-legacy-bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf", + "dbsnp,known=true,training=false,truth=false,prior=2.0 gatk-legacy-bundles/b37/dbsnp_138.b37.vcf" + ], + "JointDiscoveryWf.INDEL_resources": [ + "mills,known=false,training=true,truth=true,prior=12.0 gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf", + "dbsnp,known=true,training=false,truth=false,prior=2.0 gatk-legacy-bundles/b37/dbsnp_138.b37.vcf" + ], + + "JointDiscoveryWf.SNP_annotations": ["DP", "QD", "FS", "SOR", "MQ", "MQRankSum", "ReadPosRankSum"], 
+ "JointDiscoveryWf.INDEL_annotations": ["QD", "FS", "SOR", "MQRankSum", "ReadPosRankSum"], + + "JointDiscoveryWf.SNP_filter_level": 99.7, + "JointDiscoveryWf.INDEL_filter_level": 99.7, + + "JointDiscoveryWf.SNP_tranches": [100.0, 99.99, 99.95, 99.9, 99.8, 99.7, 99.6, 99.5, 99.4, 99.3, 99.2, 99.1, 99.0, 98.0, 97.0, 96.0, 95.0, 90.0], + "JointDiscoveryWf.INDEL_tranches": [100.0, 99.99, 99.95, 99.9, 99.8, 99.7, 99.6, 99.5, 99.0, 98.0, 97.0, 96.0, 95.0, 94.0, 93.0, 92.0, 91.0, 90.0], + + "JointDiscoveryWf.UnzipGVCF.disk_size": 100, + "JointDiscoveryWf.UnzipGVCF.mem_size": "10 GB", + "JointDiscoveryWf.GenotypeGVCFs.disk_size": 100, + "JointDiscoveryWf.GenotypeGVCFs.mem_size": "10 GB", + "JointDiscoveryWf.MergeVCFs.disk_size": 100, + "JointDiscoveryWf.MergeVCFs.mem_size": "10 GB", + "JointDiscoveryWf.BuildVQSRModelForSNPs.disk_size": 100, + "JointDiscoveryWf.BuildVQSRModelForSNPs.mem_size": "10 GB", + "JointDiscoveryWf.BuildVQSRModelForINDELs.disk_size": 100, + "JointDiscoveryWf.BuildVQSRModelForINDELs.mem_size": "10 GB", + "JointDiscoveryWf.ApplyRecalibrationFilterForSNPs.disk_size": 100, + "JointDiscoveryWf.ApplyRecalibrationFilterForSNPs.mem_size": "10 GB", + "JointDiscoveryWf.ApplyRecalibrationFilterForINDELs.disk_size": 100, + "JointDiscoveryWf.ApplyRecalibrationFilterForINDELs.mem_size": "10 GB" +} \ No newline at end of file diff --git a/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.wdl b/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.wdl new file mode 100644 index 0000000..1b20f4a --- /dev/null +++ b/scripts/broad_dsde_workflows/JointDiscoveryWf_170305.wdl @@ -0,0 +1,364 @@ +## Copyright Broad Institute, 2017 +## +## This WDL implements the joint discovery and VQSR filtering portion of the GATK +## Best Practices (June 2016) for germline SNP and Indel discovery in human +## whole-genome sequencing (WGS) and exome sequencing data. 
+## +## Requirements/expectations : +## - One or more GVCFs produced by HaplotypeCaller in GVCF mode +## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. +## +## Outputs : +## - A VCF file and its index, filtered using variant quality score recalibration +## (VQSR) with genotypes for all samples present in the input VCF. All sites that +## are present in the input VCF are retained; filtered sites are annotated as such +## in the FILTER field. +## +## Note about VQSR wiring : +## The SNP and INDEL models are built in parallel, but then the corresponding +## recalibrations are applied in series. Because the INDEL model is generally ready +## first (because there are fewer indels than SNPs) we set INDEL recalibration to +## be applied first to the input VCF, while the SNP model is still being built. By +## the time the SNP model is available, the indel-recalibrated file is available to +## serve as input to apply the SNP recalibration. If we did it the other way around, +## we would have to wait until the SNP recal file was available despite the INDEL +## recal file being there already, then apply SNP recalibration, then apply INDEL +## recalibration. This would lead to a longer wall clock time for complete workflow +## execution. Wiring the INDEL recalibration to be applied first solves the problem. +## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. 
Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# TASK DEFINITIONS + +# Unzip GVCFs +task UnzipGVCF { + File gzipped_gvcf + String unzipped_basename + Int disk_size + String mem_size + + # HACK ALERT! Using .gvcf extension here to force IndexFeatureFile to make the right + # kind of index, but afterward we need to change to .g.vcf which is the correct + # extension for GVCFs. + command <<< + gunzip -c ${gzipped_gvcf} > ${unzipped_basename}.gvcf + java -Xmx2g -jar /usr/gitc/GATK4.jar IndexFeatureFile -F ${unzipped_basename}.gvcf + mv ${unzipped_basename}.gvcf ${unzipped_basename}.g.vcf + mv ${unzipped_basename}.gvcf.idx ${unzipped_basename}.g.vcf.idx + >>> + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + disks: "local-disk " + disk_size + " HDD" + } + + output { + File unzipped_gvcf = "${unzipped_basename}.g.vcf" + File gvcf_index = "${unzipped_basename}.g.vcf.idx" + } +} + +# Perform joint-genotyping +task GenotypeGVCFs { + Array[File] gvcfs + Array[File] gvcf_indices + String vcf_basename + File ref_dict + File ref_fasta + File ref_fasta_index + File interval_list + Int disk_size + String mem_size + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK36.jar \ + -T GenotypeGVCFs \ + -R ${ref_fasta} \ + --variant ${sep=' --variant ' gvcfs} \ + -L ${interval_list} \ + -o ${vcf_basename}.vcf.gz + } + + output { + File genotyped_vcf = "${vcf_basename}.vcf.gz" + File genotyped_index = "${vcf_basename}.vcf.gz.tbi" + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + } +} + +# Combine multiple VCFs +task MergeVCFs { + Array [File] input_vcfs + Array [File] input_vcfs_indices + String vcf_name + String vcf_index + Int disk_size + String mem_size + + command { + 
java -Xmx2g -jar /usr/gitc/picard.jar \ + MergeVcfs \ + INPUT=${sep=' INPUT=' input_vcfs} \ + OUTPUT=${vcf_name} + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + disks: "local-disk " + disk_size + " HDD" + } + + output { + File output_vcf = "${vcf_name}" + File output_vcf_index = "${vcf_index}" + } +} + +# Build VQSR model +task BuildVQSRModel { + File ref_dict + File ref_fasta + File ref_fasta_index + File cohort_vcf + File cohort_vcf_index + String output_basename + Array[File] interval_lists + String mode + Array[String] annotations + Array[Float] tranches + Array[String] resources + Array[File] resource_files + Array[File] resource_indices + Int disk_size + String mem_size + + command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK36.jar \ + -T VariantRecalibrator \ + -R ${ref_fasta} \ + -input ${cohort_vcf} \ + -L ${sep=' -L ' interval_lists} \ + -resource:${sep=' -resource:' resources} \ + -an ${sep=' -an ' annotations} \ + -mode ${mode} \ + -tranche ${sep=' -tranche ' tranches} \ + -recalFile ${output_basename}.${mode}.recal \ + -tranchesFile ${output_basename}.${mode}.tranches \ + -rscriptFile ${output_basename}.${mode}.plots.R + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + disks: "local-disk " + disk_size + " HDD" + } + + output { + File recal_file = "${output_basename}.${mode}.recal" + File recal_file_index = "${output_basename}.${mode}.recal.idx" + File tranches_file = "${output_basename}.${mode}.tranches" + File rscript_file = "${output_basename}.${mode}.plots.R" + } +} + +# Apply recalibration +task ApplyRecalibrationFilter { + File ref_dict + File ref_fasta + File ref_fasta_index + File cohort_vcf + File cohort_vcf_index + File recal_file + File recal_file_index + Array[File] interval_lists + String output_basename + String mode + File tranches_file + Float filter_level + Int disk_size + String mem_size + 
+ command { + java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ + -jar /usr/gitc/GATK36.jar \ + -T ApplyRecalibration \ + -R ${ref_fasta} \ + -input ${cohort_vcf} \ + -L ${sep=' -L ' interval_lists} \ + -mode ${mode} \ + --ts_filter_level ${filter_level} \ + -recalFile ${recal_file} \ + -tranchesFile ${tranches_file} \ + -o ${output_basename}.vcf.gz + } + + runtime { + docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" + memory: mem_size + disks: "local-disk " + disk_size + " HDD" + } + + output { + File recalibrated_vcf = "${output_basename}.vcf.gz" + File recalibrated_vcf_index = "${output_basename}.vcf.gz.tbi" + } +} + +workflow JointDiscoveryWf { + File ref_fasta + File ref_fasta_index + File ref_dict + Array[File] input_gvcfs + Array[File] input_gvcf_indices + Array[String] SNP_annotations + Array[String] INDEL_annotations + Array[Float] SNP_tranches + Array[Float] INDEL_tranches + Array[String] SNP_resources + Array[String] INDEL_resources + Array[File] resource_files + Array[File] resource_indices + Float SNP_filter_level + Float INDEL_filter_level + String cohort_vcf_name + File scattered_calling_intervals_list + + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + + # Unzip GVCFs in parallel + scatter (input_gvcf in input_gvcfs) { + + # Unzip block-compressed VCFs with .gz extension because GenotypeGVCFs + # currently does not handle compressed files. 
+ call UnzipGVCF { + input: + gzipped_gvcf = input_gvcf, + unzipped_basename = "temp_unzipped" + } + } + + # Joint-call variants in parallel over WGS calling intervals + scatter (subInterval in scattered_calling_intervals) { + + # Perform joint genotyping per interval + call GenotypeGVCFs { + input: + gvcfs = UnzipGVCF.unzipped_gvcf, + gvcf_indices = UnzipGVCF.gvcf_index, + vcf_basename = cohort_vcf_name, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + interval_list = subInterval + } + } + + # Merge per-interval VCFs into a single cohort VCF file + call MergeVCFs { + input: + input_vcfs = GenotypeGVCFs.genotyped_vcf, + input_vcfs_indices = GenotypeGVCFs.genotyped_index, + vcf_name = cohort_vcf_name + ".vcf.gz", + vcf_index = cohort_vcf_name + ".vcf.gz.tbi" + } + + # Build SNP model + call BuildVQSRModel as BuildVQSRModelForSNPs { + input: + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + cohort_vcf = MergeVCFs.output_vcf, + cohort_vcf_index = MergeVCFs.output_vcf_index, + interval_lists = scattered_calling_intervals, + output_basename = cohort_vcf_name, + annotations = SNP_annotations, + mode = "SNP", + tranches = SNP_tranches, + resources = SNP_resources, + resource_files = resource_files, + resource_indices = resource_indices + } + + # Build INDEL model + call BuildVQSRModel as BuildVQSRModelForINDELs { + input: + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + cohort_vcf = MergeVCFs.output_vcf, + cohort_vcf_index = MergeVCFs.output_vcf_index, + interval_lists = scattered_calling_intervals, + output_basename = cohort_vcf_name, + annotations = INDEL_annotations, + mode = "INDEL", + tranches = INDEL_tranches, + resources = INDEL_resources, + resource_files = resource_files, + resource_indices = resource_indices + } + + # Apply INDEL filter (first because INDEL model is usually done sooner) + call ApplyRecalibrationFilter as 
ApplyRecalibrationFilterForINDELs { + input: + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + cohort_vcf = MergeVCFs.output_vcf, + cohort_vcf_index = MergeVCFs.output_vcf_index, + interval_lists = scattered_calling_intervals, + output_basename = cohort_vcf_name + ".recal.INDEL", + mode = "INDEL", + recal_file = BuildVQSRModelForINDELs.recal_file, + recal_file_index = BuildVQSRModelForINDELs.recal_file_index, + tranches_file = BuildVQSRModelForINDELs.tranches_file, + filter_level = INDEL_filter_level + } + + # Apply SNP filter + call ApplyRecalibrationFilter as ApplyRecalibrationFilterForSNPs { + input: + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + cohort_vcf = ApplyRecalibrationFilterForINDELs.recalibrated_vcf, + cohort_vcf_index = ApplyRecalibrationFilterForINDELs.recalibrated_vcf_index, + interval_lists = scattered_calling_intervals, + output_basename = cohort_vcf_name + ".recal.INDEL.SNP", + mode = "SNP", + recal_file = BuildVQSRModelForSNPs.recal_file, + recal_file_index = BuildVQSRModelForSNPs.recal_file_index, + tranches_file = BuildVQSRModelForSNPs.tranches_file, + filter_level = SNP_filter_level + } + + # Outputs that will be retained when execution is complete + output { + File jointcalled_vcf = MergeVCFs.output_vcf + File jointcalled_vcf_index = MergeVCFs.output_vcf_index + File filtered_vcf = ApplyRecalibrationFilterForSNPs.recalibrated_vcf + File filtered_vcf_idx = ApplyRecalibrationFilterForSNPs.recalibrated_vcf_index + } +} diff --git a/scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.inputs.json b/scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.inputs.json new file mode 100644 index 0000000..ef3f97c --- /dev/null +++ b/scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.inputs.json @@ -0,0 +1,56 @@ +{ + "VariantRecalibrationWf.ref_dict": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.dict", + "VariantRecalibrationWf.ref_fasta": 
"gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta", + "VariantRecalibrationWf.ref_fasta_index": "gs://gatk-legacy-bundles/b37/human_g1k_v37_decoy.fasta.fai", + + "VariantRecalibrationWf.cohort_vcf_name": "PlatinumTrio_b37", + + "VariantRecalibrationWf.input_vcf": "gs://gatk-test-data/wgs_vcf/PlatinumTrio_b37/PlatinumTrio_b37.vcf.gz", + "VariantRecalibrationWf.input_vcf_index": "gs://gatk-test-data/wgs_vcf/PlatinumTrio_b37/PlatinumTrio_b37.vcf.gz.tbi", + + "VariantRecalibrationWf.calling_intervals_list": "gs://gatk-legacy-bundles/b37/wgs_calling_regions.v1.interval_list", + + "VariantRecalibrationWf.resource_files": [ + "gs://gatk-legacy-bundles/b37/hapmap_3.3.b37.vcf", + "gs://gatk-legacy-bundles/b37/1000G_omni2.5.b37.vcf", + "gs://gatk-legacy-bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf", + "gs://gatk-legacy-bundles/b37/dbsnp_138.b37.vcf", + "gs://gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" + ], + "VariantRecalibrationWf.resource_indices": [ + "gs://gatk-legacy-bundles/b37/hapmap_3.3.b37.vcf.idx", + "gs://gatk-legacy-bundles/b37/1000G_omni2.5.b37.vcf.idx", + "gs://gatk-legacy-bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf.idx", + "gs://gatk-legacy-bundles/b37/dbsnp_138.b37.vcf.idx", + "gs://gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf.idx" + ], + + "VariantRecalibrationWf.SNP_resources": [ + "hapmap,known=false,training=true,truth=true,prior=15.0 gatk-legacy-bundles/b37/hapmap_3.3.b37.vcf", + "omni,known=false,training=true,truth=true,prior=12.0 gatk-legacy-bundles/b37/1000G_omni2.5.b37.vcf", + "1000G,known=false,training=true,truth=false,prior=10.0 gatk-legacy-bundles/b37/1000G_phase1.snps.high_confidence.b37.vcf", + "dbsnp,known=true,training=false,truth=false,prior=2.0 gatk-legacy-bundles/b37/dbsnp_138.b37.vcf" + ], + "VariantRecalibrationWf.INDEL_resources": [ + "mills,known=false,training=true,truth=true,prior=12.0 gatk-legacy-bundles/b37/Mills_and_1000G_gold_standard.indels.b37.vcf", + 
"dbsnp,known=true,training=false,truth=false,prior=2.0 gatk-legacy-bundles/b37/dbsnp_138.b37.vcf" + ], + + "VariantRecalibrationWf.SNP_annotations": ["DP", "QD", "FS", "SOR", "MQ", "MQRankSum", "ReadPosRankSum"], + "VariantRecalibrationWf.INDEL_annotations": ["QD", "FS", "SOR", "MQRankSum", "ReadPosRankSum"], + + "VariantRecalibrationWf.SNP_filter_level": 99.7, + "VariantRecalibrationWf.INDEL_filter_level": 99.7, + + "VariantRecalibrationWf.SNP_tranches": [100.0, 99.99, 99.95, 99.9, 99.8, 99.7, 99.6, 99.5, 99.4, 99.3, 99.2, 99.1, 99.0, 98.0, 97.0, 96.0, 95.0, 90.0], + "VariantRecalibrationWf.INDEL_tranches": [100.0, 99.99, 99.95, 99.9, 99.8, 99.7, 99.6, 99.5, 99.0, 98.0, 97.0, 96.0, 95.0, 94.0, 93.0, 92.0, 91.0, 90.0], + + "VariantRecalibrationWf.BuildVQSRModelForSNPs.disk_size": 100, + "VariantRecalibrationWf.BuildVQSRModelForSNPs.mem_size": "10 GB", + "VariantRecalibrationWf.BuildVQSRModelForINDELs.disk_size": 100, + "VariantRecalibrationWf.BuildVQSRModelForINDELs.mem_size": "10 GB", + "VariantRecalibrationWf.ApplyRecalibrationFilterForINDELs.disk_size": 100, + "VariantRecalibrationWf.ApplyRecalibrationFilterForINDELs.mem_size": "10 GB", + "VariantRecalibrationWf.ApplyRecalibrationFilterForSNPs.disk_size": 100, + "VariantRecalibrationWf.ApplyRecalibrationFilterForSNPs.mem_size": "10 GB" +} \ No newline at end of file diff --git a/scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.wdl b/scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.wdl new file mode 100644 index 0000000..7d342b8 --- /dev/null +++ b/scripts/broad_dsde_workflows/VariantRecalibrationWf_170305.wdl @@ -0,0 +1,229 @@ +## Copyright Broad Institute, 2017 +## +## This WDL workflow runs VQSR filtering on a VCF. +## +## Requirements/expectations : +## - VCF produced by HaplotypeCaller or GenotypeGVCFs +## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. 
+## +## Outputs : +## - A VCF file to which VQSR filtering has been applied (all sites are retained; +## filtered sites are so annotated in the FILTER field). +## +## Note about VQSR wiring : +## The SNP and INDEL models are built in parallel, but then the corresponding +## recalibrations are applied in series. Because the INDEL model is generally ready +## first (because there are fewer indels than SNPs) we set INDEL recalibration to +## be applied first to the input VCF, while the SNP model is still being built. By +## the time the SNP model is available, the indel-recalibrated file is available to +## serve as input to apply the SNP recalibration. If we did it the other way around, +## we would have to wait until the SNP recal file was available despite the INDEL +## recal file being there already, then apply SNP recalibration, then apply INDEL +## recalibration. This would lead to a longer wall clock time for complete workflow +## execution. Wiring the INDEL recalibration to be applied first solves the problem. +## +## Cromwell version support +## - Successfully tested on v24 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
+
+# TASK DEFINITIONS
+
+# Build VQSR model: runs GATK VariantRecalibrator (GATK36.jar) in the given mode; emits the recal file, its index, the tranches file and the R plots script
+task BuildVQSRModel {
+  File ref_dict                   # not referenced in the command; presumably localized so GATK finds it beside ref_fasta -- TODO confirm
+  File ref_fasta                  # passed as -R
+  File ref_fasta_index            # not referenced in the command; presumably localized beside ref_fasta -- TODO confirm
+  File cohort_vcf                 # passed as -input
+  File cohort_vcf_index           # not referenced in the command; presumably localized beside cohort_vcf -- TODO confirm
+  File interval_list              # passed as -L
+  String output_basename          # prefix of every output file name produced by this task
+  String mode                     # passed as -mode and embedded in the output file names; the workflow below uses "SNP" and "INDEL"
+  Array[String] annotations       # each element becomes one "-an ELEMENT" argument
+  Array[Float] tranches           # each element becomes one "-tranche ELEMENT" argument
+  Array[String] resources         # each element becomes one "-resource:ELEMENT" argument
+  Array[File] resource_files      # not referenced in the command; presumably localized so paths named in resources resolve -- TODO confirm
+  Array[File] resource_indices    # not referenced in the command; presumably the indices of resource_files -- TODO confirm
+  Int disk_size                   # number interpolated into the runtime "local-disk N HDD" string
+  String mem_size                 # runtime memory request; the JVM heap below is hard-coded to -Xmx8000m independently of this value
+
+  command {
+    java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \
+      -jar /usr/gitc/GATK36.jar \
+      -T VariantRecalibrator \
+      -R ${ref_fasta} \
+      -input ${cohort_vcf} \
+      -L ${interval_list} \
+      -resource:${sep=' -resource:' resources} \
+      -an ${sep=' -an ' annotations} \
+      -mode ${mode} \
+      -tranche ${sep=' -tranche ' tranches} \
+      -recalFile ${output_basename}.${mode}.recal \
+      -tranchesFile ${output_basename}.${mode}.tranches \
+      -rscriptFile ${output_basename}.${mode}.plots.R
+  }
+
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+    memory: mem_size
+    disks: "local-disk " + disk_size + " HDD"
+  }
+
+  output {
+    File recal_file = "${output_basename}.${mode}.recal"
+    File recal_file_index = "${output_basename}.${mode}.recal.idx"  # not named in the command; expected to appear beside the recal file
+    File tranches_file = "${output_basename}.${mode}.tranches"
+    File rscript_file = "${output_basename}.${mode}.plots.R"
+  }
+}
+
+# Apply recalibration: runs GATK ApplyRecalibration (GATK36.jar) in the given mode and writes OUTPUT_BASENAME.vcf.gz
+task ApplyRecalibrationFilter {
+  File ref_dict                   # not referenced in the command; presumably localized so GATK finds it beside ref_fasta -- TODO confirm
+  File ref_fasta                  # passed as -R
+  File ref_fasta_index            # not referenced in the command; presumably localized beside ref_fasta -- TODO confirm
+  File cohort_vcf                 # passed as -input
+  File cohort_vcf_index           # not referenced in the command; presumably localized beside cohort_vcf -- TODO confirm
+  File recal_file                 # passed as -recalFile
+  File recal_file_index           # not referenced in the command; presumably localized beside recal_file -- TODO confirm
+  File interval_list              # passed as -L
+  String output_basename          # output VCF is written to OUTPUT_BASENAME.vcf.gz
+  String mode                     # passed as -mode; the workflow below uses "SNP" and "INDEL"
+  File tranches_file              # passed as -tranchesFile
+  Float filter_level              # passed as --ts_filter_level
+  Int disk_size                   # number interpolated into the runtime "local-disk N HDD" string
+  String mem_size                 # runtime memory request; the JVM heap below is hard-coded to -Xmx8000m independently of this value
+
+  command {
+    java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \
+      -jar /usr/gitc/GATK36.jar \
+      -T ApplyRecalibration \
+      -R ${ref_fasta} \
+      -input ${cohort_vcf} \
+      -L ${interval_list} \
+      -mode ${mode} \
+      --ts_filter_level ${filter_level} \
+      -recalFile ${recal_file} \
+      -tranchesFile ${tranches_file} \
+      -o ${output_basename}.vcf.gz
+  }
+
+  runtime {
+    docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
+    memory: mem_size
+    disks: "local-disk " + disk_size + " HDD"
+  }
+
+  output {
+    File recalibrated_vcf = "${output_basename}.vcf.gz"
+    File recalibrated_vcf_index = "${output_basename}.vcf.gz.tbi"  # not named in the command; expected to appear beside the output VCF
+  }
+}
+
+workflow VariantRecalibrationWf {
+  File ref_fasta
+  File ref_fasta_index
+  File ref_dict
+  File input_vcf                      # VCF used to build both models and as input to the INDEL filtering step
+  File input_vcf_index
+  File calling_intervals_list         # forwarded as interval_list to all four calls
+  Array[String] SNP_annotations
+  Array[String] INDEL_annotations
+  Array[Float] SNP_tranches
+  Array[Float] INDEL_tranches
+  Array[String] SNP_resources
+  Array[String] INDEL_resources
+  Array[File] resource_files          # shared by both model-building calls
+  Array[File] resource_indices        # shared by both model-building calls
+  String cohort_vcf_name              # basename from which all output file names are derived
+  Float SNP_filter_level
+  Float INDEL_filter_level
+
+  # Build SNP model (depends only on workflow inputs, as does the INDEL model call below)
+  call BuildVQSRModel as BuildVQSRModelForSNPs {
+    input:
+      ref_dict = ref_dict,
+      ref_fasta = ref_fasta,
+      ref_fasta_index = ref_fasta_index,
+      cohort_vcf = input_vcf,
+      cohort_vcf_index = input_vcf_index,
+      interval_list = calling_intervals_list,
+      output_basename = cohort_vcf_name,
+      annotations = SNP_annotations,
+      mode = "SNP",
+      tranches = SNP_tranches,
+      resources = SNP_resources,
+      resource_files = resource_files,
+      resource_indices = resource_indices
+  }
+
+  # Build INDEL model (same basename as the SNP call; the task adds the mode to every output file name)
+  call BuildVQSRModel as BuildVQSRModelForINDELs {
+    input:
+      ref_dict = ref_dict,
+      ref_fasta = ref_fasta,
+      ref_fasta_index = ref_fasta_index,
+      cohort_vcf = input_vcf,
+      cohort_vcf_index = input_vcf_index,
+      interval_list = calling_intervals_list,
+      output_basename = cohort_vcf_name,
+      annotations = INDEL_annotations,
+      mode = "INDEL",
+      tranches = INDEL_tranches,
+      resources = INDEL_resources,
+      resource_files = resource_files,
+      resource_indices = resource_indices
+  }
+
+  # Apply INDEL filter (first because INDEL model is usually done sooner); operates on the original input VCF
+  call ApplyRecalibrationFilter as ApplyRecalibrationFilterForINDELs {
+    input:
+      ref_dict = ref_dict,
+      ref_fasta = ref_fasta,
+      ref_fasta_index = ref_fasta_index,
+      cohort_vcf = input_vcf,
+      cohort_vcf_index = input_vcf_index,
+      interval_list = calling_intervals_list,
+      output_basename = cohort_vcf_name + ".recal.INDEL",
+      mode = "INDEL",
+      recal_file = BuildVQSRModelForINDELs.recal_file,
+      recal_file_index = BuildVQSRModelForINDELs.recal_file_index,
+      tranches_file = BuildVQSRModelForINDELs.tranches_file,
+      filter_level = INDEL_filter_level
+  }
+
+  # Apply SNP filter to the INDEL-filtered VCF, so the result of this call carries both filters
+  call ApplyRecalibrationFilter as ApplyRecalibrationFilterForSNPs {
+    input:
+      ref_dict = ref_dict,
+      ref_fasta = ref_fasta,
+      ref_fasta_index = ref_fasta_index,
+      cohort_vcf = ApplyRecalibrationFilterForINDELs.recalibrated_vcf,
+      cohort_vcf_index = ApplyRecalibrationFilterForINDELs.recalibrated_vcf_index,
+      interval_list = calling_intervals_list,
+      output_basename = cohort_vcf_name + ".recal.INDEL.SNP",
+      mode = "SNP",
+      recal_file = BuildVQSRModelForSNPs.recal_file,
+      recal_file_index = BuildVQSRModelForSNPs.recal_file_index,
+      tranches_file = BuildVQSRModelForSNPs.tranches_file,
+      filter_level = SNP_filter_level
+  }
+
+  # Outputs that will be retained when execution is complete; the final VCF must come from the SNP step (last in the chain), not the intermediate INDEL-only step
+  output {
+    File snp_tranches = BuildVQSRModelForSNPs.tranches_file
+    File indel_tranches = BuildVQSRModelForINDELs.tranches_file
+    File fully_filtered_vcf = ApplyRecalibrationFilterForSNPs.recalibrated_vcf
+    File fully_filtered_vcf_idx = ApplyRecalibrationFilterForSNPs.recalibrated_vcf_index
+  }
+}
\ No newline at end of file