From 14bcb36d7ee10d783e58e1234a01725a0bb9cc99 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2022 19:17:06 -0400 Subject: [PATCH 01/10] continuous integration M2 tests single pair --- scripts/m2_cromwell_tests/pair_list | 2 - .../m2_cromwell_tests/pair_list_tumor_only | 2 - scripts/m2_cromwell_tests/run_m2_wdl.sh | 12 +- scripts/m2_cromwell_tests/test_m2_wdl.json | 19 +++ .../m2_cromwell_tests/test_m2_wdl_multi.json | 16 --- scripts/mutect2_wdl/mutect2_multi_sample.wdl | 124 ------------------ 6 files changed, 25 insertions(+), 150 deletions(-) delete mode 100644 scripts/m2_cromwell_tests/pair_list delete mode 100644 scripts/m2_cromwell_tests/pair_list_tumor_only create mode 100644 scripts/m2_cromwell_tests/test_m2_wdl.json delete mode 100644 scripts/m2_cromwell_tests/test_m2_wdl_multi.json delete mode 100644 scripts/mutect2_wdl/mutect2_multi_sample.wdl diff --git a/scripts/m2_cromwell_tests/pair_list b/scripts/m2_cromwell_tests/pair_list deleted file mode 100644 index 962fe49cca2..00000000000 --- a/scripts/m2_cromwell_tests/pair_list +++ /dev/null @@ -1,2 +0,0 @@ -/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam.bai /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam.bai -/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_2.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_2.bam.bai /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_2.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_2.bam.bai diff --git a/scripts/m2_cromwell_tests/pair_list_tumor_only b/scripts/m2_cromwell_tests/pair_list_tumor_only deleted file mode 100644 
index 9498e7e342f..00000000000 --- a/scripts/m2_cromwell_tests/pair_list_tumor_only +++ /dev/null @@ -1,2 +0,0 @@ -/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam.bai -/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_2.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_2.bam.bai \ No newline at end of file diff --git a/scripts/m2_cromwell_tests/run_m2_wdl.sh b/scripts/m2_cromwell_tests/run_m2_wdl.sh index f7f863e765b..b676f58b463 100644 --- a/scripts/m2_cromwell_tests/run_m2_wdl.sh +++ b/scripts/m2_cromwell_tests/run_m2_wdl.sh @@ -12,7 +12,7 @@ echo "Creating tar.gz for Funcotator datasources ==========" pushd . FUNCOTATOR_TEST_DS_DIR=${WORKING_DIR}/gatk/src/test/resources/large/funcotator/ cd ${FUNCOTATOR_TEST_DS_DIR} -# First parameter must match Mutect2_Multi.funco_data_sources_tar_gz test_m2_wdl_multi.json +# First parameter must match Mutect2_Multi.funco_data_sources_tar_gz test_m2_wdl.json tar zcvf ${WORKING_DIR}/gatk/small_ds_pik3ca.tar.gz small_ds_pik3ca/* popd @@ -35,7 +35,7 @@ fi echo "Docker build done ==========" echo "Putting the newly built docker image into the json parameters" cd $WORKING_DIR/gatk/scripts/ -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_m2_wdl_multi.json >$WORKING_DIR/test_m2_wdl_multi_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_m2_wdl.json >$WORKING_DIR/test_m2_wdl_mod.json echo "JSON FILE (modified) =======" -cat $WORKING_DIR/test_m2_wdl_multi_mod.json +cat $WORKING_DIR/test_m2_wdl_mod.json sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_mitochondria_m2_wdl.json >$WORKING_DIR/test_mitochondria_m2_wdl_mod.json @@ -43,16 +43,16 @@ echo "JSON FILE (modified) =======" cat $WORKING_DIR/test_mitochondria_m2_wdl_mod.json echo 
"==================" -# Create the tumor-only json by using the pair_list_tumor_only file -sed -r "s/\"pair_list/\"pair_list_tumor_only/g" $WORKING_DIR/test_m2_wdl_multi_mod.json >$WORKING_DIR/test_m2_wdl_multi_mod_to.json +# Create the tumor-only json by removing normal_reads and normal_reads_index from the input json +grep -v 'Mutect2.normal_reads' $WORKING_DIR/test_m2_wdl_mod.json >$WORKING_DIR/test_m2_wdl_mod_to.json cd $WORKING_DIR/ echo "Running M2 WDL through cromwell (T/N)" ln -fs $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2.wdl -sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2_multi_sample.wdl -i $WORKING_DIR/test_m2_wdl_multi_mod.json -m $WORKING_DIR/test_m2_wdl.metadata +sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2.wdl -i $WORKING_DIR/test_m2_wdl_mod.json -m $WORKING_DIR/test_m2_wdl.metadata echo "Running M2 WDL through cromwell (Tumor-only)" -sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2_multi_sample.wdl -i $WORKING_DIR/test_m2_wdl_multi_mod_to.json -m $WORKING_DIR/test_m2_wdl_to.metadata +sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2.wdl -i $WORKING_DIR/test_m2_wdl_mod_to.json -m $WORKING_DIR/test_m2_wdl_to.metadata echo "Running Mitochondria M2 WDL through cromwell" ln -fs $WORKING_DIR/gatk/scripts/mitochondria_m2_wdl/AlignAndCall.wdl diff --git a/scripts/m2_cromwell_tests/test_m2_wdl.json b/scripts/m2_cromwell_tests/test_m2_wdl.json new file mode 100644 index 00000000000..4a3073b7fbd --- /dev/null +++ b/scripts/m2_cromwell_tests/test_m2_wdl.json @@ -0,0 +1,19 @@ +{ + "Mutect2.gatk_docker": "__GATK_DOCKER__", + "Mutect2.intervals": "/home/runner/work/gatk/gatk/scripts/m2_cromwell_tests/interval_list.interval_list", + "Mutect2.ref_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta", + "Mutect2.ref_fai": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai", 
+ "Mutect2.ref_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict", + "Mutect2.tumor_reads": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam", + "Mutect2.tumor_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam.bai", + "Mutect2.normal_reads": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam", + "Mutect2.normal_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam.bai", + "Mutect2.funco_data_sources_tar_gz": "/home/runner/work/gatk/gatk/small_ds_pik3ca.tar.gz", + "Mutect2.funco_reference_version": "hg19", + "Mutect2.scatter_count": 2, + "Mutect2.run_orientation_bias_mixture_model_filter": true, + "Mutect2.run_funcotator": true, + "Mutect2.preemptible_attempts": 2, + "Mutect2.compress_vcfs": false, + "Mutect2.make_bamout": true +} \ No newline at end of file diff --git a/scripts/m2_cromwell_tests/test_m2_wdl_multi.json b/scripts/m2_cromwell_tests/test_m2_wdl_multi.json deleted file mode 100644 index d81b0852158..00000000000 --- a/scripts/m2_cromwell_tests/test_m2_wdl_multi.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "Mutect2_Multi.gatk_docker": "__GATK_DOCKER__", - "Mutect2_Multi.intervals": "/home/runner/work/gatk/gatk/scripts/m2_cromwell_tests/interval_list.interval_list", - "Mutect2_Multi.ref_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta", - "Mutect2_Multi.ref_fai": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai", - "Mutect2_Multi.ref_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict", - "Mutect2_Multi.pair_list": "/home/runner/work/gatk/gatk/scripts/m2_cromwell_tests/pair_list", - "Mutect2_Multi.funco_data_sources_tar_gz": "/home/runner/work/gatk/gatk/small_ds_pik3ca.tar.gz", - "Mutect2_Multi.funco_reference_version": 
"hg19", - "Mutect2_Multi.scatter_count": 2, - "Mutect2_Multi.run_orientation_bias_mixture_model_filter": true, - "Mutect2_Multi.run_funcotator": true, - "Mutect2_Multi.preemptible_attempts": 2, - "Mutect2_Multi.compress_vcfs": false, - "Mutect2_Multi.make_bamout": true -} \ No newline at end of file diff --git a/scripts/mutect2_wdl/mutect2_multi_sample.wdl b/scripts/mutect2_wdl/mutect2_multi_sample.wdl deleted file mode 100644 index 61442704914..00000000000 --- a/scripts/mutect2_wdl/mutect2_multi_sample.wdl +++ /dev/null @@ -1,124 +0,0 @@ -version 1.0 - -# Run Mutect 2 on a list of tumors or tumor-normal pairs -# -# Description of inputs -# intervals: genomic intervals -# ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary -# pon, pon_idx: optional panel of normals and index in vcf format containing known false positves -# scatter_count: number of parallel jobs when scattering over intervals -# gnomad, gnomad_idx: optional database of known germline variants, obtainable from http://gnomad.broadinstitute.org/downloads -# variants_for_contamination, variants_for_contamination_idx: vcf of common variants with allele frequencies fo calculating contamination -# run_orientation_bias_filter: if true, run the orientation bias filter post-processing step -# pair_list: a tab-separated table with no header in the following format: -# TUMOR_1_BAMTUMOR_1_baiNORMAL_1_BAMNORMAL_1_bai -# TUMOR_2_BAMTUMOR_2_baiNORMAL_2_BAMNORMAL_2_bai -# . . . -# Tumor-only input is the same but without the columns for the normal: -# TUMOR_1_BAMTUMOR_1_bai -# TUMOR_2_BAMTUMOR_2_bai -# . . . - -import "mutect2.wdl" as m2 - -workflow Mutect2_Multi { - input { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File pair_list - - File? pon - File? pon_idx - File? gnomad - File? gnomad_idx - File? variants_for_contamination - File? variants_for_contamination_idx - Boolean? run_orientation_bias_mixture_model_filter - Int scatter_count - String? m2_extra_args - String? 
m2_extra_filtering_args - Boolean? compress_vcfs - Boolean? make_bamout - - String? gcs_project_for_requester_pays - - # Oncotator inputs - String? sequencing_center - String? sequence_source - - # funcotator inputs - Boolean? run_funcotator - String? funco_reference_version - File? funco_data_sources_tar_gz - String? funco_transcript_selection_mode - File? funco_transcript_selection_list - Array[String]? funco_annotation_defaults - Array[String]? funco_annotation_overrides - - - # runtime - String gatk_docker - Int? preemptible_attempts - File? gatk_override - } - - Array[Array[String]] pairs = read_tsv(pair_list) - - scatter( row in pairs ) { - # If the condition is true, variables inside the 'if' block retain their values outside the block. - # Otherwise they are treated as null, which in WDL is equivalent to an empty optional - if(length(row) == 4) { - File normal_bam = row[2] - File normal_bai = row[3] - } - - call m2.Mutect2 { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_reads = row[0], - tumor_reads_index = row[1], - normal_reads = normal_bam, - normal_reads_index = normal_bai, - pon = pon, - pon_idx = pon_idx, - scatter_count = scatter_count, - gnomad = gnomad, - gnomad_idx = gnomad_idx, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_idx = variants_for_contamination_idx, - run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - sequencing_center = sequencing_center, - sequence_source = sequence_source, - run_funcotator = run_funcotator, - funco_reference_version = funco_reference_version, - funco_data_sources_tar_gz = funco_data_sources_tar_gz, - funco_transcript_selection_mode = funco_transcript_selection_mode, - funco_transcript_selection_list = funco_transcript_selection_list, - funco_annotation_defaults = funco_annotation_defaults, 
- funco_annotation_overrides = funco_annotation_overrides, - - make_bamout = make_bamout, - compress_vcfs = compress_vcfs, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible = preemptible_attempts, - gcs_project_for_requester_pays = gcs_project_for_requester_pays - } - } - - output { - Array[File] filtered_vcf = Mutect2.filtered_vcf - Array[File] filtered_vcf_idx = Mutect2.filtered_vcf_idx - Array[File?] contamination_tables = Mutect2.contamination_table - - Array[File?] m2_bamout = Mutect2.bamout - Array[File?] m2_bamout_index = Mutect2.bamout_index - } -} From b293f8d34adbca32a67a4b1e992ec7638dc6f12b Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2022 19:22:11 -0400 Subject: [PATCH 02/10] remove Funcotator from M2 WDL (Funcotator has its own WDL) --- scripts/m2_cromwell_tests/test_m2_wdl.json | 2 - scripts/mutect2_wdl/mutect2.wdl | 210 +-------------------- 2 files changed, 4 insertions(+), 208 deletions(-) diff --git a/scripts/m2_cromwell_tests/test_m2_wdl.json b/scripts/m2_cromwell_tests/test_m2_wdl.json index 4a3073b7fbd..e8f85f4bbdc 100644 --- a/scripts/m2_cromwell_tests/test_m2_wdl.json +++ b/scripts/m2_cromwell_tests/test_m2_wdl.json @@ -8,8 +8,6 @@ "Mutect2.tumor_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam.bai", "Mutect2.normal_reads": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam", "Mutect2.normal_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam.bai", - "Mutect2.funco_data_sources_tar_gz": "/home/runner/work/gatk/gatk/small_ds_pik3ca.tar.gz", - "Mutect2.funco_reference_version": "hg19", "Mutect2.scatter_count": 2, "Mutect2.run_orientation_bias_mixture_model_filter": true, "Mutect2.run_funcotator": true, diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 1d02f9fb45e..638b4e054cd 100755 --- 
a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -3,7 +3,7 @@ version 1.0 ## Copyright Broad Institute, 2017 ## ## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample, -## and performs additional filtering and functional annotation tasks. +## and performs additional filtering. ## ## Main requirements/expectations : ## - One analysis-ready BAM file (and its index) for each sample @@ -38,22 +38,8 @@ version 1.0 ## ** Secondary resources ** (for optional tasks) ## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator. ## -## Funcotator parameters (see Funcotator help for more details). -## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19" -## funco_output_format: "MAF" to produce a MAF file, "VCF" to procude a VCF file. Default: "MAF" -## funco_compress: (Only valid if funco_output_format == "VCF" ) If true, will compress the output of Funcotator. If false, produces an uncompressed output file. Default: false -## funco_use_gnomad_AF: If true, will include gnomAD allele frequency annotations in output by connecting to the internet to query gnomAD (this impacts performance). If false, will not annotate with gnomAD. Default: false -## funco_transcript_selection_mode: How to select transcripts in Funcotator. ALL, CANONICAL, or BEST_EFFECT -## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process. -## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud. -## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as :. For example: "Center:Broad" -## funco_annotation_overrides: Values for annotations, even when values are unspecified. Specified as :. 
For example: "Center:Broad" -## funcotator_excluded_fields: Annotations that should not appear in the output (VCF or MAF). Specified as . For example: "ClinVar_ALLELEID" -## funco_filter_funcotations: If true, will only annotate variants that have passed filtering (. or PASS value in the FILTER column). If false, will annotate all variants in the input file. Default: true -## funcotator_extra_args: Any additional arguments to pass to Funcotator. Default: "" -## ## Outputs : -## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam +## - One VCF file and its index with primary filtering applied; secondary filtering if requested; a bamout.bam ## file of reassembled reads if requested ## ## Cromwell version support @@ -111,30 +97,10 @@ workflow Mutect2 { File? gga_vcf_idx String? gcs_project_for_requester_pays - # Funcotator inputs - Boolean? run_funcotator - String? sequencing_center - String? sequence_source - String? funco_reference_version - String? funco_output_format - Boolean? funco_compress - Boolean? funco_use_gnomad_AF - File? funco_data_sources_tar_gz - String? funco_transcript_selection_mode - File? funco_transcript_selection_list - Array[String]? funco_annotation_defaults - Array[String]? funco_annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? funco_filter_funcotations - String? funcotator_extra_args - - String funco_default_output_format = "MAF" - # runtime String gatk_docker File? gatk_override String basic_bash_docker = "ubuntu:16.04" - Boolean? filter_funcotations Int? preemptible Int? 
max_retries @@ -162,8 +128,7 @@ workflow Mutect2 { Boolean compress = select_first([compress_vcfs, false]) Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) Boolean make_bamout_or_default = select_first([make_bamout, false]) - Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) + # Disk sizes used for dynamic sizing Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) @@ -172,7 +137,6 @@ workflow Mutect2 { Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 # If no tar is provided, the task downloads one from broads ftp server - Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100 Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 # This is added to every task as padding, should increase if systematically you need more disk for every call @@ -182,7 +146,6 @@ workflow Mutect2 { String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram String unfiltered_name = output_basename + "-unfiltered" String filtered_name = output_basename + "-filtered" - String funcotated_name = output_basename + "-funcotated" String output_vcf_name = output_basename + ".vcf" @@ -348,37 +311,7 @@ workflow Mutect2 { } } - if (run_funcotator_or_default) { - File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) - call Funcotate { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - input_vcf = funcotate_vcf_input, - input_vcf_idx = funcotate_vcf_input_index, - reference_version = 
select_first([funco_reference_version, "hg19"]), - output_file_base_name = basename(funcotate_vcf_input, ".vcf") + ".annotated", - output_format = if defined(funco_output_format) then "" + funco_output_format else funco_default_output_format, - compress = if defined(funco_compress) then select_first([funco_compress]) else false, - use_gnomad = if defined(funco_use_gnomad_AF) then select_first([funco_use_gnomad_AF]) else false, - data_sources_tar_gz = funco_data_sources_tar_gz, - case_id = M2.tumor_sample[0], - control_id = M2.normal_sample[0], - sequencing_center = sequencing_center, - sequence_source = sequence_source, - transcript_selection_mode = funco_transcript_selection_mode, - transcript_selection_list = funco_transcript_selection_list, - annotation_defaults = funco_annotation_defaults, - annotation_overrides = funco_annotation_overrides, - funcotator_excluded_fields = funcotator_excluded_fields, - filter_funcotations = filter_funcotations_or_default, - extra_args = funcotator_extra_args, - runtime_params = standard_runtime, - disk_space = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad - } - } + output { File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) @@ -387,8 +320,6 @@ workflow Mutect2 { File mutect_stats = MergeStats.merged_stats File? contamination_table = CalculateContamination.contamination_table - File? funcotated_file = Funcotate.funcotated_output_file - File? funcotated_file_index = Funcotate.funcotated_output_file_index File? bamout = MergeBamOuts.merged_bam_out File? bamout_index = MergeBamOuts.merged_bam_out_index File? 
maf_segments = CalculateContamination.maf_segments @@ -928,136 +859,3 @@ task FilterAlignmentArtifacts { } } -task Funcotate { - input { - File ref_fasta - File ref_fai - File ref_dict - File input_vcf - File input_vcf_idx - String reference_version - String output_file_base_name - String output_format - Boolean compress - Boolean use_gnomad - # This should be updated when a new version of the data sources is released - # TODO: Make this dynamically chosen in the command. - File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.7.20200521s.tar.gz" - String? control_id - String? case_id - String? sequencing_center - String? sequence_source - String? transcript_selection_mode - File? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? filter_funcotations - File? interval_list - - String? extra_args - String? gcs_project_for_requester_pays - - # ============== - Runtime runtime_params - Int? disk_space #override to request more disk than default small task params - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. 
- Int default_disk_space_gb = 100 - } - - # ============== - # Process input args: - String output_maf = output_file_base_name + ".maf" - String output_maf_index = output_maf + ".idx" - String output_vcf = output_file_base_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - String output_file = if output_format == "MAF" then output_maf else output_vcf - String output_file_index = if output_format == "MAF" then output_maf_index else output_vcf_idx - String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else "" - String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" - String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" - String filter_funcotations_args = if defined(filter_funcotations) && (filter_funcotations) then " --remove-filtered-variants " else "" - String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else "" - String interval_list_arg = if defined(interval_list) then " -L " else "" - String extra_args_arg = select_first([extra_args, ""]) - - String dollar = "$" - - parameter_meta{ - ref_fasta: {localization_optional: true} - ref_fai: {localization_optional: true} - ref_dict: {localization_optional: true} - input_vcf: {localization_optional: true} - input_vcf_idx: {localization_optional: true} - } - - command <<< - set -e - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - - # Extract our data sources: - echo "Extracting data sources zip file..." - mkdir datasources_dir - tar zxvf ~{data_sources_tar_gz} -C datasources_dir --strip-components 1 - DATA_SOURCES_FOLDER="$PWD/datasources_dir" - - # Handle gnomAD: - if ~{use_gnomad} ; then - echo "Enabling gnomAD..." 
- for potential_gnomad_gz in gnomAD_exome.tar.gz gnomAD_genome.tar.gz ; do - if [[ -f ~{dollar}{DATA_SOURCES_FOLDER}/~{dollar}{potential_gnomad_gz} ]] ; then - cd ~{dollar}{DATA_SOURCES_FOLDER} - tar -zvxf ~{dollar}{potential_gnomad_gz} - cd - - else - echo "ERROR: Cannot find gnomAD folder: ~{dollar}{potential_gnomad_gz}" 1>&2 - false - fi - done - fi - - # Run Funcotator: - gatk --java-options "-Xmx~{runtime_params.command_mem}m" Funcotator \ - --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ~{reference_version} \ - --output-file-format ~{output_format} \ - -R ~{ref_fasta} \ - -V ~{input_vcf} \ - -O ~{output_file} \ - ~{interval_list_arg} ~{default="" interval_list} \ - --annotation-default normal_barcode:~{default="Unknown" control_id} \ - --annotation-default tumor_barcode:~{default="Unknown" case_id} \ - --annotation-default Center:~{default="Unknown" sequencing_center} \ - --annotation-default source:~{default="Unknown" sequence_source} \ - ~{"--transcript-selection-mode " + transcript_selection_mode} \ - ~{transcript_selection_arg}~{default="" sep=" --transcript-list " transcript_selection_list} \ - ~{annotation_def_arg}~{default="" sep=" --annotation-default " annotation_defaults} \ - ~{annotation_over_arg}~{default="" sep=" --annotation-override " annotation_overrides} \ - ~{excluded_fields_args}~{default="" sep=" --exclude-field " funcotator_excluded_fields} \ - ~{filter_funcotations_args} \ - ~{extra_args_arg} \ - ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} - # Make sure we have a placeholder index for MAF files so this workflow doesn't fail: - if [[ "~{output_format}" == "MAF" ]] ; then - touch ~{output_maf_index} - fi - >>> - - runtime { - docker: runtime_params.gatk_docker - bootDiskSizeGb: runtime_params.boot_disk_size - memory: runtime_params.machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD" - preemptible: runtime_params.preemptible - maxRetries: 
runtime_params.max_retries - cpu: runtime_params.cpu - } - - output { - File funcotated_output_file = "~{output_file}" - File funcotated_output_file_index = "~{output_file_index}" - } -} From ec011233efc7fdcf11122de462b9eff1e1003753 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2022 22:49:17 -0400 Subject: [PATCH 03/10] simplify default inputs --- scripts/mutect2_wdl/mutect2.wdl | 168 ++++--- scripts/mutect2_wdl/mutect3_training_data.wdl | 415 ------------------ 2 files changed, 75 insertions(+), 508 deletions(-) delete mode 100644 scripts/mutect2_wdl/mutect3_training_data.wdl diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 638b4e054cd..ddf64bf8bf1 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -68,79 +68,67 @@ struct Runtime { workflow Mutect2 { input { - # Mutect2 inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - File? pon - File? pon_idx - Int scatter_count - File? gnomad - File? gnomad_idx - File? variants_for_contamination - File? variants_for_contamination_idx - File? realignment_index_bundle - String? realignment_extra_args - Boolean? run_orientation_bias_mixture_model_filter - String? m2_extra_args - String? m2_extra_filtering_args - String? getpileupsummaries_extra_args - String? split_intervals_extra_args - Boolean? make_bamout - Boolean? compress_vcfs - File? gga_vcf - File? gga_vcf_idx - String? gcs_project_for_requester_pays - - # runtime - String gatk_docker - File? gatk_override - String basic_bash_docker = "ubuntu:16.04" - - Int? preemptible - Int? 
max_retries - Int small_task_cpu = 2 - Int small_task_mem = 4 - Int small_task_disk = 100 - Int boot_disk_size = 12 - Int learn_read_orientation_mem = 8000 - Int filter_alignment_artifacts_mem = 9000 - - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? emergency_extra_disk - - # These are multipliers to multipler inputs by to make sure we have enough disk to accommodate for possible output sizes - # Large is for Bams/WGS vcfs - # Small is for metrics/other vcfs - Float large_input_to_output_multiplier = 2.25 - Float small_input_to_output_multiplier = 2.0 - Float cram_to_bam_multiplier = 6.0 + # basic inputs + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_reads + File tumor_reads_index + File? normal_reads + File? normal_reads_index + + # optional but usually recommended resources + File? pon + File? pon_idx + File? gnomad + File? gnomad_idx + File? variants_for_contamination + File? variants_for_contamination_idx + + # extra arguments + String? m2_extra_args + String? m2_extra_filtering_args + String? getpileupsummaries_extra_args + String? split_intervals_extra_args + + # additional modes and outputs + File? realignment_index_bundle + String? realignment_extra_args + Boolean run_orientation_bias_mixture_model_filter = false + Boolean make_bamout = false + Boolean compress_vcfs = false + File? gga_vcf + File? gga_vcf_idx + + + # runtime + String gatk_docker + File? gatk_override + String basic_bash_docker = "ubuntu:16.04" + Int scatter_count + Int preemptible = 2 + Int max_retries = 1 + Int small_task_cpu = 2 + Int small_task_mem = 4 + Int small_task_disk = 100 + Int boot_disk_size = 12 + Int learn_read_orientation_mem = 8000 + Int filter_alignment_artifacts_mem = 9000 + String? 
gcs_project_for_requester_pays + + # Use as a last resort to increase the disk given to every task in case of ill behaving data + Int emergency_extra_disk = 0 } - Int preemptible_or_default = select_first([preemptible, 2]) - Int max_retries_or_default = select_first([max_retries, 2]) - - Boolean compress = select_first([compress_vcfs, false]) - Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) - Boolean make_bamout_or_default = select_first([make_bamout, false]) - - # Disk sizes used for dynamic sizing Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB")) else 0 Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 - # If no tar is provided, the task downloads one from broads ftp server - Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 - # This is added to every task as padding, should increase if systematically you need more disk for every call - Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0]) + Int disk_pad = 10 + select_first([emergency_extra_disk,0]) # logic about output file names -- these are the names *without* .vcf extensions String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram @@ -149,15 +137,11 @@ workflow Mutect2 { String output_vcf_name = output_basename + ".vcf" - Int tumor_cram_to_bam_disk = ceil(tumor_reads_size * cram_to_bam_multiplier) - Int normal_cram_to_bam_disk = ceil(normal_reads_size * cram_to_bam_multiplier) - Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, - "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu, + 
"max_retries": max_retries, "preemptible": preemptible, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, "disk": small_task_disk + disk_pad, "boot_disk_size": boot_disk_size} - Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 @@ -197,9 +181,9 @@ workflow Mutect2 { getpileupsummaries_extra_args = getpileupsummaries_extra_args, variants_for_contamination = variants_for_contamination, variants_for_contamination_idx = variants_for_contamination_idx, - make_bamout = make_bamout_or_default, - run_ob_filter = run_ob_filter, - compress = compress, + make_bamout = make_bamout, + run_ob_filter = run_orientation_bias_mixture_model_filter, + compress_vcfs = compress_vcfs, gga_vcf = gga_vcf, gga_vcf_idx = gga_vcf_idx, gatk_override = gatk_override, @@ -212,7 +196,7 @@ workflow Mutect2 { Int merged_vcf_size = ceil(size(M2.unfiltered_vcf, "GB")) Int merged_bamout_size = ceil(size(M2.output_bamOut, "GB")) - if (run_ob_filter) { + if (run_orientation_bias_mixture_model_filter) { call LearnReadOrientationModel { input: f1r2_tar_gz = M2.f1r2_counts, @@ -226,11 +210,11 @@ workflow Mutect2 { input_vcfs = M2.unfiltered_vcf, input_vcf_indices = M2.unfiltered_vcf_idx, output_name = unfiltered_name, - compress = compress, + compress_vcfs = compress_vcfs, runtime_params = standard_runtime } - if (make_bamout_or_default) { + if (make_bamout) { call MergeBamOuts { input: ref_fasta = ref_fasta, @@ -239,7 +223,7 @@ workflow Mutect2 { bam_outs = M2.output_bamOut, output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), runtime_params = standard_runtime, - disk_space = ceil(merged_bamout_size * large_input_to_output_multiplier) + disk_pad, + disk_space = ceil(merged_bamout_size * 4) + disk_pad, } } @@ -281,14 +265,14 @@ workflow Mutect2 { unfiltered_vcf = MergeVCFs.merged_vcf, 
unfiltered_vcf_idx = MergeVCFs.merged_vcf_idx, output_name = filtered_name, - compress = compress, + compress_vcfs = compress_vcfs, mutect_stats = MergeStats.merged_stats, contamination_table = CalculateContamination.contamination_table, maf_segments = CalculateContamination.maf_segments, artifact_priors_tar_gz = LearnReadOrientationModel.artifact_prior_table, m2_extra_filtering_args = m2_extra_filtering_args, runtime_params = standard_runtime, - disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad + disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * 4) + disk_pad } if (defined(realignment_index_bundle)) { @@ -301,7 +285,7 @@ workflow Mutect2 { reads_index = tumor_reads_index, realignment_index_bundle = select_first([realignment_index_bundle]), realignment_extra_args = realignment_extra_args, - compress = compress, + compress_vcfs = compress_vcfs, output_name = filtered_name, input_vcf = Filter.filtered_vcf, input_vcf_idx = Filter.filtered_vcf_idx, @@ -311,8 +295,6 @@ workflow Mutect2 { } } - - output { File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) File filtered_vcf_idx = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) @@ -387,7 +369,7 @@ task M2 { String? getpileupsummaries_extra_args Boolean? make_bamout Boolean? run_ob_filter - Boolean compress + Boolean compress_vcfs File? gga_vcf File? gga_vcf_idx File? 
variants_for_contamination @@ -407,8 +389,8 @@ task M2 { Boolean use_ssd = false } - String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_vcf = "output" + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" String output_stats = output_vcf + ".stats" @@ -524,12 +506,12 @@ task MergeVCFs { Array[File] input_vcfs Array[File] input_vcf_indices String output_name - Boolean compress + Boolean compress_vcfs Runtime runtime_params } - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" # using MergeVcfs instead of GatherVcfs so we can create indices # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. @@ -741,7 +723,7 @@ task Filter { File unfiltered_vcf File unfiltered_vcf_idx String output_name - Boolean compress + Boolean compress_vcfs File? mutect_stats File? artifact_priors_tar_gz File? contamination_table @@ -752,8 +734,8 @@ task Filter { Int? disk_space } - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" parameter_meta{ ref_fasta: {localization_optional: true} @@ -804,7 +786,7 @@ task FilterAlignmentArtifacts { File reads File reads_index String output_name - Boolean compress + Boolean compress_vcfs File realignment_index_bundle String? realignment_extra_args String? 
gcs_project_for_requester_pays @@ -812,8 +794,8 @@ task FilterAlignmentArtifacts { Int mem } - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" Int machine_mem = mem Int command_mem = machine_mem - 500 diff --git a/scripts/mutect2_wdl/mutect3_training_data.wdl b/scripts/mutect2_wdl/mutect3_training_data.wdl deleted file mode 100644 index 444dfb76da1..00000000000 --- a/scripts/mutect2_wdl/mutect3_training_data.wdl +++ /dev/null @@ -1,415 +0,0 @@ -version 1.0 - -import "https://raw.githubusercontent.com/gatk-workflows/gatk4-somatic-snvs-indels/2.6.0/mutect2.wdl" as m2 - -workflow Mutect3TrainingData { - input { - File? intervals - File? masks - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? pon - File? gnomad - File? variants_for_contamination - String ref_downsample - Boolean? run_orientation_bias_mixture_model_filter - File? realignment_index_bundle - String? realignment_extra_args - String? m2_extra_args - String? m2_extra_filtering_args - String? normal_artifact_extra_args - String? split_intervals_extra_args - File? truth_vcf - File? truth_vcf_idx - Boolean? make_bamout - - # runtime - String gatk_docker - File? gatk_override - Int? preemptible - Int? 
max_retries - } - - String m2_extra_args_with_training_mode = select_first([m2_extra_args, ""]) + " --training-data-mode --training-data-mode-ref-downsample " + ref_downsample - - Runtime small_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, - "max_retries": 2, "preemptible": 0, "cpu": 2, - "machine_mem": 4000, "command_mem": 3500, - "disk": 100, "boot_disk_size": 12} - - # call on the tumor (with normal if present) to get tumor read data and M2 filtering - call m2.Mutect2 as Tumor { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - tumor_reads = tumor_bam, - tumor_reads_index = tumor_bai, - normal_reads = normal_bam, - normal_reads_index = normal_bai, - intervals = intervals, - pon = pon, - gnomad = gnomad, - variants_for_contamination = variants_for_contamination, - run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter, - realignment_index_bundle = realignment_index_bundle, - realignment_extra_args = realignment_extra_args, - preemptible = preemptible, - max_retries = max_retries, - m2_extra_args = m2_extra_args_with_training_mode, - m2_extra_filtering_args = m2_extra_filtering_args, - make_bamout = make_bamout, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - if(defined(truth_vcf)) { - call Concordance { - input: - intervals = intervals, - masks = masks, - truth_vcf = select_first([truth_vcf]), - truth_vcf_idx = select_first([truth_vcf_idx]), - eval_vcf = Tumor.filtered_vcf, - eval_vcf_idx = Tumor.filtered_vcf_idx, - preemptible = preemptible, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - call MakeTableFromConcordance as TumorConcordanceTable { - input: - tpfp = Concordance.tpfp, - tpfp_idx = Concordance.tpfp_idx, - ftnfn = Concordance.ftnfn, - ftnfn_idx = Concordance.ftnfn_idx, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible = preemptible - } - } - - if(!defined(truth_vcf)) { 
- call MakeTableFromMutect2 as TumorTable { - input: - filtered_vcf = Tumor.filtered_vcf, - filtered_vcf_idx = Tumor.filtered_vcf_idx, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible = preemptible - } - } - - # call on the normal, with tumor as "matched normal", to get normal read data and M2 filtering - if(defined(normal_bam)) { - call m2.Mutect2 as Normal { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - tumor_reads = select_first([normal_bam]), - tumor_reads_index = select_first([normal_bai]), - normal_reads = tumor_bam, - normal_reads_index = tumor_bai, - intervals = intervals, - pon = pon, - gnomad = gnomad, - variants_for_contamination = variants_for_contamination, - run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter, - realignment_index_bundle = realignment_index_bundle, - realignment_extra_args = realignment_extra_args, - preemptible = preemptible, - max_retries = max_retries, - m2_extra_args = m2_extra_args_with_training_mode, - m2_extra_filtering_args = m2_extra_filtering_args, - make_bamout = make_bamout, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - # there's no reason to call concordance on the normal because the calls will have no relation to the truth VCF - - call MakeTableFromMutect2 as NormalTable { - input: - filtered_vcf = Normal.filtered_vcf, - filtered_vcf_idx = Normal.filtered_vcf_idx, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible = preemptible - } - - call m2.SplitIntervals as Split { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - split_intervals_extra_args = split_intervals_extra_args, - runtime_params = small_runtime - } - - scatter (subintervals in Split.interval_files ) { - call GetNormalArtifactData { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = 
ref_dict, - tumor_reads = select_first([normal_bam]), - tumor_reads_index = select_first([normal_bai]), - normal_reads = tumor_bam, - normal_reads_index = tumor_bai, - intervals = subintervals, - preemptible = preemptible, - max_retries = max_retries, - extra_args = normal_artifact_extra_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - } - - call MergeNormalArtifactData { - input: - input_tables = GetNormalArtifactData.table, - runtime_params = small_runtime - } - } - - output { - File tumor_table = select_first([TumorConcordanceTable.table, TumorTable.table]) - File? normal_table = NormalTable.table - File? normal_artifact_table = MergeNormalArtifactData.merged_table - } -} - -task Concordance { - input { - File? intervals - File? masks - File truth_vcf - File truth_vcf_idx - File eval_vcf - File eval_vcf_idx - - File? gatk_override - - # runtime - String gatk_docker - Int? preemptible - } - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx2g" Concordance \ - ~{"-L " + intervals} \ - ~{"-XL " + masks} \ - -truth ~{truth_vcf} -eval ~{eval_vcf} \ - -tpfp "tpfp.vcf" \ - -ftnfn "ftnfn.vcf" \ - -summary "summary.txt" - } - - runtime { - memory: "5 GB" - bootDiskSizeGb: 12 - docker: "${gatk_docker}" - disks: "local-disk " + 100 + " HDD" - preemptible: select_first([preemptible, 2]) - } - - output { - File tpfp = "tpfp.vcf" - File tpfp_idx = "tpfp.vcf.idx" - File ftnfn = "ftnfn.vcf" - File ftnfn_idx = "ftnfn.vcf.idx" - File summary = "summary.txt" - } -} - -task MakeTableFromMutect2 { - input { - File filtered_vcf - File filtered_vcf_idx - - File? gatk_override - String gatk_docker - Int? 
preemptible - } - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx2g" SelectVariants -V ~{filtered_vcf} --restrict-alleles-to BIALLELIC -O biallelic.vcf - gatk --java-options "-Xmx2g" VariantsToTable -V biallelic.vcf \ - -F CHROM -F POS -F REF -F ALT -F POPAF -F TLOD -F STATUS -F REF_BASES -F HEC -F HAPDOM -F HAPCOMP -GF DP -F FILTER -GF FRS \ - --show-filtered \ - -O output.table - } - - runtime { - memory: "5 GB" - bootDiskSizeGb: 12 - docker: "${gatk_docker}" - disks: "local-disk " + 100 + " HDD" - preemptible: select_first([preemptible, 2]) - } - - output { - File table = "output.table" - } -} - -task MakeTableFromConcordance { - input { - File tpfp - File tpfp_idx - File ftnfn - File ftnfn_idx - - File? gatk_override - String gatk_docker - Int? preemptible - } - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - for file in ~{tpfp} ~{ftnfn}; do - gatk --java-options "-Xmx2g" SelectVariants -V $file --restrict-alleles-to BIALLELIC -O biallelic.vcf - gatk --java-options "-Xmx2g" VariantsToTable -V biallelic.vcf \ - -F CHROM -F POS -F REF -F ALT -F POPAF -F TLOD -F STATUS -F REF_BASES -F HEC -F HAPDOM -F HAPCOMP -GF DP -F FILTER -GF FRS \ - --show-filtered \ - -O tmp.table - - # if it's the first table, copy it to the output; otherwise copy all but the header line - if [ ! -f output.table ]; then - mv tmp.table output.table - else - tail -n +2 tmp.table >> output.table - fi - done - } - - runtime { - memory: "5 GB" - bootDiskSizeGb: 12 - docker: "${gatk_docker}" - disks: "local-disk " + 100 + " HDD" - preemptible: select_first([preemptible, 2]) - } - - output { - File table = "output.table" - } -} - -task GetNormalArtifactData { - input { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - String? extra_args - - File? gatk_override - String? 
gcs_project_for_requester_pays - - # runtime - String gatk_docker - Int? mem - Int? preemptible - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - } - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - parameter_meta{ - intervals: {localization_optional: true} - ref_fasta: {localization_optional: true} - ref_fai: {localization_optional: true} - ref_dict: {localization_optional: true} - tumor_reads: {localization_optional: true} - tumor_reads_index: {localization_optional: true} - normal_reads: {localization_optional: true} - normal_reads_index: {localization_optional: true} - } - - command <<< - set -e - - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - - if [[ ! -z "~{normal_reads}" ]]; then - gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_reads} -O normal_name.txt -encode \ - ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} - normal_sample="`cat normal_name.txt`" - fi - - gatk --java-options "-Xmx~{command_mem}m" GetNormalArtifactData \ - -R ~{ref_fasta} ~{"-L " + intervals} -I ~{tumor_reads} -I ~{normal_reads} -O normal_artifact.table \ - -normal $normal_sample \ - ~{extra_args} ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File table = "normal_artifact.table" - } -} - -task MergeNormalArtifactData { - input { - Array[File] input_tables - Runtime runtime_params - } - - command { - set -e - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - - gatk 
--java-options "-Xmx~{runtime_params.command_mem}m" GatherNormalArtifactData \ - -I ~{sep=' -I ' input_tables} \ - -O normal_artifact.table - } - - runtime { - docker: runtime_params.gatk_docker - bootDiskSizeGb: runtime_params.boot_disk_size - memory: runtime_params.machine_mem + " MB" - disks: "local-disk " + runtime_params.disk + " HDD" - preemptible: runtime_params.preemptible - maxRetries: runtime_params.max_retries - cpu: runtime_params.cpu - } - - output { - File merged_table = "normal_artifact.table" - } -} \ No newline at end of file From 8c18d872c8afabee4376f7db7a1258a7a6850c03 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2022 23:03:13 -0400 Subject: [PATCH 04/10] simplifed file names --- scripts/mutect2_wdl/mutect2.wdl | 64 +++++++++++++-------------------- 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index ddf64bf8bf1..72cd390aa3f 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -129,14 +129,7 @@ workflow Mutect2 { # This is added to every task as padding, should increase if systematically you need more disk for every call Int disk_pad = 10 + select_first([emergency_extra_disk,0]) - - # logic about output file names -- these are the names *without* .vcf extensions - String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram - String unfiltered_name = output_basename + "-unfiltered" - String filtered_name = output_basename + "-filtered" - - String output_vcf_name = output_basename + ".vcf" - + Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, "max_retries": max_retries, "preemptible": preemptible, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, @@ -209,7 +202,6 @@ workflow Mutect2 { input: input_vcfs = M2.unfiltered_vcf, input_vcf_indices = M2.unfiltered_vcf_idx, - 
output_name = unfiltered_name, compress_vcfs = compress_vcfs, runtime_params = standard_runtime } @@ -221,7 +213,6 @@ workflow Mutect2 { ref_fai = ref_fai, ref_dict = ref_dict, bam_outs = M2.output_bamOut, - output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), runtime_params = standard_runtime, disk_space = ceil(merged_bamout_size * 4) + disk_pad, } @@ -233,7 +224,7 @@ workflow Mutect2 { call MergePileupSummaries as MergeTumorPileups { input: input_tables = flatten(M2.tumor_pileups), - output_name = output_basename, + output_name = "tumor-pileups", ref_dict = ref_dict, runtime_params = standard_runtime } @@ -242,7 +233,7 @@ workflow Mutect2 { call MergePileupSummaries as MergeNormalPileups { input: input_tables = flatten(M2.normal_pileups), - output_name = output_basename, + output_name = "normal-pileups", ref_dict = ref_dict, runtime_params = standard_runtime } @@ -264,7 +255,6 @@ workflow Mutect2 { intervals = intervals, unfiltered_vcf = MergeVCFs.merged_vcf, unfiltered_vcf_idx = MergeVCFs.merged_vcf_idx, - output_name = filtered_name, compress_vcfs = compress_vcfs, mutect_stats = MergeStats.merged_stats, contamination_table = CalculateContamination.contamination_table, @@ -286,7 +276,6 @@ workflow Mutect2 { realignment_index_bundle = select_first([realignment_index_bundle]), realignment_extra_args = realignment_extra_args, compress_vcfs = compress_vcfs, - output_name = filtered_name, input_vcf = Filter.filtered_vcf, input_vcf_idx = Filter.filtered_vcf_idx, runtime_params = standard_runtime, @@ -505,12 +494,11 @@ task MergeVCFs { input { Array[File] input_vcfs Array[File] input_vcf_indices - String output_name Boolean compress_vcfs Runtime runtime_params } - String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf = if compress_vcfs then "merged.vcf.gz" else "merged.vcf" String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" # using MergeVcfs instead of GatherVcfs so we can create indices @@ 
-543,7 +531,6 @@ task MergeBamOuts { File ref_fai File ref_dict Array[File]+ bam_outs - String output_vcf_name Runtime runtime_params Int? disk_space #override to request more disk than default small task params } @@ -560,9 +547,8 @@ task MergeBamOuts { # overlapping bamouts gatk --java-options "-Xmx~{runtime_params.command_mem}m" SortSam -I unsorted.out.bam \ - -O ~{output_vcf_name}.out.bam \ - --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT - gatk --java-options "-Xmx~{runtime_params.command_mem}m" BuildBamIndex -I ~{output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT + -O bamout.bam --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT + gatk --java-options "-Xmx~{runtime_params.command_mem}m" BuildBamIndex -I bamout.bam -VALIDATION_STRINGENCY LENIENT >>> runtime { @@ -576,8 +562,8 @@ task MergeBamOuts { } output { - File merged_bam_out = "~{output_vcf_name}.out.bam" - File merged_bam_out_index = "~{output_vcf_name}.out.bai" + File merged_bam_out = "bamout.bam" + File merged_bam_out_index = "bamout.bai" } } @@ -716,25 +702,24 @@ task CalculateContamination { task Filter { input { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File unfiltered_vcf - File unfiltered_vcf_idx - String output_name - Boolean compress_vcfs - File? mutect_stats - File? artifact_priors_tar_gz - File? contamination_table - File? maf_segments - String? m2_extra_filtering_args + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File unfiltered_vcf + File unfiltered_vcf_idx + Boolean compress_vcfs + File? mutect_stats + File? artifact_priors_tar_gz + File? contamination_table + File? maf_segments + String? m2_extra_filtering_args - Runtime runtime_params - Int? disk_space + Runtime runtime_params + Int? 
disk_space } - String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf = if compress_vcfs then "filtered.vcf.gz" else "filtered.vcf" String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" parameter_meta{ @@ -785,7 +770,6 @@ task FilterAlignmentArtifacts { File input_vcf_idx File reads File reads_index - String output_name Boolean compress_vcfs File realignment_index_bundle String? realignment_extra_args @@ -794,7 +778,7 @@ task FilterAlignmentArtifacts { Int mem } - String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf = if compress_vcfs then "filtered.vcf.gz" else "filtered.vcf" String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" Int machine_mem = mem From 1eb76184f4e6ac26e7d4cd607ff0edea48004d9c Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Fri, 5 Aug 2022 02:21:58 -0400 Subject: [PATCH 05/10] M# dataset options in M2 WDL --- scripts/mutect2_wdl/mutect2.wdl | 153 ++++++++++++++++++++++---------- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 72cd390aa3f..2594cf389e6 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -100,6 +100,10 @@ workflow Mutect2 { Boolean compress_vcfs = false File? gga_vcf File? gga_vcf_idx + Boolean make_m3_training_dataset = false + Boolean make_m3_test_dataset = false + File? m3_training_dataset_truth_vcf + File? 
m3_training_dataset_truth_vcf_idx # runtime @@ -129,7 +133,7 @@ workflow Mutect2 { # This is added to every task as padding, should increase if systematically you need more disk for every call Int disk_pad = 10 + select_first([emergency_extra_disk,0]) - + Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, "max_retries": max_retries, "preemptible": preemptible, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, @@ -179,6 +183,10 @@ workflow Mutect2 { compress_vcfs = compress_vcfs, gga_vcf = gga_vcf, gga_vcf_idx = gga_vcf_idx, + make_m3_training_dataset = make_m3_training_dataset, + make_m3_test_dataset = make_m3_test_dataset, + m3_training_dataset_truth_vcf = m3_training_dataset_truth_vcf, + m3_training_dataset_truth_vcf_idx = m3_training_dataset_truth_vcf_idx, gatk_override = gatk_override, gatk_docker = gatk_docker, disk_space = m2_per_scatter_size, @@ -247,6 +255,14 @@ workflow Mutect2 { } } + if (make_m3_training_dataset || make_m3_test_dataset) { + call Concatenate { + input: + input_files = M2.m3_dataset, + gatk_docker = gatk_docker + } + } + call Filter { input: ref_fasta = ref_fasta, @@ -295,6 +311,7 @@ workflow Mutect2 { File? bamout_index = MergeBamOuts.merged_bam_out_index File? maf_segments = CalculateContamination.maf_segments File? read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table + File? m3_dataset = Concatenate.concatenated } } @@ -342,40 +359,45 @@ task SplitIntervals { task M2 { input { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - File? pon - File? pon_idx - File? gnomad - File? gnomad_idx - String? m2_extra_args - String? getpileupsummaries_extra_args - Boolean? make_bamout - Boolean? run_ob_filter - Boolean compress_vcfs - File? gga_vcf - File? gga_vcf_idx - File? variants_for_contamination - File? 
variants_for_contamination_idx + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_reads + File tumor_reads_index + File? normal_reads + File? normal_reads_index + File? pon + File? pon_idx + File? gnomad + File? gnomad_idx + String? m2_extra_args + String? getpileupsummaries_extra_args + Boolean? make_bamout + Boolean? run_ob_filter + Boolean compress_vcfs + File? gga_vcf + File? gga_vcf_idx + File? variants_for_contamination + File? variants_for_contamination_idx - File? gatk_override + File? gatk_override - String? gcs_project_for_requester_pays + String? gcs_project_for_requester_pays - # runtime - String gatk_docker - Int? mem - Int? preemptible - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false + Boolean make_m3_training_dataset = false + Boolean make_m3_test_dataset = false + File? m3_training_dataset_truth_vcf + File? m3_training_dataset_truth_vcf_idx + + # runtime + String gatk_docker + Int? mem + Int? preemptible + Int? max_retries + Int? disk_space + Int? 
cpu + Boolean use_ssd = false } String output_vcf = "output" + if compress_vcfs then ".vcf.gz" else ".vcf" @@ -388,22 +410,24 @@ task M2 { Int command_mem = machine_mem - 500 parameter_meta{ - intervals: {localization_optional: true} - ref_fasta: {localization_optional: true} - ref_fai: {localization_optional: true} - ref_dict: {localization_optional: true} - tumor_reads: {localization_optional: true} - tumor_reads_index: {localization_optional: true} - normal_reads: {localization_optional: true} - normal_reads_index: {localization_optional: true} - pon: {localization_optional: true} - pon_idx: {localization_optional: true} - gnomad: {localization_optional: true} - gnomad_idx: {localization_optional: true} - gga_vcf: {localization_optional: true} - gga_vcf_idx: {localization_optional: true} - variants_for_contamination: {localization_optional: true} - variants_for_contamination_idx: {localization_optional: true} + intervals: {localization_optional: true} + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + tumor_reads: {localization_optional: true} + tumor_reads_index: {localization_optional: true} + normal_reads: {localization_optional: true} + normal_reads_index: {localization_optional: true} + pon: {localization_optional: true} + pon_idx: {localization_optional: true} + gnomad: {localization_optional: true} + gnomad_idx: {localization_optional: true} + gga_vcf: {localization_optional: true} + gga_vcf_idx: {localization_optional: true} + variants_for_contamination: {localization_optional: true} + variants_for_contamination_idx: {localization_optional: true} + m3_training_dataset_truth_vcf: {localization_optional: true} + m3_training_dataset_truth_vcf_idx: {localization_optional: true} } command <<< @@ -414,6 +438,7 @@ task M2 { # We need to create these files regardless, even if they stay empty touch bamout.bam touch f1r2.tar.gz + touch dataset.txt echo "" > normal_name.txt gatk 
--java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_reads} -O tumor_name.txt -encode \ @@ -437,6 +462,9 @@ task M2 { -O "~{output_vcf}" \ ~{true='--bam-output bamout.bam' false='' make_bamout} \ ~{true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ + ~{true='--mutect3-dataset dataset.txt' false='' make_m3_test_dataset} \ + ~{true='--mutect3-dataset dataset.txt --mutect3-training-mode' false='' make_m3_training_dataset} \ + ~{"--mutect3-training-truth " + m3_training_dataset_truth_vcf} \ ~{m2_extra_args} \ ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} @@ -487,6 +515,7 @@ task M2 { File f1r2_counts = "f1r2.tar.gz" Array[File] tumor_pileups = glob("*tumor-pileups.table") Array[File] normal_pileups = glob("*normal-pileups.table") + File m3_dataset = "dataset.txt" } } @@ -825,3 +854,31 @@ task FilterAlignmentArtifacts { } } +task Concatenate { + input { + Array[File] input_files + Int? mem + String gatk_docker + } + + Int machine_mem = if defined(mem) then mem * 1000 else 7000 + + command { + cat ~{sep=' ' input_files} > output.txt + } + + runtime { + docker: gatk_docker + bootDiskSizeGb: 12 + memory: machine_mem + " MB" + disks: "local-disk 100 HDD" + preemptible: 1 + maxRetries: 1 + cpu: 2 + } + + output { + File concatenated = "output.txt" + } +} + From 121bb700b99f513e8e5eaa19ef72360c198c57c2 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Tue, 16 Aug 2022 19:15:19 -0400 Subject: [PATCH 06/10] fix a very rare bug in Mutect3DatasetEngine --- .../tools/walkers/annotator/AssemblyComplexity.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java index bd1226975d4..c3220d1a865 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java @@ -117,7 +117,10 @@ public static Triple annotate(final VariantContext vc, .filter(hap -> containsAltAllele(hap.getEventMap(), vc, altAlleleIndex)) .mapToInt(hap -> haplotypeSupportCounts.get(hap).intValue()) .toArray(); - return MathUtils.arrayMax(counts) / (double) MathUtils.sum(counts); + // a very rare edge case occurs when no haplotypes containing the allele exist with non-zero read support. + // If this occurs, we set the dominance to 1 / the number of haplotypes. + final int maxCount = MathUtils.arrayMax(counts); + return maxCount == 0 ? (1 / (double) haplotypesByDescendingSupport.size()) : maxCount / (double) MathUtils.sum(counts); }).toArray(); return Triple.of(equivalenceCounts, editDistances, haplotypeDominance); From 724bb29d3ac2cbef8430cb29b8baa42d24f07a47 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 17 Aug 2022 16:18:02 -0400 Subject: [PATCH 07/10] tidying PON WDL --- scripts/m2_cromwell_tests/run_m2_wdl.sh | 2 +- scripts/mutect2_wdl/mutect2_pon.wdl | 22 +++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/scripts/m2_cromwell_tests/run_m2_wdl.sh b/scripts/m2_cromwell_tests/run_m2_wdl.sh index b676f58b463..9e11623aaaf 100644 --- a/scripts/m2_cromwell_tests/run_m2_wdl.sh +++ b/scripts/m2_cromwell_tests/run_m2_wdl.sh @@ -37,7 +37,7 @@ echo "Putting the newly built docker image into the json parameters" cd $WORKING_DIR/gatk/scripts/ sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_m2_wdl.json >$WORKING_DIR/test_m2_wdl_mod.json echo "JSON FILE (modified) =======" -cat $WORKING_DIR/test_m2_wdl_multi_mod.json +cat $WORKING_DIR/test_m2_wdl_mod.json sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_mitochondria_m2_wdl.json >$WORKING_DIR/test_mitochondria_m2_wdl_mod.json echo "JSON FILE (modified) =======" cat 
$WORKING_DIR/test_mitochondria_m2_wdl_mod.json diff --git a/scripts/mutect2_wdl/mutect2_pon.wdl b/scripts/mutect2_wdl/mutect2_pon.wdl index 0da400ca584..b9d5956bee7 100644 --- a/scripts/mutect2_wdl/mutect2_pon.wdl +++ b/scripts/mutect2_wdl/mutect2_pon.wdl @@ -25,11 +25,11 @@ workflow Mutect2_Panel { File gnomad_idx String? m2_extra_args String? create_pon_extra_args - Boolean? compress + Boolean compress = false String pon_name - Int? min_contig_size - Int? create_panel_scatter_count + Int min_contig_size = 1000000 + Int create_panel_scatter_count = 24 String? gcs_project_for_requester_pays @@ -38,8 +38,8 @@ workflow Mutect2_Panel { File? gatk_override String basic_bash_docker = "ubuntu:16.04" - Int? preemptible - Int? max_retries + Int preemptible = 2 + Int max_retries = 2 Int small_task_cpu = 2 Int small_task_mem = 4 Int small_task_disk = 100 @@ -49,12 +49,8 @@ workflow Mutect2_Panel { Int? emergency_extra_disk } - Int contig_size = select_first([min_contig_size, 1000000]) - Int preemptible_or_default = select_first([preemptible, 2]) - Int max_retries_or_default = select_first([max_retries, 2]) - Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, - "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu, + "max_retries": max_retries, "preemptible": preemptible, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, "disk": small_task_disk, "boot_disk_size": boot_disk_size} @@ -82,8 +78,8 @@ workflow Mutect2_Panel { ref_fasta = ref_fasta, ref_fai = ref_fai, ref_dict = ref_dict, - scatter_count = select_first([create_panel_scatter_count, 24]), - split_intervals_extra_args = "--dont-mix-contigs --min-contig-size " + contig_size, + scatter_count = create_panel_scatter_count, + split_intervals_extra_args = "--dont-mix-contigs --min-contig-size " + min_contig_size, runtime_params = standard_runtime } @@ -108,7 +104,7 @@ workflow 
Mutect2_Panel { input_vcfs = CreatePanel.output_vcf, input_vcf_indices = CreatePanel.output_vcf_index, output_name = pon_name, - compress = select_first([compress, false]), + compress = compress, runtime_params = standard_runtime } From 8267742927fa97b792b9283a135dce247314ad07 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 17 Aug 2022 16:43:50 -0400 Subject: [PATCH 08/10] few little womtool error fixes --- scripts/mutect2_wdl/mutect2_pon.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/mutect2_wdl/mutect2_pon.wdl b/scripts/mutect2_wdl/mutect2_pon.wdl index b9d5956bee7..46e41e720ed 100644 --- a/scripts/mutect2_wdl/mutect2_pon.wdl +++ b/scripts/mutect2_wdl/mutect2_pon.wdl @@ -103,8 +103,7 @@ workflow Mutect2_Panel { input: input_vcfs = CreatePanel.output_vcf, input_vcf_indices = CreatePanel.output_vcf_index, - output_name = pon_name, - compress = compress, + compress_vcfs = compress, runtime_params = standard_runtime } From 14e25048992044252939985539180c8f2e43a8fb Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 17 Aug 2022 17:02:09 -0400 Subject: [PATCH 09/10] ditto --- scripts/mutect2_wdl/mutect2.wdl | 2 +- scripts/mutect2_wdl/mutect2_pon.wdl | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 2594cf389e6..9ff90cedef8 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -132,7 +132,7 @@ workflow Mutect2 { Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 # This is added to every task as padding, should increase if systematically you need more disk for every call - Int disk_pad = 10 + select_first([emergency_extra_disk,0]) + Int disk_pad = 10 + emergency_extra_disk Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, "max_retries": max_retries, "preemptible": preemptible, "cpu": 
small_task_cpu, diff --git a/scripts/mutect2_wdl/mutect2_pon.wdl b/scripts/mutect2_wdl/mutect2_pon.wdl index 46e41e720ed..9b750a8c80f 100644 --- a/scripts/mutect2_wdl/mutect2_pon.wdl +++ b/scripts/mutect2_wdl/mutect2_pon.wdl @@ -44,9 +44,6 @@ workflow Mutect2_Panel { Int small_task_mem = 4 Int small_task_disk = 100 Int boot_disk_size = 12 - - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? emergency_extra_disk } Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, From efd6394d829b6740d3dc5df0dc0483662ad753d9 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 17 Aug 2022 17:25:11 -0400 Subject: [PATCH 10/10] whoops, more fixing json --- scripts/m2_cromwell_tests/mutect2.inputs.json | 4 ---- scripts/m2_cromwell_tests/test_m2_wdl.json | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/m2_cromwell_tests/mutect2.inputs.json b/scripts/m2_cromwell_tests/mutect2.inputs.json index 4c7a08074eb..e67ed3c7356 100644 --- a/scripts/m2_cromwell_tests/mutect2.inputs.json +++ b/scripts/m2_cromwell_tests/mutect2.inputs.json @@ -4,10 +4,6 @@ "Mutect2.intervals": "gs://gatk-best-practices/somatic-b37/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list", "Mutect2.scatter_count": 50, "Mutect2.m2_extra_args": "--downsampling-stride 20 --max-reads-per-alignment-start 6 --max-suspicious-reads-per-alignment-start 6", - "Mutect2.filter_funcotations": "True", - "Mutect2.funco_reference_version": "hg19", - "Mutect2.funco_data_sources_tar_gz": "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz", - "Mutect2.funco_transcript_selection_list": "gs://broad-public-datasets/funcotator/transcriptList.exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt", "Mutect2.ref_fasta": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta", "Mutect2.ref_dict": 
"gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.dict", diff --git a/scripts/m2_cromwell_tests/test_m2_wdl.json b/scripts/m2_cromwell_tests/test_m2_wdl.json index e8f85f4bbdc..259bb43ae45 100644 --- a/scripts/m2_cromwell_tests/test_m2_wdl.json +++ b/scripts/m2_cromwell_tests/test_m2_wdl.json @@ -10,8 +10,7 @@ "Mutect2.normal_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam.bai", "Mutect2.scatter_count": 2, "Mutect2.run_orientation_bias_mixture_model_filter": true, - "Mutect2.run_funcotator": true, - "Mutect2.preemptible_attempts": 2, + "Mutect2.preemptible": 2, "Mutect2.compress_vcfs": false, "Mutect2.make_bamout": true } \ No newline at end of file