From 14bcb36d7ee10d783e58e1234a01725a0bb9cc99 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2022 19:17:06 -0400 Subject: [PATCH 01/10] continuous integration M2 tests single pair --- scripts/m2_cromwell_tests/pair_list | 2 - .../m2_cromwell_tests/pair_list_tumor_only | 2 - scripts/m2_cromwell_tests/run_m2_wdl.sh | 12 +- scripts/m2_cromwell_tests/test_m2_wdl.json | 19 +++ .../m2_cromwell_tests/test_m2_wdl_multi.json | 16 --- scripts/mutect2_wdl/mutect2_multi_sample.wdl | 124 ------------------ 6 files changed, 25 insertions(+), 150 deletions(-) delete mode 100644 scripts/m2_cromwell_tests/pair_list delete mode 100644 scripts/m2_cromwell_tests/pair_list_tumor_only create mode 100644 scripts/m2_cromwell_tests/test_m2_wdl.json delete mode 100644 scripts/m2_cromwell_tests/test_m2_wdl_multi.json delete mode 100644 scripts/mutect2_wdl/mutect2_multi_sample.wdl diff --git a/scripts/m2_cromwell_tests/pair_list b/scripts/m2_cromwell_tests/pair_list deleted file mode 100644 index 962fe49cca2..00000000000 --- a/scripts/m2_cromwell_tests/pair_list +++ /dev/null @@ -1,2 +0,0 @@ -/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam.bai /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam.bai -/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_2.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_2.bam.bai /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_2.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_2.bam.bai diff --git a/scripts/m2_cromwell_tests/pair_list_tumor_only b/scripts/m2_cromwell_tests/pair_list_tumor_only deleted file mode 100644 
index 9498e7e342f..00000000000 --- a/scripts/m2_cromwell_tests/pair_list_tumor_only +++ /dev/null @@ -1,2 +0,0 @@ -/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam.bai -/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_2.bam /home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_2.bam.bai \ No newline at end of file diff --git a/scripts/m2_cromwell_tests/run_m2_wdl.sh b/scripts/m2_cromwell_tests/run_m2_wdl.sh index f7f863e765b..b676f58b463 100644 --- a/scripts/m2_cromwell_tests/run_m2_wdl.sh +++ b/scripts/m2_cromwell_tests/run_m2_wdl.sh @@ -12,7 +12,7 @@ echo "Creating tar.gz for Funcotator datasources ==========" pushd . FUNCOTATOR_TEST_DS_DIR=${WORKING_DIR}/gatk/src/test/resources/large/funcotator/ cd ${FUNCOTATOR_TEST_DS_DIR} -# First parameter must match Mutect2_Multi.funco_data_sources_tar_gz test_m2_wdl_multi.json +# First parameter must match Mutect2_Multi.funco_data_sources_tar_gz test_m2_wdl.json tar zcvf ${WORKING_DIR}/gatk/small_ds_pik3ca.tar.gz small_ds_pik3ca/* popd @@ -35,7 +35,7 @@ fi echo "Docker build done ==========" echo "Putting the newly built docker image into the json parameters" cd $WORKING_DIR/gatk/scripts/ -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_m2_wdl_multi.json >$WORKING_DIR/test_m2_wdl_multi_mod.json +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_m2_wdl.json >$WORKING_DIR/test_m2_wdl_mod.json echo "JSON FILE (modified) =======" -cat $WORKING_DIR/test_m2_wdl_multi_mod.json +cat $WORKING_DIR/test_m2_wdl_mod.json sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_mitochondria_m2_wdl.json >$WORKING_DIR/test_mitochondria_m2_wdl_mod.json @@ -43,16 +43,16 @@ echo "JSON FILE (modified) =======" cat $WORKING_DIR/test_mitochondria_m2_wdl_mod.json echo 
"==================" -# Create the tumor-only json by using the pair_list_tumor_only file -sed -r "s/\"pair_list/\"pair_list_tumor_only/g" $WORKING_DIR/test_m2_wdl_multi_mod.json >$WORKING_DIR/test_m2_wdl_multi_mod_to.json +# Create the tumor-only json by removing normal_reads and normal_reads_index from the input json +grep -v 'Mutect2.normal_reads' $WORKING_DIR/test_m2_wdl_mod.json >$WORKING_DIR/test_m2_wdl_mod_to.json cd $WORKING_DIR/ echo "Running M2 WDL through cromwell (T/N)" ln -fs $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2.wdl -sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2_multi_sample.wdl -i $WORKING_DIR/test_m2_wdl_multi_mod.json -m $WORKING_DIR/test_m2_wdl.metadata +sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2.wdl -i $WORKING_DIR/test_m2_wdl_mod.json -m $WORKING_DIR/test_m2_wdl.metadata echo "Running M2 WDL through cromwell (Tumor-only)" -sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2_multi_sample.wdl -i $WORKING_DIR/test_m2_wdl_multi_mod_to.json -m $WORKING_DIR/test_m2_wdl_to.metadata +sudo java -jar $CROMWELL_JAR run $WORKING_DIR/gatk/scripts/mutect2_wdl/mutect2.wdl -i $WORKING_DIR/test_m2_wdl_mod_to.json -m $WORKING_DIR/test_m2_wdl_to.metadata echo "Running Mitochondria M2 WDL through cromwell" ln -fs $WORKING_DIR/gatk/scripts/mitochondria_m2_wdl/AlignAndCall.wdl diff --git a/scripts/m2_cromwell_tests/test_m2_wdl.json b/scripts/m2_cromwell_tests/test_m2_wdl.json new file mode 100644 index 00000000000..4a3073b7fbd --- /dev/null +++ b/scripts/m2_cromwell_tests/test_m2_wdl.json @@ -0,0 +1,19 @@ +{ + "Mutect2.gatk_docker": "__GATK_DOCKER__", + "Mutect2.intervals": "/home/runner/work/gatk/gatk/scripts/m2_cromwell_tests/interval_list.interval_list", + "Mutect2.ref_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta", + "Mutect2.ref_fai": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai", 
+ "Mutect2.ref_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict", + "Mutect2.tumor_reads": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam", + "Mutect2.tumor_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam.bai", + "Mutect2.normal_reads": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam", + "Mutect2.normal_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam.bai", + "Mutect2.funco_data_sources_tar_gz": "/home/runner/work/gatk/gatk/small_ds_pik3ca.tar.gz", + "Mutect2.funco_reference_version": "hg19", + "Mutect2.scatter_count": 2, + "Mutect2.run_orientation_bias_mixture_model_filter": true, + "Mutect2.run_funcotator": true, + "Mutect2.preemptible_attempts": 2, + "Mutect2.compress_vcfs": false, + "Mutect2.make_bamout": true +} \ No newline at end of file diff --git a/scripts/m2_cromwell_tests/test_m2_wdl_multi.json b/scripts/m2_cromwell_tests/test_m2_wdl_multi.json deleted file mode 100644 index d81b0852158..00000000000 --- a/scripts/m2_cromwell_tests/test_m2_wdl_multi.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "Mutect2_Multi.gatk_docker": "__GATK_DOCKER__", - "Mutect2_Multi.intervals": "/home/runner/work/gatk/gatk/scripts/m2_cromwell_tests/interval_list.interval_list", - "Mutect2_Multi.ref_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta", - "Mutect2_Multi.ref_fai": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai", - "Mutect2_Multi.ref_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict", - "Mutect2_Multi.pair_list": "/home/runner/work/gatk/gatk/scripts/m2_cromwell_tests/pair_list", - "Mutect2_Multi.funco_data_sources_tar_gz": "/home/runner/work/gatk/gatk/small_ds_pik3ca.tar.gz", - "Mutect2_Multi.funco_reference_version": 
"hg19", - "Mutect2_Multi.scatter_count": 2, - "Mutect2_Multi.run_orientation_bias_mixture_model_filter": true, - "Mutect2_Multi.run_funcotator": true, - "Mutect2_Multi.preemptible_attempts": 2, - "Mutect2_Multi.compress_vcfs": false, - "Mutect2_Multi.make_bamout": true -} \ No newline at end of file diff --git a/scripts/mutect2_wdl/mutect2_multi_sample.wdl b/scripts/mutect2_wdl/mutect2_multi_sample.wdl deleted file mode 100644 index 61442704914..00000000000 --- a/scripts/mutect2_wdl/mutect2_multi_sample.wdl +++ /dev/null @@ -1,124 +0,0 @@ -version 1.0 - -# Run Mutect 2 on a list of tumors or tumor-normal pairs -# -# Description of inputs -# intervals: genomic intervals -# ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary -# pon, pon_idx: optional panel of normals and index in vcf format containing known false positves -# scatter_count: number of parallel jobs when scattering over intervals -# gnomad, gnomad_idx: optional database of known germline variants, obtainable from http://gnomad.broadinstitute.org/downloads -# variants_for_contamination, variants_for_contamination_idx: vcf of common variants with allele frequencies fo calculating contamination -# run_orientation_bias_filter: if true, run the orientation bias filter post-processing step -# pair_list: a tab-separated table with no header in the following format: -# TUMOR_1_BAMTUMOR_1_baiNORMAL_1_BAMNORMAL_1_bai -# TUMOR_2_BAMTUMOR_2_baiNORMAL_2_BAMNORMAL_2_bai -# . . . -# Tumor-only input is the same but without the columns for the normal: -# TUMOR_1_BAMTUMOR_1_bai -# TUMOR_2_BAMTUMOR_2_bai -# . . . - -import "mutect2.wdl" as m2 - -workflow Mutect2_Multi { - input { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File pair_list - - File? pon - File? pon_idx - File? gnomad - File? gnomad_idx - File? variants_for_contamination - File? variants_for_contamination_idx - Boolean? run_orientation_bias_mixture_model_filter - Int scatter_count - String? m2_extra_args - String? 
m2_extra_filtering_args - Boolean? compress_vcfs - Boolean? make_bamout - - String? gcs_project_for_requester_pays - - # Oncotator inputs - String? sequencing_center - String? sequence_source - - # funcotator inputs - Boolean? run_funcotator - String? funco_reference_version - File? funco_data_sources_tar_gz - String? funco_transcript_selection_mode - File? funco_transcript_selection_list - Array[String]? funco_annotation_defaults - Array[String]? funco_annotation_overrides - - - # runtime - String gatk_docker - Int? preemptible_attempts - File? gatk_override - } - - Array[Array[String]] pairs = read_tsv(pair_list) - - scatter( row in pairs ) { - # If the condition is true, variables inside the 'if' block retain their values outside the block. - # Otherwise they are treated as null, which in WDL is equivalent to an empty optional - if(length(row) == 4) { - File normal_bam = row[2] - File normal_bai = row[3] - } - - call m2.Mutect2 { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_reads = row[0], - tumor_reads_index = row[1], - normal_reads = normal_bam, - normal_reads_index = normal_bai, - pon = pon, - pon_idx = pon_idx, - scatter_count = scatter_count, - gnomad = gnomad, - gnomad_idx = gnomad_idx, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_idx = variants_for_contamination_idx, - run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - sequencing_center = sequencing_center, - sequence_source = sequence_source, - run_funcotator = run_funcotator, - funco_reference_version = funco_reference_version, - funco_data_sources_tar_gz = funco_data_sources_tar_gz, - funco_transcript_selection_mode = funco_transcript_selection_mode, - funco_transcript_selection_list = funco_transcript_selection_list, - funco_annotation_defaults = funco_annotation_defaults, 
- funco_annotation_overrides = funco_annotation_overrides, - - make_bamout = make_bamout, - compress_vcfs = compress_vcfs, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible = preemptible_attempts, - gcs_project_for_requester_pays = gcs_project_for_requester_pays - } - } - - output { - Array[File] filtered_vcf = Mutect2.filtered_vcf - Array[File] filtered_vcf_idx = Mutect2.filtered_vcf_idx - Array[File?] contamination_tables = Mutect2.contamination_table - - Array[File?] m2_bamout = Mutect2.bamout - Array[File?] m2_bamout_index = Mutect2.bamout_index - } -} From b293f8d34adbca32a67a4b1e992ec7638dc6f12b Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2022 19:22:11 -0400 Subject: [PATCH 02/10] remove Funcotator from M2 WDL (Funcotator has its own WDL) --- scripts/m2_cromwell_tests/test_m2_wdl.json | 2 - scripts/mutect2_wdl/mutect2.wdl | 210 +-------------------- 2 files changed, 4 insertions(+), 208 deletions(-) diff --git a/scripts/m2_cromwell_tests/test_m2_wdl.json b/scripts/m2_cromwell_tests/test_m2_wdl.json index 4a3073b7fbd..e8f85f4bbdc 100644 --- a/scripts/m2_cromwell_tests/test_m2_wdl.json +++ b/scripts/m2_cromwell_tests/test_m2_wdl.json @@ -8,8 +8,6 @@ "Mutect2.tumor_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/tumor_1.bam.bai", "Mutect2.normal_reads": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam", "Mutect2.normal_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam.bai", - "Mutect2.funco_data_sources_tar_gz": "/home/runner/work/gatk/gatk/small_ds_pik3ca.tar.gz", - "Mutect2.funco_reference_version": "hg19", "Mutect2.scatter_count": 2, "Mutect2.run_orientation_bias_mixture_model_filter": true, "Mutect2.run_funcotator": true, diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 1d02f9fb45e..638b4e054cd 100755 --- 
a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -3,7 +3,7 @@ version 1.0 ## Copyright Broad Institute, 2017 ## ## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample, -## and performs additional filtering and functional annotation tasks. +## and performs additional filtering. ## ## Main requirements/expectations : ## - One analysis-ready BAM file (and its index) for each sample @@ -38,22 +38,8 @@ version 1.0 ## ** Secondary resources ** (for optional tasks) ## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator. ## -## Funcotator parameters (see Funcotator help for more details). -## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19" -## funco_output_format: "MAF" to produce a MAF file, "VCF" to procude a VCF file. Default: "MAF" -## funco_compress: (Only valid if funco_output_format == "VCF" ) If true, will compress the output of Funcotator. If false, produces an uncompressed output file. Default: false -## funco_use_gnomad_AF: If true, will include gnomAD allele frequency annotations in output by connecting to the internet to query gnomAD (this impacts performance). If false, will not annotate with gnomAD. Default: false -## funco_transcript_selection_mode: How to select transcripts in Funcotator. ALL, CANONICAL, or BEST_EFFECT -## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process. -## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud. -## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as :. For example: "Center:Broad" -## funco_annotation_overrides: Values for annotations, even when values are unspecified. Specified as :. 
For example: "Center:Broad" -## funcotator_excluded_fields: Annotations that should not appear in the output (VCF or MAF). Specified as . For example: "ClinVar_ALLELEID" -## funco_filter_funcotations: If true, will only annotate variants that have passed filtering (. or PASS value in the FILTER column). If false, will annotate all variants in the input file. Default: true -## funcotator_extra_args: Any additional arguments to pass to Funcotator. Default: "" -## ## Outputs : -## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam +## - One VCF file and its index with primary filtering applied; secondary filtering if requested; a bamout.bam ## file of reassembled reads if requested ## ## Cromwell version support @@ -111,30 +97,10 @@ workflow Mutect2 { File? gga_vcf_idx String? gcs_project_for_requester_pays - # Funcotator inputs - Boolean? run_funcotator - String? sequencing_center - String? sequence_source - String? funco_reference_version - String? funco_output_format - Boolean? funco_compress - Boolean? funco_use_gnomad_AF - File? funco_data_sources_tar_gz - String? funco_transcript_selection_mode - File? funco_transcript_selection_list - Array[String]? funco_annotation_defaults - Array[String]? funco_annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? funco_filter_funcotations - String? funcotator_extra_args - - String funco_default_output_format = "MAF" - # runtime String gatk_docker File? gatk_override String basic_bash_docker = "ubuntu:16.04" - Boolean? filter_funcotations Int? preemptible Int? 
max_retries @@ -162,8 +128,7 @@ workflow Mutect2 { Boolean compress = select_first([compress_vcfs, false]) Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) Boolean make_bamout_or_default = select_first([make_bamout, false]) - Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - Boolean filter_funcotations_or_default = select_first([filter_funcotations, true]) + # Disk sizes used for dynamic sizing Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) @@ -172,7 +137,6 @@ workflow Mutect2 { Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 # If no tar is provided, the task downloads one from broads ftp server - Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100 Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 # This is added to every task as padding, should increase if systematically you need more disk for every call @@ -182,7 +146,6 @@ workflow Mutect2 { String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram String unfiltered_name = output_basename + "-unfiltered" String filtered_name = output_basename + "-filtered" - String funcotated_name = output_basename + "-funcotated" String output_vcf_name = output_basename + ".vcf" @@ -348,37 +311,7 @@ workflow Mutect2 { } } - if (run_funcotator_or_default) { - File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) - File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) - call Funcotate { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - input_vcf = funcotate_vcf_input, - input_vcf_idx = funcotate_vcf_input_index, - reference_version = 
select_first([funco_reference_version, "hg19"]), - output_file_base_name = basename(funcotate_vcf_input, ".vcf") + ".annotated", - output_format = if defined(funco_output_format) then "" + funco_output_format else funco_default_output_format, - compress = if defined(funco_compress) then select_first([funco_compress]) else false, - use_gnomad = if defined(funco_use_gnomad_AF) then select_first([funco_use_gnomad_AF]) else false, - data_sources_tar_gz = funco_data_sources_tar_gz, - case_id = M2.tumor_sample[0], - control_id = M2.normal_sample[0], - sequencing_center = sequencing_center, - sequence_source = sequence_source, - transcript_selection_mode = funco_transcript_selection_mode, - transcript_selection_list = funco_transcript_selection_list, - annotation_defaults = funco_annotation_defaults, - annotation_overrides = funco_annotation_overrides, - funcotator_excluded_fields = funcotator_excluded_fields, - filter_funcotations = filter_funcotations_or_default, - extra_args = funcotator_extra_args, - runtime_params = standard_runtime, - disk_space = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad - } - } + output { File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) @@ -387,8 +320,6 @@ workflow Mutect2 { File mutect_stats = MergeStats.merged_stats File? contamination_table = CalculateContamination.contamination_table - File? funcotated_file = Funcotate.funcotated_output_file - File? funcotated_file_index = Funcotate.funcotated_output_file_index File? bamout = MergeBamOuts.merged_bam_out File? bamout_index = MergeBamOuts.merged_bam_out_index File? 
maf_segments = CalculateContamination.maf_segments @@ -928,136 +859,3 @@ task FilterAlignmentArtifacts { } } -task Funcotate { - input { - File ref_fasta - File ref_fai - File ref_dict - File input_vcf - File input_vcf_idx - String reference_version - String output_file_base_name - String output_format - Boolean compress - Boolean use_gnomad - # This should be updated when a new version of the data sources is released - # TODO: Make this dynamically chosen in the command. - File? data_sources_tar_gz = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.7.20200521s.tar.gz" - String? control_id - String? case_id - String? sequencing_center - String? sequence_source - String? transcript_selection_mode - File? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - Array[String]? funcotator_excluded_fields - Boolean? filter_funcotations - File? interval_list - - String? extra_args - String? gcs_project_for_requester_pays - - # ============== - Runtime runtime_params - Int? disk_space #override to request more disk than default small task params - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 3000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples. 
- Int default_disk_space_gb = 100 - } - - # ============== - # Process input args: - String output_maf = output_file_base_name + ".maf" - String output_maf_index = output_maf + ".idx" - String output_vcf = output_file_base_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" - String output_file = if output_format == "MAF" then output_maf else output_vcf - String output_file_index = if output_format == "MAF" then output_maf_index else output_vcf_idx - String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else "" - String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else "" - String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else "" - String filter_funcotations_args = if defined(filter_funcotations) && (filter_funcotations) then " --remove-filtered-variants " else "" - String excluded_fields_args = if defined(funcotator_excluded_fields) then " --exclude-field " else "" - String interval_list_arg = if defined(interval_list) then " -L " else "" - String extra_args_arg = select_first([extra_args, ""]) - - String dollar = "$" - - parameter_meta{ - ref_fasta: {localization_optional: true} - ref_fai: {localization_optional: true} - ref_dict: {localization_optional: true} - input_vcf: {localization_optional: true} - input_vcf_idx: {localization_optional: true} - } - - command <<< - set -e - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - - # Extract our data sources: - echo "Extracting data sources zip file..." - mkdir datasources_dir - tar zxvf ~{data_sources_tar_gz} -C datasources_dir --strip-components 1 - DATA_SOURCES_FOLDER="$PWD/datasources_dir" - - # Handle gnomAD: - if ~{use_gnomad} ; then - echo "Enabling gnomAD..." 
- for potential_gnomad_gz in gnomAD_exome.tar.gz gnomAD_genome.tar.gz ; do - if [[ -f ~{dollar}{DATA_SOURCES_FOLDER}/~{dollar}{potential_gnomad_gz} ]] ; then - cd ~{dollar}{DATA_SOURCES_FOLDER} - tar -zvxf ~{dollar}{potential_gnomad_gz} - cd - - else - echo "ERROR: Cannot find gnomAD folder: ~{dollar}{potential_gnomad_gz}" 1>&2 - false - fi - done - fi - - # Run Funcotator: - gatk --java-options "-Xmx~{runtime_params.command_mem}m" Funcotator \ - --data-sources-path $DATA_SOURCES_FOLDER \ - --ref-version ~{reference_version} \ - --output-file-format ~{output_format} \ - -R ~{ref_fasta} \ - -V ~{input_vcf} \ - -O ~{output_file} \ - ~{interval_list_arg} ~{default="" interval_list} \ - --annotation-default normal_barcode:~{default="Unknown" control_id} \ - --annotation-default tumor_barcode:~{default="Unknown" case_id} \ - --annotation-default Center:~{default="Unknown" sequencing_center} \ - --annotation-default source:~{default="Unknown" sequence_source} \ - ~{"--transcript-selection-mode " + transcript_selection_mode} \ - ~{transcript_selection_arg}~{default="" sep=" --transcript-list " transcript_selection_list} \ - ~{annotation_def_arg}~{default="" sep=" --annotation-default " annotation_defaults} \ - ~{annotation_over_arg}~{default="" sep=" --annotation-override " annotation_overrides} \ - ~{excluded_fields_args}~{default="" sep=" --exclude-field " funcotator_excluded_fields} \ - ~{filter_funcotations_args} \ - ~{extra_args_arg} \ - ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} - # Make sure we have a placeholder index for MAF files so this workflow doesn't fail: - if [[ "~{output_format}" == "MAF" ]] ; then - touch ~{output_maf_index} - fi - >>> - - runtime { - docker: runtime_params.gatk_docker - bootDiskSizeGb: runtime_params.boot_disk_size - memory: runtime_params.machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, runtime_params.disk]) + " HDD" - preemptible: runtime_params.preemptible - maxRetries: 
runtime_params.max_retries - cpu: runtime_params.cpu - } - - output { - File funcotated_output_file = "~{output_file}" - File funcotated_output_file_index = "~{output_file_index}" - } -} From ec011233efc7fdcf11122de462b9eff1e1003753 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2022 22:49:17 -0400 Subject: [PATCH 03/10] simplify default inputs --- scripts/mutect2_wdl/mutect2.wdl | 168 ++++--- scripts/mutect2_wdl/mutect3_training_data.wdl | 415 ------------------ 2 files changed, 75 insertions(+), 508 deletions(-) delete mode 100644 scripts/mutect2_wdl/mutect3_training_data.wdl diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 638b4e054cd..ddf64bf8bf1 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -68,79 +68,67 @@ struct Runtime { workflow Mutect2 { input { - # Mutect2 inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - File? pon - File? pon_idx - Int scatter_count - File? gnomad - File? gnomad_idx - File? variants_for_contamination - File? variants_for_contamination_idx - File? realignment_index_bundle - String? realignment_extra_args - Boolean? run_orientation_bias_mixture_model_filter - String? m2_extra_args - String? m2_extra_filtering_args - String? getpileupsummaries_extra_args - String? split_intervals_extra_args - Boolean? make_bamout - Boolean? compress_vcfs - File? gga_vcf - File? gga_vcf_idx - String? gcs_project_for_requester_pays - - # runtime - String gatk_docker - File? gatk_override - String basic_bash_docker = "ubuntu:16.04" - - Int? preemptible - Int? 
max_retries - Int small_task_cpu = 2 - Int small_task_mem = 4 - Int small_task_disk = 100 - Int boot_disk_size = 12 - Int learn_read_orientation_mem = 8000 - Int filter_alignment_artifacts_mem = 9000 - - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? emergency_extra_disk - - # These are multipliers to multipler inputs by to make sure we have enough disk to accommodate for possible output sizes - # Large is for Bams/WGS vcfs - # Small is for metrics/other vcfs - Float large_input_to_output_multiplier = 2.25 - Float small_input_to_output_multiplier = 2.0 - Float cram_to_bam_multiplier = 6.0 + # basic inputs + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_reads + File tumor_reads_index + File? normal_reads + File? normal_reads_index + + # optional but usually recommended resources + File? pon + File? pon_idx + File? gnomad + File? gnomad_idx + File? variants_for_contamination + File? variants_for_contamination_idx + + # extra arguments + String? m2_extra_args + String? m2_extra_filtering_args + String? getpileupsummaries_extra_args + String? split_intervals_extra_args + + # additional modes and outputs + File? realignment_index_bundle + String? realignment_extra_args + Boolean run_orientation_bias_mixture_model_filter = false + Boolean make_bamout = false + Boolean compress_vcfs = false + File? gga_vcf + File? gga_vcf_idx + + + # runtime + String gatk_docker + File? gatk_override + String basic_bash_docker = "ubuntu:16.04" + Int scatter_count + Int preemptible = 2 + Int max_retries = 1 + Int small_task_cpu = 2 + Int small_task_mem = 4 + Int small_task_disk = 100 + Int boot_disk_size = 12 + Int learn_read_orientation_mem = 8000 + Int filter_alignment_artifacts_mem = 9000 + String? 
gcs_project_for_requester_pays + + # Use as a last resort to increase the disk given to every task in case of ill behaving data + Int emergency_extra_disk = 0 } - Int preemptible_or_default = select_first([preemptible, 2]) - Int max_retries_or_default = select_first([max_retries, 2]) - - Boolean compress = select_first([compress_vcfs, false]) - Boolean run_ob_filter = select_first([run_orientation_bias_mixture_model_filter, false]) - Boolean make_bamout_or_default = select_first([make_bamout, false]) - - # Disk sizes used for dynamic sizing Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB")) else 0 Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 - # If no tar is provided, the task downloads one from broads ftp server - Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 - # This is added to every task as padding, should increase if systematically you need more disk for every call - Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0]) + Int disk_pad = 10 + select_first([emergency_extra_disk,0]) # logic about output file names -- these are the names *without* .vcf extensions String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram @@ -149,15 +137,11 @@ workflow Mutect2 { String output_vcf_name = output_basename + ".vcf" - Int tumor_cram_to_bam_disk = ceil(tumor_reads_size * cram_to_bam_multiplier) - Int normal_cram_to_bam_disk = ceil(normal_reads_size * cram_to_bam_multiplier) - Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, - "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu, + 
"max_retries": max_retries, "preemptible": preemptible, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, "disk": small_task_disk + disk_pad, "boot_disk_size": boot_disk_size} - Int tumor_reads_size = ceil(size(tumor_reads, "GB") + size(tumor_reads_index, "GB")) Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 @@ -197,9 +181,9 @@ workflow Mutect2 { getpileupsummaries_extra_args = getpileupsummaries_extra_args, variants_for_contamination = variants_for_contamination, variants_for_contamination_idx = variants_for_contamination_idx, - make_bamout = make_bamout_or_default, - run_ob_filter = run_ob_filter, - compress = compress, + make_bamout = make_bamout, + run_ob_filter = run_orientation_bias_mixture_model_filter, + compress_vcfs = compress_vcfs, gga_vcf = gga_vcf, gga_vcf_idx = gga_vcf_idx, gatk_override = gatk_override, @@ -212,7 +196,7 @@ workflow Mutect2 { Int merged_vcf_size = ceil(size(M2.unfiltered_vcf, "GB")) Int merged_bamout_size = ceil(size(M2.output_bamOut, "GB")) - if (run_ob_filter) { + if (run_orientation_bias_mixture_model_filter) { call LearnReadOrientationModel { input: f1r2_tar_gz = M2.f1r2_counts, @@ -226,11 +210,11 @@ workflow Mutect2 { input_vcfs = M2.unfiltered_vcf, input_vcf_indices = M2.unfiltered_vcf_idx, output_name = unfiltered_name, - compress = compress, + compress_vcfs = compress_vcfs, runtime_params = standard_runtime } - if (make_bamout_or_default) { + if (make_bamout) { call MergeBamOuts { input: ref_fasta = ref_fasta, @@ -239,7 +223,7 @@ workflow Mutect2 { bam_outs = M2.output_bamOut, output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), runtime_params = standard_runtime, - disk_space = ceil(merged_bamout_size * large_input_to_output_multiplier) + disk_pad, + disk_space = ceil(merged_bamout_size * 4) + disk_pad, } } @@ -281,14 +265,14 @@ workflow Mutect2 { unfiltered_vcf = MergeVCFs.merged_vcf, 
unfiltered_vcf_idx = MergeVCFs.merged_vcf_idx, output_name = filtered_name, - compress = compress, + compress_vcfs = compress_vcfs, mutect_stats = MergeStats.merged_stats, contamination_table = CalculateContamination.contamination_table, maf_segments = CalculateContamination.maf_segments, artifact_priors_tar_gz = LearnReadOrientationModel.artifact_prior_table, m2_extra_filtering_args = m2_extra_filtering_args, runtime_params = standard_runtime, - disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad + disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * 4) + disk_pad } if (defined(realignment_index_bundle)) { @@ -301,7 +285,7 @@ workflow Mutect2 { reads_index = tumor_reads_index, realignment_index_bundle = select_first([realignment_index_bundle]), realignment_extra_args = realignment_extra_args, - compress = compress, + compress_vcfs = compress_vcfs, output_name = filtered_name, input_vcf = Filter.filtered_vcf, input_vcf_idx = Filter.filtered_vcf_idx, @@ -311,8 +295,6 @@ workflow Mutect2 { } } - - output { File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, Filter.filtered_vcf]) File filtered_vcf_idx = select_first([FilterAlignmentArtifacts.filtered_vcf_idx, Filter.filtered_vcf_idx]) @@ -387,7 +369,7 @@ task M2 { String? getpileupsummaries_extra_args Boolean? make_bamout Boolean? run_ob_filter - Boolean compress + Boolean compress_vcfs File? gga_vcf File? gga_vcf_idx File? 
variants_for_contamination @@ -407,8 +389,8 @@ task M2 { Boolean use_ssd = false } - String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_vcf = "output" + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" String output_stats = output_vcf + ".stats" @@ -524,12 +506,12 @@ task MergeVCFs { Array[File] input_vcfs Array[File] input_vcf_indices String output_name - Boolean compress + Boolean compress_vcfs Runtime runtime_params } - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" # using MergeVcfs instead of GatherVcfs so we can create indices # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. @@ -741,7 +723,7 @@ task Filter { File unfiltered_vcf File unfiltered_vcf_idx String output_name - Boolean compress + Boolean compress_vcfs File? mutect_stats File? artifact_priors_tar_gz File? contamination_table @@ -752,8 +734,8 @@ task Filter { Int? disk_space } - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" parameter_meta{ ref_fasta: {localization_optional: true} @@ -804,7 +786,7 @@ task FilterAlignmentArtifacts { File reads File reads_index String output_name - Boolean compress + Boolean compress_vcfs File realignment_index_bundle String? realignment_extra_args String? 
gcs_project_for_requester_pays @@ -812,8 +794,8 @@ task FilterAlignmentArtifacts { Int mem } - String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf" - String output_vcf_idx = output_vcf + if compress then ".tbi" else ".idx" + String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" Int machine_mem = mem Int command_mem = machine_mem - 500 diff --git a/scripts/mutect2_wdl/mutect3_training_data.wdl b/scripts/mutect2_wdl/mutect3_training_data.wdl deleted file mode 100644 index 444dfb76da1..00000000000 --- a/scripts/mutect2_wdl/mutect3_training_data.wdl +++ /dev/null @@ -1,415 +0,0 @@ -version 1.0 - -import "https://raw.githubusercontent.com/gatk-workflows/gatk4-somatic-snvs-indels/2.6.0/mutect2.wdl" as m2 - -workflow Mutect3TrainingData { - input { - File? intervals - File? masks - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? pon - File? gnomad - File? variants_for_contamination - String ref_downsample - Boolean? run_orientation_bias_mixture_model_filter - File? realignment_index_bundle - String? realignment_extra_args - String? m2_extra_args - String? m2_extra_filtering_args - String? normal_artifact_extra_args - String? split_intervals_extra_args - File? truth_vcf - File? truth_vcf_idx - Boolean? make_bamout - - # runtime - String gatk_docker - File? gatk_override - Int? preemptible - Int? 
max_retries - } - - String m2_extra_args_with_training_mode = select_first([m2_extra_args, ""]) + " --training-data-mode --training-data-mode-ref-downsample " + ref_downsample - - Runtime small_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, - "max_retries": 2, "preemptible": 0, "cpu": 2, - "machine_mem": 4000, "command_mem": 3500, - "disk": 100, "boot_disk_size": 12} - - # call on the tumor (with normal if present) to get tumor read data and M2 filtering - call m2.Mutect2 as Tumor { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - tumor_reads = tumor_bam, - tumor_reads_index = tumor_bai, - normal_reads = normal_bam, - normal_reads_index = normal_bai, - intervals = intervals, - pon = pon, - gnomad = gnomad, - variants_for_contamination = variants_for_contamination, - run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter, - realignment_index_bundle = realignment_index_bundle, - realignment_extra_args = realignment_extra_args, - preemptible = preemptible, - max_retries = max_retries, - m2_extra_args = m2_extra_args_with_training_mode, - m2_extra_filtering_args = m2_extra_filtering_args, - make_bamout = make_bamout, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - if(defined(truth_vcf)) { - call Concordance { - input: - intervals = intervals, - masks = masks, - truth_vcf = select_first([truth_vcf]), - truth_vcf_idx = select_first([truth_vcf_idx]), - eval_vcf = Tumor.filtered_vcf, - eval_vcf_idx = Tumor.filtered_vcf_idx, - preemptible = preemptible, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - call MakeTableFromConcordance as TumorConcordanceTable { - input: - tpfp = Concordance.tpfp, - tpfp_idx = Concordance.tpfp_idx, - ftnfn = Concordance.ftnfn, - ftnfn_idx = Concordance.ftnfn_idx, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible = preemptible - } - } - - if(!defined(truth_vcf)) { 
- call MakeTableFromMutect2 as TumorTable { - input: - filtered_vcf = Tumor.filtered_vcf, - filtered_vcf_idx = Tumor.filtered_vcf_idx, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible = preemptible - } - } - - # call on the normal, with tumor as "matched normal", to get normal read data and M2 filtering - if(defined(normal_bam)) { - call m2.Mutect2 as Normal { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - tumor_reads = select_first([normal_bam]), - tumor_reads_index = select_first([normal_bai]), - normal_reads = tumor_bam, - normal_reads_index = tumor_bai, - intervals = intervals, - pon = pon, - gnomad = gnomad, - variants_for_contamination = variants_for_contamination, - run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter, - realignment_index_bundle = realignment_index_bundle, - realignment_extra_args = realignment_extra_args, - preemptible = preemptible, - max_retries = max_retries, - m2_extra_args = m2_extra_args_with_training_mode, - m2_extra_filtering_args = m2_extra_filtering_args, - make_bamout = make_bamout, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - # there's no reason to call concordance on the normal because the calls will have no relation to the truth VCF - - call MakeTableFromMutect2 as NormalTable { - input: - filtered_vcf = Normal.filtered_vcf, - filtered_vcf_idx = Normal.filtered_vcf_idx, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible = preemptible - } - - call m2.SplitIntervals as Split { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - split_intervals_extra_args = split_intervals_extra_args, - runtime_params = small_runtime - } - - scatter (subintervals in Split.interval_files ) { - call GetNormalArtifactData { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = 
ref_dict, - tumor_reads = select_first([normal_bam]), - tumor_reads_index = select_first([normal_bai]), - normal_reads = tumor_bam, - normal_reads_index = tumor_bai, - intervals = subintervals, - preemptible = preemptible, - max_retries = max_retries, - extra_args = normal_artifact_extra_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - } - - call MergeNormalArtifactData { - input: - input_tables = GetNormalArtifactData.table, - runtime_params = small_runtime - } - } - - output { - File tumor_table = select_first([TumorConcordanceTable.table, TumorTable.table]) - File? normal_table = NormalTable.table - File? normal_artifact_table = MergeNormalArtifactData.merged_table - } -} - -task Concordance { - input { - File? intervals - File? masks - File truth_vcf - File truth_vcf_idx - File eval_vcf - File eval_vcf_idx - - File? gatk_override - - # runtime - String gatk_docker - Int? preemptible - } - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx2g" Concordance \ - ~{"-L " + intervals} \ - ~{"-XL " + masks} \ - -truth ~{truth_vcf} -eval ~{eval_vcf} \ - -tpfp "tpfp.vcf" \ - -ftnfn "ftnfn.vcf" \ - -summary "summary.txt" - } - - runtime { - memory: "5 GB" - bootDiskSizeGb: 12 - docker: "${gatk_docker}" - disks: "local-disk " + 100 + " HDD" - preemptible: select_first([preemptible, 2]) - } - - output { - File tpfp = "tpfp.vcf" - File tpfp_idx = "tpfp.vcf.idx" - File ftnfn = "ftnfn.vcf" - File ftnfn_idx = "ftnfn.vcf.idx" - File summary = "summary.txt" - } -} - -task MakeTableFromMutect2 { - input { - File filtered_vcf - File filtered_vcf_idx - - File? gatk_override - String gatk_docker - Int? 
preemptible - } - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx2g" SelectVariants -V ~{filtered_vcf} --restrict-alleles-to BIALLELIC -O biallelic.vcf - gatk --java-options "-Xmx2g" VariantsToTable -V biallelic.vcf \ - -F CHROM -F POS -F REF -F ALT -F POPAF -F TLOD -F STATUS -F REF_BASES -F HEC -F HAPDOM -F HAPCOMP -GF DP -F FILTER -GF FRS \ - --show-filtered \ - -O output.table - } - - runtime { - memory: "5 GB" - bootDiskSizeGb: 12 - docker: "${gatk_docker}" - disks: "local-disk " + 100 + " HDD" - preemptible: select_first([preemptible, 2]) - } - - output { - File table = "output.table" - } -} - -task MakeTableFromConcordance { - input { - File tpfp - File tpfp_idx - File ftnfn - File ftnfn_idx - - File? gatk_override - String gatk_docker - Int? preemptible - } - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - for file in ~{tpfp} ~{ftnfn}; do - gatk --java-options "-Xmx2g" SelectVariants -V $file --restrict-alleles-to BIALLELIC -O biallelic.vcf - gatk --java-options "-Xmx2g" VariantsToTable -V biallelic.vcf \ - -F CHROM -F POS -F REF -F ALT -F POPAF -F TLOD -F STATUS -F REF_BASES -F HEC -F HAPDOM -F HAPCOMP -GF DP -F FILTER -GF FRS \ - --show-filtered \ - -O tmp.table - - # if it's the first table, copy it to the output; otherwise copy all but the header line - if [ ! -f output.table ]; then - mv tmp.table output.table - else - tail -n +2 tmp.table >> output.table - fi - done - } - - runtime { - memory: "5 GB" - bootDiskSizeGb: 12 - docker: "${gatk_docker}" - disks: "local-disk " + 100 + " HDD" - preemptible: select_first([preemptible, 2]) - } - - output { - File table = "output.table" - } -} - -task GetNormalArtifactData { - input { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - String? extra_args - - File? gatk_override - String? 
gcs_project_for_requester_pays - - # runtime - String gatk_docker - Int? mem - Int? preemptible - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false - } - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - parameter_meta{ - intervals: {localization_optional: true} - ref_fasta: {localization_optional: true} - ref_fai: {localization_optional: true} - ref_dict: {localization_optional: true} - tumor_reads: {localization_optional: true} - tumor_reads_index: {localization_optional: true} - normal_reads: {localization_optional: true} - normal_reads_index: {localization_optional: true} - } - - command <<< - set -e - - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - - if [[ ! -z "~{normal_reads}" ]]; then - gatk --java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{normal_reads} -O normal_name.txt -encode \ - ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} - normal_sample="`cat normal_name.txt`" - fi - - gatk --java-options "-Xmx~{command_mem}m" GetNormalArtifactData \ - -R ~{ref_fasta} ~{"-L " + intervals} -I ~{tumor_reads} -I ~{normal_reads} -O normal_artifact.table \ - -normal $normal_sample \ - ~{extra_args} ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} - >>> - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible, 10]) - maxRetries: select_first([max_retries, 0]) - cpu: select_first([cpu, 1]) - } - - output { - File table = "normal_artifact.table" - } -} - -task MergeNormalArtifactData { - input { - Array[File] input_tables - Runtime runtime_params - } - - command { - set -e - export GATK_LOCAL_JAR=~{default="/root/gatk.jar" runtime_params.gatk_override} - - gatk 
--java-options "-Xmx~{runtime_params.command_mem}m" GatherNormalArtifactData \ - -I ~{sep=' -I ' input_tables} \ - -O normal_artifact.table - } - - runtime { - docker: runtime_params.gatk_docker - bootDiskSizeGb: runtime_params.boot_disk_size - memory: runtime_params.machine_mem + " MB" - disks: "local-disk " + runtime_params.disk + " HDD" - preemptible: runtime_params.preemptible - maxRetries: runtime_params.max_retries - cpu: runtime_params.cpu - } - - output { - File merged_table = "normal_artifact.table" - } -} \ No newline at end of file From 8c18d872c8afabee4376f7db7a1258a7a6850c03 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 4 Aug 2022 23:03:13 -0400 Subject: [PATCH 04/10] simplifed file names --- scripts/mutect2_wdl/mutect2.wdl | 64 +++++++++++++-------------------- 1 file changed, 24 insertions(+), 40 deletions(-) diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index ddf64bf8bf1..72cd390aa3f 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -129,14 +129,7 @@ workflow Mutect2 { # This is added to every task as padding, should increase if systematically you need more disk for every call Int disk_pad = 10 + select_first([emergency_extra_disk,0]) - - # logic about output file names -- these are the names *without* .vcf extensions - String output_basename = basename(basename(tumor_reads, ".bam"),".cram") #hacky way to strip either .bam or .cram - String unfiltered_name = output_basename + "-unfiltered" - String filtered_name = output_basename + "-filtered" - - String output_vcf_name = output_basename + ".vcf" - + Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, "max_retries": max_retries, "preemptible": preemptible, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, @@ -209,7 +202,6 @@ workflow Mutect2 { input: input_vcfs = M2.unfiltered_vcf, input_vcf_indices = M2.unfiltered_vcf_idx, - 
output_name = unfiltered_name, compress_vcfs = compress_vcfs, runtime_params = standard_runtime } @@ -221,7 +213,6 @@ workflow Mutect2 { ref_fai = ref_fai, ref_dict = ref_dict, bam_outs = M2.output_bamOut, - output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), runtime_params = standard_runtime, disk_space = ceil(merged_bamout_size * 4) + disk_pad, } @@ -233,7 +224,7 @@ workflow Mutect2 { call MergePileupSummaries as MergeTumorPileups { input: input_tables = flatten(M2.tumor_pileups), - output_name = output_basename, + output_name = "tumor-pileups", ref_dict = ref_dict, runtime_params = standard_runtime } @@ -242,7 +233,7 @@ workflow Mutect2 { call MergePileupSummaries as MergeNormalPileups { input: input_tables = flatten(M2.normal_pileups), - output_name = output_basename, + output_name = "normal-pileups", ref_dict = ref_dict, runtime_params = standard_runtime } @@ -264,7 +255,6 @@ workflow Mutect2 { intervals = intervals, unfiltered_vcf = MergeVCFs.merged_vcf, unfiltered_vcf_idx = MergeVCFs.merged_vcf_idx, - output_name = filtered_name, compress_vcfs = compress_vcfs, mutect_stats = MergeStats.merged_stats, contamination_table = CalculateContamination.contamination_table, @@ -286,7 +276,6 @@ workflow Mutect2 { realignment_index_bundle = select_first([realignment_index_bundle]), realignment_extra_args = realignment_extra_args, compress_vcfs = compress_vcfs, - output_name = filtered_name, input_vcf = Filter.filtered_vcf, input_vcf_idx = Filter.filtered_vcf_idx, runtime_params = standard_runtime, @@ -505,12 +494,11 @@ task MergeVCFs { input { Array[File] input_vcfs Array[File] input_vcf_indices - String output_name Boolean compress_vcfs Runtime runtime_params } - String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf = if compress_vcfs then "merged.vcf.gz" else "merged.vcf" String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" # using MergeVcfs instead of GatherVcfs so we can create indices @@ 
-543,7 +531,6 @@ task MergeBamOuts { File ref_fai File ref_dict Array[File]+ bam_outs - String output_vcf_name Runtime runtime_params Int? disk_space #override to request more disk than default small task params } @@ -560,9 +547,8 @@ task MergeBamOuts { # overlapping bamouts gatk --java-options "-Xmx~{runtime_params.command_mem}m" SortSam -I unsorted.out.bam \ - -O ~{output_vcf_name}.out.bam \ - --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT - gatk --java-options "-Xmx~{runtime_params.command_mem}m" BuildBamIndex -I ~{output_vcf_name}.out.bam -VALIDATION_STRINGENCY LENIENT + -O bamout.bam --SORT_ORDER coordinate -VALIDATION_STRINGENCY LENIENT + gatk --java-options "-Xmx~{runtime_params.command_mem}m" BuildBamIndex -I bamout.bam -VALIDATION_STRINGENCY LENIENT >>> runtime { @@ -576,8 +562,8 @@ task MergeBamOuts { } output { - File merged_bam_out = "~{output_vcf_name}.out.bam" - File merged_bam_out_index = "~{output_vcf_name}.out.bai" + File merged_bam_out = "bamout.bam" + File merged_bam_out_index = "bamout.bai" } } @@ -716,25 +702,24 @@ task CalculateContamination { task Filter { input { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File unfiltered_vcf - File unfiltered_vcf_idx - String output_name - Boolean compress_vcfs - File? mutect_stats - File? artifact_priors_tar_gz - File? contamination_table - File? maf_segments - String? m2_extra_filtering_args + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File unfiltered_vcf + File unfiltered_vcf_idx + Boolean compress_vcfs + File? mutect_stats + File? artifact_priors_tar_gz + File? contamination_table + File? maf_segments + String? m2_extra_filtering_args - Runtime runtime_params - Int? disk_space + Runtime runtime_params + Int? 
disk_space } - String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf = if compress_vcfs then "filtered.vcf.gz" else "filtered.vcf" String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" parameter_meta{ @@ -785,7 +770,6 @@ task FilterAlignmentArtifacts { File input_vcf_idx File reads File reads_index - String output_name Boolean compress_vcfs File realignment_index_bundle String? realignment_extra_args @@ -794,7 +778,7 @@ task FilterAlignmentArtifacts { Int mem } - String output_vcf = output_name + if compress_vcfs then ".vcf.gz" else ".vcf" + String output_vcf = if compress_vcfs then "filtered.vcf.gz" else "filtered.vcf" String output_vcf_idx = output_vcf + if compress_vcfs then ".tbi" else ".idx" Int machine_mem = mem From 1eb76184f4e6ac26e7d4cd607ff0edea48004d9c Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Fri, 5 Aug 2022 02:21:58 -0400 Subject: [PATCH 05/10] M# dataset options in M2 WDL --- scripts/mutect2_wdl/mutect2.wdl | 153 ++++++++++++++++++++++---------- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 72cd390aa3f..2594cf389e6 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -100,6 +100,10 @@ workflow Mutect2 { Boolean compress_vcfs = false File? gga_vcf File? gga_vcf_idx + Boolean make_m3_training_dataset = false + Boolean make_m3_test_dataset = false + File? m3_training_dataset_truth_vcf + File? 
m3_training_dataset_truth_vcf_idx # runtime @@ -129,7 +133,7 @@ workflow Mutect2 { # This is added to every task as padding, should increase if systematically you need more disk for every call Int disk_pad = 10 + select_first([emergency_extra_disk,0]) - + Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, "max_retries": max_retries, "preemptible": preemptible, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, @@ -179,6 +183,10 @@ workflow Mutect2 { compress_vcfs = compress_vcfs, gga_vcf = gga_vcf, gga_vcf_idx = gga_vcf_idx, + make_m3_training_dataset = make_m3_training_dataset, + make_m3_test_dataset = make_m3_test_dataset, + m3_training_dataset_truth_vcf = m3_training_dataset_truth_vcf, + m3_training_dataset_truth_vcf_idx = m3_training_dataset_truth_vcf_idx, gatk_override = gatk_override, gatk_docker = gatk_docker, disk_space = m2_per_scatter_size, @@ -247,6 +255,14 @@ workflow Mutect2 { } } + if (make_m3_training_dataset || make_m3_test_dataset) { + call Concatenate { + input: + input_files = M2.m3_dataset, + gatk_docker = gatk_docker + } + } + call Filter { input: ref_fasta = ref_fasta, @@ -295,6 +311,7 @@ workflow Mutect2 { File? bamout_index = MergeBamOuts.merged_bam_out_index File? maf_segments = CalculateContamination.maf_segments File? read_orientation_model_params = LearnReadOrientationModel.artifact_prior_table + File? m3_dataset = Concatenate.concatenated } } @@ -342,40 +359,45 @@ task SplitIntervals { task M2 { input { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_reads - File tumor_reads_index - File? normal_reads - File? normal_reads_index - File? pon - File? pon_idx - File? gnomad - File? gnomad_idx - String? m2_extra_args - String? getpileupsummaries_extra_args - Boolean? make_bamout - Boolean? run_ob_filter - Boolean compress_vcfs - File? gga_vcf - File? gga_vcf_idx - File? variants_for_contamination - File? 
variants_for_contamination_idx + File? intervals + File ref_fasta + File ref_fai + File ref_dict + File tumor_reads + File tumor_reads_index + File? normal_reads + File? normal_reads_index + File? pon + File? pon_idx + File? gnomad + File? gnomad_idx + String? m2_extra_args + String? getpileupsummaries_extra_args + Boolean? make_bamout + Boolean? run_ob_filter + Boolean compress_vcfs + File? gga_vcf + File? gga_vcf_idx + File? variants_for_contamination + File? variants_for_contamination_idx - File? gatk_override + File? gatk_override - String? gcs_project_for_requester_pays + String? gcs_project_for_requester_pays - # runtime - String gatk_docker - Int? mem - Int? preemptible - Int? max_retries - Int? disk_space - Int? cpu - Boolean use_ssd = false + Boolean make_m3_training_dataset = false + Boolean make_m3_test_dataset = false + File? m3_training_dataset_truth_vcf + File? m3_training_dataset_truth_vcf_idx + + # runtime + String gatk_docker + Int? mem + Int? preemptible + Int? max_retries + Int? disk_space + Int? 
cpu + Boolean use_ssd = false } String output_vcf = "output" + if compress_vcfs then ".vcf.gz" else ".vcf" @@ -388,22 +410,24 @@ task M2 { Int command_mem = machine_mem - 500 parameter_meta{ - intervals: {localization_optional: true} - ref_fasta: {localization_optional: true} - ref_fai: {localization_optional: true} - ref_dict: {localization_optional: true} - tumor_reads: {localization_optional: true} - tumor_reads_index: {localization_optional: true} - normal_reads: {localization_optional: true} - normal_reads_index: {localization_optional: true} - pon: {localization_optional: true} - pon_idx: {localization_optional: true} - gnomad: {localization_optional: true} - gnomad_idx: {localization_optional: true} - gga_vcf: {localization_optional: true} - gga_vcf_idx: {localization_optional: true} - variants_for_contamination: {localization_optional: true} - variants_for_contamination_idx: {localization_optional: true} + intervals: {localization_optional: true} + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + ref_dict: {localization_optional: true} + tumor_reads: {localization_optional: true} + tumor_reads_index: {localization_optional: true} + normal_reads: {localization_optional: true} + normal_reads_index: {localization_optional: true} + pon: {localization_optional: true} + pon_idx: {localization_optional: true} + gnomad: {localization_optional: true} + gnomad_idx: {localization_optional: true} + gga_vcf: {localization_optional: true} + gga_vcf_idx: {localization_optional: true} + variants_for_contamination: {localization_optional: true} + variants_for_contamination_idx: {localization_optional: true} + m3_training_dataset_truth_vcf: {localization_optional: true} + m3_training_dataset_truth_vcf_idx: {localization_optional: true} } command <<< @@ -414,6 +438,7 @@ task M2 { # We need to create these files regardless, even if they stay empty touch bamout.bam touch f1r2.tar.gz + touch dataset.txt echo "" > normal_name.txt gatk 
--java-options "-Xmx~{command_mem}m" GetSampleName -R ~{ref_fasta} -I ~{tumor_reads} -O tumor_name.txt -encode \ @@ -437,6 +462,9 @@ task M2 { -O "~{output_vcf}" \ ~{true='--bam-output bamout.bam' false='' make_bamout} \ ~{true='--f1r2-tar-gz f1r2.tar.gz' false='' run_ob_filter} \ + ~{true='--mutect3-dataset dataset.txt' false='' make_m3_test_dataset} \ + ~{true='--mutect3-dataset dataset.txt --mutect3-training-mode' false='' make_m3_training_dataset} \ + ~{"--mutect3-training-truth " + m3_training_dataset_truth_vcf} \ ~{m2_extra_args} \ ~{"--gcs-project-for-requester-pays " + gcs_project_for_requester_pays} @@ -487,6 +515,7 @@ task M2 { File f1r2_counts = "f1r2.tar.gz" Array[File] tumor_pileups = glob("*tumor-pileups.table") Array[File] normal_pileups = glob("*normal-pileups.table") + File m3_dataset = "dataset.txt" } } @@ -825,3 +854,31 @@ task FilterAlignmentArtifacts { } } +task Concatenate { + input { + Array[File] input_files + Int? mem + String gatk_docker + } + + Int machine_mem = if defined(mem) then mem * 1000 else 7000 + + command { + cat ~{sep=' ' input_files} > output.txt + } + + runtime { + docker: gatk_docker + bootDiskSizeGb: 12 + memory: machine_mem + " MB" + disks: "local-disk 100 HDD" + preemptible: 1 + maxRetries: 1 + cpu: 2 + } + + output { + File concatenated = "output.txt" + } +} + From 121bb700b99f513e8e5eaa19ef72360c198c57c2 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Tue, 16 Aug 2022 19:15:19 -0400 Subject: [PATCH 06/10] fix a very rare bug in Mutect3DatasetEngine --- .../tools/walkers/annotator/AssemblyComplexity.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java index bd1226975d4..c3220d1a865 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java @@ -117,7 +117,10 @@ public static Triple annotate(final VariantContext vc, .filter(hap -> containsAltAllele(hap.getEventMap(), vc, altAlleleIndex)) .mapToInt(hap -> haplotypeSupportCounts.get(hap).intValue()) .toArray(); - return MathUtils.arrayMax(counts) / (double) MathUtils.sum(counts); + // a very rare edge case occurs when no haplotypes containing the allele exist with non-zero read support. + // If this occurs, we set the dominance to 1 / the number of haplotypes. + final int maxCount = MathUtils.arrayMax(counts); + return maxCount == 0 ? (1 / (double) haplotypesByDescendingSupport.size()) : maxCount / (double) MathUtils.sum(counts); }).toArray(); return Triple.of(equivalenceCounts, editDistances, haplotypeDominance); From 724bb29d3ac2cbef8430cb29b8baa42d24f07a47 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 17 Aug 2022 16:18:02 -0400 Subject: [PATCH 07/10] tidying PON WDL --- scripts/m2_cromwell_tests/run_m2_wdl.sh | 2 +- scripts/mutect2_wdl/mutect2_pon.wdl | 22 +++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/scripts/m2_cromwell_tests/run_m2_wdl.sh b/scripts/m2_cromwell_tests/run_m2_wdl.sh index b676f58b463..9e11623aaaf 100644 --- a/scripts/m2_cromwell_tests/run_m2_wdl.sh +++ b/scripts/m2_cromwell_tests/run_m2_wdl.sh @@ -37,7 +37,7 @@ echo "Putting the newly built docker image into the json parameters" cd $WORKING_DIR/gatk/scripts/ sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_m2_wdl.json >$WORKING_DIR/test_m2_wdl_mod.json echo "JSON FILE (modified) =======" -cat $WORKING_DIR/test_m2_wdl_multi_mod.json +cat $WORKING_DIR/test_m2_wdl_mod.json sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" m2_cromwell_tests/test_mitochondria_m2_wdl.json >$WORKING_DIR/test_mitochondria_m2_wdl_mod.json echo "JSON FILE (modified) =======" cat 
$WORKING_DIR/test_mitochondria_m2_wdl_mod.json diff --git a/scripts/mutect2_wdl/mutect2_pon.wdl b/scripts/mutect2_wdl/mutect2_pon.wdl index 0da400ca584..b9d5956bee7 100644 --- a/scripts/mutect2_wdl/mutect2_pon.wdl +++ b/scripts/mutect2_wdl/mutect2_pon.wdl @@ -25,11 +25,11 @@ workflow Mutect2_Panel { File gnomad_idx String? m2_extra_args String? create_pon_extra_args - Boolean? compress + Boolean compress = false String pon_name - Int? min_contig_size - Int? create_panel_scatter_count + Int min_contig_size = 1000000 + Int create_panel_scatter_count = 24 String? gcs_project_for_requester_pays @@ -38,8 +38,8 @@ workflow Mutect2_Panel { File? gatk_override String basic_bash_docker = "ubuntu:16.04" - Int? preemptible - Int? max_retries + Int preemptible = 2 + Int max_retries = 2 Int small_task_cpu = 2 Int small_task_mem = 4 Int small_task_disk = 100 @@ -49,12 +49,8 @@ workflow Mutect2_Panel { Int? emergency_extra_disk } - Int contig_size = select_first([min_contig_size, 1000000]) - Int preemptible_or_default = select_first([preemptible, 2]) - Int max_retries_or_default = select_first([max_retries, 2]) - Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, - "max_retries": max_retries_or_default, "preemptible": preemptible_or_default, "cpu": small_task_cpu, + "max_retries": max_retries, "preemptible": preemptible, "cpu": small_task_cpu, "machine_mem": small_task_mem * 1000, "command_mem": small_task_mem * 1000 - 500, "disk": small_task_disk, "boot_disk_size": boot_disk_size} @@ -82,8 +78,8 @@ workflow Mutect2_Panel { ref_fasta = ref_fasta, ref_fai = ref_fai, ref_dict = ref_dict, - scatter_count = select_first([create_panel_scatter_count, 24]), - split_intervals_extra_args = "--dont-mix-contigs --min-contig-size " + contig_size, + scatter_count = create_panel_scatter_count, + split_intervals_extra_args = "--dont-mix-contigs --min-contig-size " + min_contig_size, runtime_params = standard_runtime } @@ -108,7 +104,7 @@ workflow 
Mutect2_Panel { input_vcfs = CreatePanel.output_vcf, input_vcf_indices = CreatePanel.output_vcf_index, output_name = pon_name, - compress = select_first([compress, false]), + compress = compress, runtime_params = standard_runtime } From 8267742927fa97b792b9283a135dce247314ad07 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 17 Aug 2022 16:43:50 -0400 Subject: [PATCH 08/10] few little womtool error fixes --- scripts/mutect2_wdl/mutect2_pon.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/mutect2_wdl/mutect2_pon.wdl b/scripts/mutect2_wdl/mutect2_pon.wdl index b9d5956bee7..46e41e720ed 100644 --- a/scripts/mutect2_wdl/mutect2_pon.wdl +++ b/scripts/mutect2_wdl/mutect2_pon.wdl @@ -103,8 +103,7 @@ workflow Mutect2_Panel { input: input_vcfs = CreatePanel.output_vcf, input_vcf_indices = CreatePanel.output_vcf_index, - output_name = pon_name, - compress = compress, + compress_vcfs = compress, runtime_params = standard_runtime } From 14e25048992044252939985539180c8f2e43a8fb Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 17 Aug 2022 17:02:09 -0400 Subject: [PATCH 09/10] ditto --- scripts/mutect2_wdl/mutect2.wdl | 2 +- scripts/mutect2_wdl/mutect2_pon.wdl | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index 2594cf389e6..9ff90cedef8 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -132,7 +132,7 @@ workflow Mutect2 { Int normal_reads_size = if defined(normal_reads) then ceil(size(normal_reads, "GB") + size(normal_reads_index, "GB")) else 0 # This is added to every task as padding, should increase if systematically you need more disk for every call - Int disk_pad = 10 + select_first([emergency_extra_disk,0]) + Int disk_pad = 10 + emergency_extra_disk Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, "max_retries": max_retries, "preemptible": preemptible, "cpu": 
small_task_cpu, diff --git a/scripts/mutect2_wdl/mutect2_pon.wdl b/scripts/mutect2_wdl/mutect2_pon.wdl index 46e41e720ed..9b750a8c80f 100644 --- a/scripts/mutect2_wdl/mutect2_pon.wdl +++ b/scripts/mutect2_wdl/mutect2_pon.wdl @@ -44,9 +44,6 @@ workflow Mutect2_Panel { Int small_task_mem = 4 Int small_task_disk = 100 Int boot_disk_size = 12 - - # Use as a last resort to increase the disk given to every task in case of ill behaving data - Int? emergency_extra_disk } Runtime standard_runtime = {"gatk_docker": gatk_docker, "gatk_override": gatk_override, From efd6394d829b6740d3dc5df0dc0483662ad753d9 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Wed, 17 Aug 2022 17:25:11 -0400 Subject: [PATCH 10/10] whoops, more fixing json --- scripts/m2_cromwell_tests/mutect2.inputs.json | 4 ---- scripts/m2_cromwell_tests/test_m2_wdl.json | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/m2_cromwell_tests/mutect2.inputs.json b/scripts/m2_cromwell_tests/mutect2.inputs.json index 4c7a08074eb..e67ed3c7356 100644 --- a/scripts/m2_cromwell_tests/mutect2.inputs.json +++ b/scripts/m2_cromwell_tests/mutect2.inputs.json @@ -4,10 +4,6 @@ "Mutect2.intervals": "gs://gatk-best-practices/somatic-b37/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.baits.interval_list", "Mutect2.scatter_count": 50, "Mutect2.m2_extra_args": "--downsampling-stride 20 --max-reads-per-alignment-start 6 --max-suspicious-reads-per-alignment-start 6", - "Mutect2.filter_funcotations": "True", - "Mutect2.funco_reference_version": "hg19", - "Mutect2.funco_data_sources_tar_gz": "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.6.20190124s.tar.gz", - "Mutect2.funco_transcript_selection_list": "gs://broad-public-datasets/funcotator/transcriptList.exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt", "Mutect2.ref_fasta": "gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.fasta", "Mutect2.ref_dict": 
"gs://gatk-best-practices/somatic-b37/Homo_sapiens_assembly19.dict", diff --git a/scripts/m2_cromwell_tests/test_m2_wdl.json b/scripts/m2_cromwell_tests/test_m2_wdl.json index e8f85f4bbdc..259bb43ae45 100644 --- a/scripts/m2_cromwell_tests/test_m2_wdl.json +++ b/scripts/m2_cromwell_tests/test_m2_wdl.json @@ -10,8 +10,7 @@ "Mutect2.normal_reads_index": "/home/runner/work/gatk/gatk/src/test/resources/large/mutect/dream_synthetic_bams/normal_1.bam.bai", "Mutect2.scatter_count": 2, "Mutect2.run_orientation_bias_mixture_model_filter": true, - "Mutect2.run_funcotator": true, - "Mutect2.preemptible_attempts": 2, + "Mutect2.preemptible": 2, "Mutect2.compress_vcfs": false, "Mutect2.make_bamout": true } \ No newline at end of file