-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes needed for scaling up and running in Terra #1
Changes from 64 commits
14787c3
df817ed
2747ede
c7fe811
6c3284f
9a1f397
1ae8fdb
bd917fc
bb044e0
e2c8ffc
4123788
6064433
21ee544
1f04a73
7856d35
4b8493c
3b2e931
97208f8
73d2ce6
dce810c
2783557
919917b
41b7457
c89afd7
e3534d4
6f01587
c028c44
02d05f1
4e06990
2c60b6a
68f7c55
0221889
d16f30b
1bffd60
36af337
68ca3cb
ca20960
6cdd5e3
260c716
bd56331
656f322
7138c39
fb41381
0e421ab
beb8d38
c9e03fd
5e624f2
fc6ea5b
d65996e
0598f85
e57985d
35907bc
cd86f4c
b44c1e8
dd23a36
54627e0
75e752a
f6439a0
063aa7d
a1fba0d
15577f9
db3dc8d
6baf3ca
ccbc049
ee0dade
70498d9
24d25eb
b61e2d0
0a7dea9
0437928
f5ab3d0
9c7e85e
fd8790a
d72ab3b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,8 +12,12 @@ workflow JointGenotyping { | |
File unpadded_intervals_file | ||
|
||
String callset_name | ||
#TODO: make sample_name_map from the gvcf_paths? | ||
File sample_name_map | ||
|
||
File gvcf_paths_fofn | ||
File gvcf_path_indexes_fofn | ||
|
||
File ref_fasta | ||
File ref_fasta_index | ||
File ref_dict | ||
|
@@ -63,7 +67,8 @@ workflow JointGenotyping { | |
Boolean use_gnarly_genotyper = false | ||
Boolean use_allele_specific_annotations = true | ||
Boolean cross_check_fingerprints = true | ||
Boolean scatter_cross_check_fingerprints = false | ||
# If cross check fingerprints should be scattered, how many gvcfs per shard? Typically set to 1000. | ||
Int? cross_check_fingerprint_scatter_partition | ||
} | ||
|
||
Boolean allele_specific_annotations = !use_gnarly_genotyper && use_allele_specific_annotations | ||
|
@@ -73,8 +78,8 @@ workflow JointGenotyping { | |
|
||
Array[Array[String]] sample_name_map_lines_t = transpose(sample_name_map_lines) | ||
Array[String] sample_names_from_map = sample_name_map_lines_t[0] | ||
Array[File] gvcf_paths_from_map = sample_name_map_lines_t[1] | ||
Array[File] gvcf_index_paths_from_map = sample_name_map_lines_t[2] | ||
#Array[File] gvcf_paths_from_map = sample_name_map_lines_t[1] | ||
#Array[File] gvcf_index_paths_from_map = sample_name_map_lines_t[2] | ||
|
||
# Make a 2.5:1 interval number to samples in callset ratio interval list. | ||
# We allow overriding the behavior by specifying the desired number of vcfs | ||
|
@@ -92,8 +97,8 @@ workflow JointGenotyping { | |
|
||
#call Tasks.CheckSamplesUnique { | ||
# input: | ||
# sample_name_map = sample_name_map, | ||
# sample_num_threshold = 10 | ||
# sample_name_map = sample_name_map_for_fingerprinting, | ||
# sample_num_threshold = 1 | ||
#} | ||
|
||
call Tasks.SplitIntervalList { | ||
|
@@ -107,6 +112,27 @@ workflow JointGenotyping { | |
sample_names_unique_done = true | ||
} | ||
|
||
call Tasks.SplitFofn as SplitGvcfFofn { | ||
input: | ||
largeFofn = gvcf_paths_fofn | ||
} | ||
|
||
call Tasks.SplitFofn as SplitGvcfIndexFofn { | ||
input: | ||
largeFofn = gvcf_path_indexes_fofn | ||
} | ||
|
||
scatter (i in range(length(SplitGvcfFofn.tiny_fofns))) { | ||
Array[File] gvcf_path_arrays = read_lines(SplitGvcfFofn.tiny_fofns[i]) | ||
Array[File] gvcf_index_path_arrays = read_lines(SplitGvcfIndexFofn.tiny_fofns[i]) | ||
} | ||
|
||
Array[File] gvcf_paths = flatten(gvcf_path_arrays) | ||
Array[File] gvcf_path_indexes = flatten(gvcf_index_path_arrays) | ||
|
||
File header_vcf = gvcf_paths[0] | ||
File header_vcf_index = gvcf_path_indexes[0] | ||
|
||
Array[File] unpadded_intervals = SplitIntervalList.output_intervals | ||
|
||
scatter (idx in range(length(unpadded_intervals))) { | ||
|
@@ -117,9 +143,10 @@ workflow JointGenotyping { | |
# the Hellbender (GATK engine) team! | ||
call Tasks.ImportGVCFs { | ||
input: | ||
sample_names = sample_names_from_map, | ||
gvcf_files = gvcf_paths_from_map, | ||
gvcf_index_files = gvcf_index_paths_from_map, | ||
sample_name_map = sample_name_map, | ||
# need to provide an example header in order to stream from azure, so use the first gvcf | ||
header_vcf = header_vcf, | ||
header_vcf_index = header_vcf_index, | ||
interval = unpadded_intervals[idx], | ||
ref_fasta = ref_fasta, | ||
ref_fasta_index = ref_fasta_index, | ||
|
@@ -153,15 +180,13 @@ workflow JointGenotyping { | |
ref_fasta = ref_fasta, | ||
ref_fasta_index = ref_fasta_index, | ||
ref_dict = ref_dict, | ||
dbsnp_vcf = dbsnp_vcf, | ||
dbsnp_vcf = dbsnp_vcf | ||
} | ||
} | ||
|
||
Array[File] gnarly_gvcfs = GnarlyGenotyper.output_vcf | ||
|
||
call Tasks.GatherVcfs as TotallyRadicalGatherVcfs { | ||
input: | ||
input_vcfs = gnarly_gvcfs, | ||
input_vcf_fofn = write_lines(GnarlyGenotyper.output_vcf), | ||
output_vcf_name = callset_name + "." + idx + ".gnarly.vcf.gz", | ||
disk_size = large_disk | ||
} | ||
|
@@ -196,9 +221,10 @@ workflow JointGenotyping { | |
} | ||
} | ||
|
||
#TODO: I suspect having write_lines in the input here is breaking call caching | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does call caching work now on Azure? Does it work the same as in GCP (conceptually?) in terms of how it establishes identity? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It does! It works the same as GCP as far as I can tell. The suspicion in this TODO is because the write_lines makes a new temp file each time, but I didn't investigate further. Could definitely be something else that caused call caching to break on this task one time which I happened to notice. Also at the moment call caching only works with dockerhub (rather than Azure Container Registry), but it's mostly been consistent and working for me. |
||
call Tasks.GatherVcfs as SitesOnlyGatherVcf { | ||
input: | ||
input_vcfs = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, | ||
input_vcf_fofn = write_lines(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf), | ||
output_vcf_name = callset_name + ".sites_only.vcf.gz", | ||
disk_size = medium_disk | ||
} | ||
|
@@ -336,9 +362,10 @@ workflow JointGenotyping { | |
# For small callsets we can gather the VCF shards and then collect metrics on it. | ||
# HUGE disk was failing in Azure... | ||
if (is_small_callset) { | ||
|
||
call Tasks.GatherVcfs as FinalGatherVcf { | ||
input: | ||
input_vcfs = ApplyRecalibration.recalibrated_vcf, | ||
input_vcf_fofn = write_lines(ApplyRecalibration.recalibrated_vcf), | ||
output_vcf_name = callset_name + ".vcf.gz", | ||
disk_size = large_disk | ||
} | ||
|
@@ -369,7 +396,7 @@ workflow JointGenotyping { | |
|
||
# CrossCheckFingerprints takes forever on large callsets. | ||
# We scatter over the input GVCFs to make things faster. | ||
if (scatter_cross_check_fingerprints) { | ||
if (defined(cross_check_fingerprint_scatter_partition)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm really curious about (a) what crosscheck fingerprints is doing for us here (are you comparing to external truth? Do you expect no samples to match? are samples on > 1 lane?) and then also (b) the reasoning for the scaling approach here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I answered this in Slack too, but the check here is to make sure that joint genotyping itself didn't swap samples, so we're checking the input gvcfs to the output vcf. This might be overkill, especially since we need to scatter it to work on a large number of samples. Might be better to spot check some random samples rather than truly check that every single sample wasn't swapped by our pipeline. |
||
call Tasks.GetFingerprintingIntervalIndices { | ||
input: | ||
unpadded_intervals = unpadded_intervals, | ||
|
@@ -384,37 +411,41 @@ workflow JointGenotyping { | |
|
||
call Tasks.GatherVcfs as GatherFingerprintingVcfs { | ||
input: | ||
input_vcfs = vcfs_to_fingerprint, | ||
input_vcf_fofn = write_lines(vcfs_to_fingerprint), | ||
output_vcf_name = callset_name + ".gathered.fingerprinting.vcf.gz", | ||
disk_size = medium_disk | ||
} | ||
|
||
call Tasks.SelectFingerprintSiteVariants { | ||
input: | ||
input_vcf = GatherFingerprintingVcfs.output_vcf, | ||
input_vcf_index = GatherFingerprintingVcfs.output_vcf_index, | ||
base_output_name = callset_name + ".fingerprinting", | ||
haplotype_database = haplotype_database, | ||
disk_size = medium_disk | ||
} | ||
|
||
call Tasks.PartitionSampleNameMap { | ||
input: | ||
sample_name_map = sample_name_map, | ||
line_limit = 1000 | ||
} | ||
|
||
scatter (idx in range(length(PartitionSampleNameMap.partitions))) { | ||
# Get partitions by partition number of gvcfs, including any remainder in the last partition | ||
# Subsetting happens in the CrossCheckFingerprints task | ||
Array[Int] partitions = range((num_gvcfs+cross_check_fingerprint_scatter_partition)/cross_check_fingerprint_scatter_partition) | ||
|
||
Array[File] files_in_partition = read_lines(PartitionSampleNameMap.partitions[idx]) | ||
scatter (idx in range(length(partitions))) { | ||
Int parition_scaled = (partitions[idx] + 1) * cross_check_fingerprint_scatter_partition | ||
|
||
call Tasks.CrossCheckFingerprint as CrossCheckFingerprintsScattered { | ||
input: | ||
gvcf_paths = files_in_partition, | ||
vcf_paths = vcfs_to_fingerprint, | ||
sample_name_map = sample_name_map, | ||
gvcf_paths_fofn = write_lines(gvcf_paths), | ||
gvcf_index_paths_fofn = write_lines(gvcf_path_indexes), | ||
vcf_paths_fofn = write_lines([SelectFingerprintSiteVariants.output_vcf]), | ||
vcf_index_paths_fofn = write_lines([SelectFingerprintSiteVariants.output_vcf_index]), | ||
sample_names_from_map_fofn = write_lines(sample_names_from_map), | ||
partition_index = parition_scaled, | ||
partition_ammount = cross_check_fingerprint_scatter_partition, | ||
gvcf_paths_length = length(gvcf_paths), | ||
haplotype_database = haplotype_database, | ||
output_base_name = callset_name + "." + idx, | ||
scattered = true | ||
scattered = true, | ||
disk = small_disk | ||
} | ||
} | ||
|
||
|
@@ -426,19 +457,19 @@ workflow JointGenotyping { | |
} | ||
} | ||
|
||
if (!scatter_cross_check_fingerprints) { | ||
|
||
scatter (line in sample_name_map_lines) { | ||
File gvcf_paths = line[1] | ||
} | ||
if (!defined(cross_check_fingerprint_scatter_partition)) { | ||
|
||
call Tasks.CrossCheckFingerprint as CrossCheckFingerprintSolo { | ||
input: | ||
gvcf_paths = gvcf_paths, | ||
vcf_paths = ApplyRecalibration.recalibrated_vcf, | ||
sample_name_map = sample_name_map, | ||
gvcf_paths_fofn = write_lines(gvcf_paths), | ||
gvcf_index_paths_fofn = write_lines(gvcf_path_indexes), | ||
vcf_paths_fofn = write_lines(ApplyRecalibration.recalibrated_vcf), | ||
vcf_index_paths_fofn = write_lines(ApplyRecalibration.recalibrated_vcf_index), | ||
sample_names_from_map_fofn = write_lines(sample_names_from_map), | ||
gvcf_paths_length = length(gvcf_paths), | ||
haplotype_database = haplotype_database, | ||
output_base_name = callset_name | ||
output_base_name = callset_name, | ||
disk = small_disk | ||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This seems to be a lot of the changes -- moving from tasks taking an Array[File] to a fofn produced by write_lines. I'm guessing this is an Azure-ism?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, some of these should no longer be necessary. I cleaned some of this up.
For the ones that remain, I think we needed these to be FOFNs for two reasons: 1) localization_optional isn't implemented yet in Azure so the inputs need to be either FOFNs or Array[String] and more importantly 2) The SAS token environment variable provided by Cromwell is based on where the File input is located. Namely the FOFN File needs to have the same SAS token as the rest of the inputs and by providing the task a FOFN rather than Array[String] we tell Cromwell where to grab the SAS from.