Skip to content

Commit

Permalink
Performed a round of ablation on new annotation-based filtering tools.
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelklee committed Aug 14, 2023
1 parent 4212fdf commit bda84e0
Show file tree
Hide file tree
Showing 108 changed files with 194 additions and 653 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ fi
echo "Docker build done =========="

sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json

echo "Running Filtering WDL through cromwell"

Expand All @@ -41,6 +40,3 @@ done
FIN
cat $WORKING_DIR/vcf_site_level_filtering_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json

cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json

This file was deleted.

17 changes: 12 additions & 5 deletions scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ workflow JointVcfFiltering {
String resource_args

String? model_backend
File? python_script
File? training_python_script
File? hyperparameters_json
File? scoring_python_script

String? extract_extra_args
String? train_extra_args
Expand All @@ -55,9 +56,9 @@ workflow JointVcfFiltering {
model_backend: "(Optional) Model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
training_python_script: "(Optional) Python script specifying custom model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
scoring_python_script: "(Optional) Python script specifying custom model backend to be used by ScoreVariantAnnotations. See GATK documentation for this tool."
hyperparameters_json: "(Optional) JSON file specifying model hyperparameters to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-negative training, etc. See GATK documentation for this tool."
score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-unlabeled learning, etc. See GATK documentation for this tool."
score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
}

call ExtractVariantAnnotations {
Expand All @@ -79,7 +80,7 @@ workflow JointVcfFiltering {
annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5,
unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5,
model_backend = model_backend,
python_script = python_script,
python_script = training_python_script,
hyperparameters_json = hyperparameters_json,
output_prefix = output_prefix,
extra_args = train_extra_args,
Expand All @@ -101,6 +102,8 @@ workflow JointVcfFiltering {
extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx,
model_prefix = output_prefix,
model_files = TrainVariantAnnotationsModel.model_files,
model_backend = model_backend,
python_script = scoring_python_script,
extra_args = score_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
Expand Down Expand Up @@ -251,6 +254,8 @@ task ScoreVariantAnnotations {
File extracted_vcf_idx
String model_prefix
Array[File] model_files
String? model_backend
File? python_script
String? extra_args
File? monitoring_script

Expand Down Expand Up @@ -287,6 +292,8 @@ task ScoreVariantAnnotations {
~{resource_args} \
--resource:extracted,extracted=true ~{extracted_vcf} \
--model-prefix model-files/~{model_prefix}.train \
~{"--model-backend " + model_backend} \
~{"--python-script " + python_script} \
~{extra_args}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
* <ul>
* <li>
* Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles,
* if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified).
* if at least one allele-specific annotation with {@code Number=A} is specified).
* </li>
* <li>
* Annotations to extract.
Expand Down Expand Up @@ -128,7 +128,7 @@
* <p>
* Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations).
* See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details.
* If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is specified, each record corresponds to an individual allele;
* In allele-specific mode (i.e., when allele-specific annotations are requested), each record corresponds to an individual allele;
* otherwise, each record corresponds to a variant site, which may contain multiple alleles.
* Storage of alleles can be omitted using the {@value OMIT_ALLELES_IN_HDF5_LONG_NAME} argument, which will reduce
* the size of the file. This file will only be produced if resources are provided and the number of extracted
Expand Down Expand Up @@ -184,9 +184,8 @@
* 1000000 unlabeled (i.e., non-training/calibration) sites, producing the outputs
* 1) {@code extract.annot.hdf5}, 2) {@code extract.unlabeled.annot.hdf5}, 3) {@code extract.vcf.gz},
* and 4) {@code extract.vcf.gz.tbi}. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel}
* to train a model using a positive-negative approach (similar to that used in {@link VariantRecalibrator}).
* Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are
* selected by default.
* to train a model using a positive-unlabeled approach. Note that the {@value MODE_LONG_NAME} arguments
* are made explicit here, although both SNP and INDEL modes are selected by default.
*
* <pre>
* gatk ExtractVariantAnnotations \
Expand Down Expand Up @@ -249,11 +248,10 @@ public final class ExtractVariantAnnotations extends LabeledVariantAnnotationsWa
doc = "Maximum number of unlabeled variants to extract. " +
"If greater than zero, reservoir sampling will be used to randomly sample this number " +
"of sites from input sites that are not present in the specified resources. " +
"Choice of this number should be guided by considerations for training the negative model in " +
"Choice of this number should be guided by considerations for training the model in " +
"TrainVariantAnnotationsModel; users may wish to choose a number that is comparable to the " +
"expected size of the labeled training set or that is compatible with available memory resources. " +
"Note that in allele-specific mode (--" + LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME +
" true), this argument limits the number of variant records, rather than the number of alleles.",
"Note that in allele-specific mode, this argument limits the number of variant records, rather than the number of alleles.",
minValue = 0)
private int maximumNumberOfUnlabeledVariants = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.collections4.ListUtils;
Expand All @@ -34,6 +35,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up @@ -87,7 +89,6 @@
public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVariantWalker {

public static final String MODE_LONG_NAME = "mode";
public static final String USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME = "use-allele-specific-annotations";
public static final String IGNORE_FILTER_LONG_NAME = "ignore-filter";
public static final String IGNORE_ALL_FILTERS_LONG_NAME = "ignore-all-filters";
public static final String DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME = "do-not-trust-all-polymorphic";
Expand Down Expand Up @@ -129,12 +130,6 @@ enum ResourceMatchingStrategy {
minElements = 1)
private List<VariantType> variantTypesToExtractList = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL));

@Argument(
fullName = USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME,
doc = "If true, use the allele-specific versions of the specified annotations.",
optional = true)
boolean useASAnnotations = false;

@Argument(
fullName = IGNORE_FILTER_LONG_NAME,
doc = "Ignore the specified filter(s) in the input VCF.",
Expand All @@ -159,13 +154,13 @@ enum ResourceMatchingStrategy {
@Argument(
fullName = RESOURCE_MATCHING_STRATEGY_LONG_NAME,
doc = "The strategy to use for determining whether an input variant is present in a resource " +
"in non-allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " false). " +
"in non-allele-specific mode. " +
"START_POSITION: Start positions of input and resource variants must match. " +
"START_POSITION_AND_GIVEN_REPRESENTATION: The intersection of the sets of input and resource alleles " +
"(in their given representations) must also be non-empty. " +
"START_POSITION_AND_MINIMAL_REPRESENTATION: The intersection of the sets of input and resource alleles " +
"(after converting alleles to their minimal representations) must also be non-empty. " +
"This argument has no effect in allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " true), " +
"This argument has no effect in allele-specific mode, " +
"in which the minimal representations of the input and resource alleles must match.",
optional = true)
private ResourceMatchingStrategy resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION;
Expand All @@ -186,6 +181,7 @@ enum ResourceMatchingStrategy {
private final Set<String> ignoreInputFilterSet = new TreeSet<>();
Set<VariantType> variantTypesToExtract;
TreeSet<String> resourceLabels = new TreeSet<>();
boolean useASAnnotations;

File outputAnnotationsFile;
VariantContextWriter vcfWriter;
Expand Down Expand Up @@ -222,9 +218,11 @@ public void onTraversalStart() {
LabeledVariantAnnotationsData.SNP_LABEL));
}

useASAnnotations = isAlleleSpecificAnnotationRequested();

if (useASAnnotations && resourceMatchingStrategy != ResourceMatchingStrategy.START_POSITION_AND_MINIMAL_REPRESENTATION) {
logger.warn(String.format("The %s argument is ignored when %s is set to true. The START_POSITION_AND_MINIMAL_REPRESENTATION strategy will be used.",
RESOURCE_MATCHING_STRATEGY_LONG_NAME, USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME));
logger.warn(String.format("The %s argument is ignored when allele-specific annotations are requested. The START_POSITION_AND_MINIMAL_REPRESENTATION strategy will be used.",
RESOURCE_MATCHING_STRATEGY_LONG_NAME));
resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION_AND_MINIMAL_REPRESENTATION;
}

Expand All @@ -251,6 +249,12 @@ public Object onTraversalSuccess() {
return null;
}

/**
 * Determines whether any of the requested annotations is allele-specific, i.e., declared in the
 * input VCF header with {@code Number=A} (one value per alternate allele).
 *
 * @return {@code true} if at least one requested annotation has count type {@link VCFHeaderLineCount#A}
 * @throws IllegalArgumentException if a requested annotation has no INFO header line in the input VCF;
 *         previously this surfaced as an uninformative {@link NullPointerException}
 */
private boolean isAlleleSpecificAnnotationRequested() {
    // Deduplicate while preserving the order in which annotations were specified.
    final Set<String> distinctAnnotationNames = new LinkedHashSet<>(annotationNames);
    final VCFHeader inputHeader = getHeaderForVariants();
    return distinctAnnotationNames.stream()
            .anyMatch(a -> {
                final VCFInfoHeaderLine infoHeaderLine = inputHeader.getInfoHeaderLine(a);
                if (infoHeaderLine == null) {
                    // Fail fast with a clear message rather than an NPE from getCountType() below.
                    throw new IllegalArgumentException(String.format(
                            "The annotation %s is not present in the header of the input VCF.", a));
                }
                return infoHeaderLine.getCountType() == VCFHeaderLineCount.A;
            });
}

static void addExtractedVariantToData(final LabeledVariantAnnotationsData data,
final VariantContext variant,
final List<Triple<List<Allele>, VariantType, TreeSet<String>>> metadata) {
Expand Down
Loading

0 comments on commit bda84e0

Please sign in to comment.