Responded to review comments

broadinstitute · Mar 17, 2023 · a690509 · a690509
1 parent 8d3d123
commit a690509
Show file tree

Hide file tree

Showing 24 changed files with 350 additions and 13,531 deletions.
diff --git a/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionIterator.java b/src/main/java/org/broadinstitute/hellbender/engine/AssemblyRegionIterator.java
@@ -140,9 +140,7 @@ private AssemblyRegion loadNextAssemblyRegion() {
 
 
             final ActivityProfileState profile = evaluator.isActive(pileup, pileupRefContext, pileupFeatureContext);
-            //TODO pull out the qual score for later
             logger.debug(() -> profile.toString());
-//            if (profile.isActiveProb() > 0.001) System.out.println(profile.toString());
             activityProfile.add(profile);
 
             if (pendingAlignmentData!=null) {
@@ -186,8 +184,6 @@ private AssemblyRegion loadNextAssemblyRegion() {
             fillNextAssemblyRegionWithReads(nextRegion);
             // fillnextessemblyregion; check you are on correct chr; if alignment data is not in the assembly region then pop it
             fillNextAssemblyRegionWithPileupData(nextRegion);
-            //TODO this is a hack
-//            fillNextAssemblyRegionWith(activityProfile.)
         }
 
         return nextRegion;

diff --git a/...org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java b/...org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyBasedCallerUtils.java
@@ -628,7 +628,7 @@ private static List<Allele> getAllelesNotPresentInAssembly(VariantContext givenV
                     ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
             final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
                     ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
-            //TODO ADD A TEST FOR THIS CHANGE! IT WAS A RARE BUG BEFORE
+            //We must filter out the Ref alleles here to protect against edge cases and undexpected behavior from the pileupcalling code
             unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).filter(a -> !a.isReference()).collect(Collectors.toList());
         }
         return unassembledGivenAlleles;
@@ -930,7 +930,7 @@ public static Map<Allele, List<Haplotype>> createAlleleMapper(final VariantConte
         for (final Haplotype h : haplotypes) {
 
             // Partially determined haplotypes know at what position they are determined, only determined position haps should be considered for genotyping
-            if (h instanceof PartiallyDeterminedHaplotype && ((PartiallyDeterminedHaplotype) h).getDeterminedPosition() != loc) {
+            if (h.isPartiallyDetermined() && ((PartiallyDeterminedHaplotype) h).getDeterminedPosition() != loc) {
                 continue;
             }
             final List<VariantContext> spanningEvents = h.getEventMap().getOverlappingEvents(loc);

diff --git a/...n/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java b/...n/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/AssemblyResultSet.java
@@ -39,7 +39,7 @@ public final class AssemblyResultSet {
             .thenComparing(vc -> vc.getAlternateAllele(0));
     private final Map<Integer,AssemblyResult> assemblyResultByKmerSize;
     private final Set<Haplotype> haplotypes;
-    private Set<Haplotype> originalAssemblyHaps = new HashSet<>();
+    private Set<Haplotype> originalAssemblyHaps = new LinkedHashSet<>();
     private final Map<Haplotype,AssemblyResult> assemblyResultByHaplotype;
     private AssemblyRegion regionForGenotyping;
     private byte[] fullReferenceWithPadding;
@@ -594,7 +594,9 @@ public void replaceAllHaplotypes(Set<Haplotype> list) {
         }
     }
 
-    //TODO this is REALLY BAD get rid of it for future code:
+    // For PDHMM use: Remove a haplotype from this AssemblyResultSet and update all of the various references in the
+    //                object to that haplotype to be current.
+    // WARNING: Deleting haplotypes in this manner is highly dangerous and will likely lead to lost variants
     public void removeHapltotype(final Haplotype hap) {
 
         haplotypes.remove(hap);
@@ -604,7 +606,6 @@ public void removeHapltotype(final Haplotype hap) {
             discovered.remove(hap);
             assemblyResultByKmerSize.get(kmerSize).setDiscoveredHaplotypes(discovered);
         }
-        assemblyResultByKmerSize.keySet().forEach(kmer -> assemblyResultByKmerSize.get(kmer).getDiscoveredHaplotypes());
     }
 
     public void setPartiallyDeterminedMode() {
@@ -615,8 +616,10 @@ public boolean isPartiallyDeterminedList() {
         return isPartiallyDeterminedList;
     }
 
+    // For PDHMM use: store original assembly haplotypes if we are injecting artificial haplotypes later
+    // WARNING: This is likely not set in every case, use with caution
     public void storeAssemblyHaplotypes() {
-        originalAssemblyHaps = new HashSet<>(haplotypes);
+        originalAssemblyHaps = new LinkedHashSet<>(haplotypes);
     }
 
     public boolean hasOverwrittenHaps() {

diff --git a/...nstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java b/...nstitute/hellbender/tools/walkers/haplotypecaller/FlowBasedAlignmentLikelihoodEngine.java
@@ -8,7 +8,6 @@
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.genotyper.*;
 import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
-import org.broadinstitute.hellbender.utils.haplotype.PartiallyDeterminedHaplotype;
 import org.broadinstitute.hellbender.utils.read.*;
 import org.broadinstitute.hellbender.utils.haplotype.FlowBasedHaplotype;
 

diff --git a/...ain/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java b/...ain/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCaller.java
@@ -20,11 +20,9 @@
 import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
 import org.broadinstitute.hellbender.tools.walkers.annotator.HaplotypeFilteringAnnotation;
 import org.broadinstitute.hellbender.tools.walkers.annotator.VariantAnnotatorEngine;
-import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
 import org.broadinstitute.hellbender.transformers.DRAGENMappingQualityReadTransformer;
 import org.broadinstitute.hellbender.transformers.ReadTransformer;
 import org.broadinstitute.hellbender.utils.fasta.CachingIndexedFastaSequenceFile;
-import org.broadinstitute.hellbender.cmdline.ModeArgumentUtils;
 
 import java.util.Collection;
 import java.util.List;
@@ -172,21 +170,21 @@ public List<ReadFilter> getDefaultReadFilters() {
      */
     @Override
     protected String[] customCommandLineValidation() {
-        if ((hcArgs.dragenMode || hcArgs.dragen378Mode) && hcArgs.flowMode != HaplotypeCallerArgumentCollection.FlowMode.NONE ) {
+        if ((hcArgs.isDragenGATKMode()) && hcArgs.isFlowBasedCallingMode()) {
             throw new UserException("dragen mode and flow mode can't be both specified");
         }
 
-        if (hcArgs.dragenMode || hcArgs.dragen378Mode) {
+        if (hcArgs.isDragenGATKMode()) {
             final GATKReadFilterPluginDescriptor readFilterPlugin =
                     getCommandLineParser().getPluginDescriptor(GATKReadFilterPluginDescriptor.class);
             Optional<ReadFilter> filterOptional = readFilterPlugin.getResolvedInstances().stream().filter(rf -> rf instanceof MappingQualityReadFilter).findFirst();
             filterOptional.ifPresent(readFilter -> ((MappingQualityReadFilter) readFilter).minMappingQualityScore = 1);
             ModeArgumentUtils.setArgValues(
                     getCommandLineParser(),
-                    hcArgs.dragen378Mode? hcArgs.getDragenVersion2NameValuePairs() : hcArgs.getDragenNameValuePairs(),
+                    hcArgs.dragen378Mode? hcArgs.getDragenVersion378NameValuePairs() : hcArgs.getDragenVersion3412NameValuePairs(),
                     hcArgs.dragen378Mode? HaplotypeCallerArgumentCollection.DRAGEN_378_GATK_MODE_LONG_NAME : HaplotypeCallerArgumentCollection.DRAGEN_GATK_MODE_LONG_NAME);
         }
-        if (hcArgs.flowMode != HaplotypeCallerArgumentCollection.FlowMode.NONE) {
+        if (hcArgs.isFlowBasedCallingMode()) {
             ModeArgumentUtils.setArgValues(
                     getCommandLineParser(),
                     hcArgs.flowMode.getNameValuePairs(),

diff --git a/...institute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java b/...institute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerArgumentCollection.java
@@ -154,33 +154,42 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
      */
     @Argument(fullName = DRAGEN_GATK_MODE_LONG_NAME, optional = true, doc="Single argument for enabling the bulk of DRAGEN-GATK features. NOTE: THIS WILL OVERWRITE PROVIDED ARGUMENT CHECK TOOL INFO TO SEE WHICH ARGUMENTS ARE SET).", mutex = {DRAGEN_378_GATK_MODE_LONG_NAME})
     public Boolean dragenMode = false;
+
     /**
      * DRAGEN-GATK version 2: This includes PDHMM and Columnwise detection (with hopes to add Joint Detection and new STRE as well in the future)
      */
+    @Advanced
     @Argument(fullName = DRAGEN_378_GATK_MODE_LONG_NAME, optional = true, doc="Single argument for enabling the bulk of DRAGEN-GATK features including new developments for concordance against DRAGEN 3.7.8. NOTE: THIS WILL OVERWRITE PROVIDED ARGUMENT CHECK TOOL INFO TO SEE WHICH ARGUMENTS ARE SET).", mutex = {DRAGEN_GATK_MODE_LONG_NAME})
     public Boolean dragen378Mode = false;
+
     @Advanced
     @Argument(fullName = FLOW_GATK_MODE_LONG_NAME, optional = true, doc="Single argument for enabling the bulk of Flow Based features. NOTE: THIS WILL OVERWRITE PROVIDED ARGUMENT CHECK TOOL INFO TO SEE WHICH ARGUMENTS ARE SET).")
     public FlowMode flowMode = FlowMode.NONE;
+
     @Advanced
     @Argument(fullName = APPLY_BQD_LONG_NAME, doc = "If enabled this argument will apply the DRAGEN-GATK BaseQualityDropout model to the genotyping model for filtering sites due to Linked Error mode.", optional = true)
     public boolean applyBQD = false;
+
     @Advanced
     @Argument(fullName = APPLY_FRD_LONG_NAME, doc = "If enabled this argument will apply the DRAGEN-GATK ForeignReadDetection model to the genotyping model for filtering sites.", optional = true)
     public boolean applyFRD = false;
+
     @Advanced
     @Argument(fullName = DISABLE_SPANNING_EVENT_GENOTYPING_LONG_NAME, doc = "If enabled this argument will disable inclusion of the '*' spanning event when genotyping events that overlap deletions", optional = true)
     public boolean disableSpanningEventGenotyping = false;
+
     @Advanced
     @Argument(fullName = TRANSFORM_DRAGEN_MAPPING_QUALITY_LONG_NAME, doc = "If enabled this argument will map DRAGEN aligner aligned reads with mapping quality <=250 to scale up to MQ 50", optional = true)
     public boolean transformDRAGENMapQ = false;
-    //TODO NOTE TO THE REVIEWER, THIS ARGUMENT IS INSUFFICIENT BOTH THIS AND --minimum-mapping-quality must be set, unfortunatley
+
+    //TODO NOTE TO THE REVIEWER, THIS ARGUMENT IS INSUFFICIENT BOTH THIS AND --minimum-mapping-quality must be set, unfortunately
     //TODO they can't be unified since the toolDefaultReadFilters get instantiated before this field gets populated, and we can't
     //TODO pull the threshold from that filter since it might or might not exist by the time we go to filter for threading, really
-    //TODO we should unify on the readFilter version of this check i think but perhaps they are seperate for athropological historical reasons and it is thus culturally protected?
+    //TODO we should unify on the readFilter version of this check i think but perhaps they are separate for anthropological historical reasons and it is thus culturally protected?
     @Advanced
     @Argument(fullName = MAPPING_QUALITY_THRESHOLD_FOR_GENOTYPING_LONG_NAME, doc = "Control the threshold for discounting reads from the genotyper due to mapping quality after the active region detection and assembly steps but before genotyping. NOTE: this is in contrast to the --"+ ReadFilterArgumentDefinitions.MINIMUM_MAPPING_QUALITY_NAME+" argument which filters reads from all parts of the HaplotypeCaller. If you would like to call genotypes with a different threshold both arguments must be set.", optional = true)
     public int mappingQualityThreshold = HaplotypeCallerEngine.DEFAULT_READ_QUALITY_FILTER_THRESHOLD;
+
     @Advanced
     @Argument(fullName = MAX_EFFECTIVE_DEPTH_ADJUSTMENT_FOR_FRD_LONG_NAME, doc = "Set the maximum depth to modify FRD adjustment to in the event of high depth sites (0 to disable)", optional = false)
     public int maxEffectiveDepthAdjustment = 0;
@@ -232,7 +241,7 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
     @Argument(fullName= USE_FILTERED_READS_FOR_ANNOTATIONS_LONG_NAME, doc = "Use the contamination-filtered read maps for the purposes of annotating variants", optional=true)
     public boolean useFilteredReadMapForAnnotations = false;
 
-    public String[] getDragenNameValuePairs() {
+    public String[] getDragenVersion3412NameValuePairs() {
         return new String[]{
                 APPLY_BQD_LONG_NAME, "true",
                 APPLY_FRD_LONG_NAME, "true",
@@ -251,7 +260,15 @@ public String[] getDragenNameValuePairs() {
         };
     }
 
-    public String[] getDragenVersion2NameValuePairs() {
+    /**
+     * Extra arguments required for DRAGEN-GATK to be Functionally-Equivalent to DRAGEN v3.7.8. These are a super-set
+     * of the arguments above required for DRAGEN v3.4.12 Functional-Equivalence.
+     * The notable differences in the two versions are as follows:
+     *  - v3.7.8 is run with Pileup-Calling enabled which is equivalent to DRAGEN "Joint Detection" in function
+     *  - v3.7.8 adds heuristics to the Pileup-Caller to filter out false-positives. These heuristics may also filter out false positives from the assembly graphs.
+     *  - v3.7.8 is run with the PartiallyDetermined HMM (or "PDHMM") in place of the normal HMM for assigning allele likelihoods
+     */
+    public String[] getDragenVersion378NameValuePairs() {
         return new String[]{
                 APPLY_BQD_LONG_NAME, "true",
                 APPLY_FRD_LONG_NAME, "true",
@@ -273,7 +290,7 @@ public String[] getDragenVersion2NameValuePairs() {
                 PileupDetectionArgumentCollection.PILEUP_DETECTION_ABSOLUTE_ALT_DEPTH, "0",
                 PileupDetectionArgumentCollection.PILEUP_DETECTION_BAD_READ_RATIO_LONG_NAME, "0.40",
                 PileupDetectionArgumentCollection.PILEUP_DETECTION_ENABLE_INDELS, "true",
-                PileupDetectionArgumentCollection.PILEUP_DETECTION_ACTIVE_REGION_LOD_THRESHOLD_LONG_NAME, "3.0",
+                PileupDetectionArgumentCollection.PILEUP_DETECTION_ACTIVE_REGION_PHRED_THRESHOLD_LONG_NAME, "3.0",
                 PileupDetectionArgumentCollection.GENERATE_PARTIALLY_DETERMINED_HAPLOTYPES_LONG_NAME, "true"
         };
     }
@@ -284,6 +301,17 @@ public String[] getDragenVersion2NameValuePairs() {
     @Argument(fullName = STEPWISE_FITLERING_ARGUMENT, doc = "If enabled, this will create a FlowBasedAligner to use for filtering haplotypes before using another likelihoods engine for scoring.")
     public boolean stepwiseFiltering = false;
 
+    // Should we make any alterations to the pipeline for DRAGEN-GATK
+    boolean isDragenGATKMode() {
+        return dragenMode || dragen378Mode;
+    }
+
+    // Should we make any alterations to the pipeline for FlowBasedCalling
+    boolean isFlowBasedCallingMode() {
+        return flowMode != FlowMode.NONE;
+    }
+
+
     /**
      * the different flow modes, in terms of their parameters and their values
      *