Checkpoint of necessary changes for DRAGEN v3.7.8 concordance (#8083)
Added a new mode to HaplotypeCaller: --dragen-378-concordance-mode
- Rewrote the pileup-detection code to support DRAGEN-based heuristics for filtering out pileup alleles.
- Extended pileup-detection code to support both insertions and deletions.
- Added a new HMM model, the "PDHMM" or "Partially Determined" HMM. This model treats only one allele at a time as "determined"; everything else in the region is a wildcard that grants no-penalty passage through the HMM matrix to reads carrying any of the wildcard alleles, greatly reducing the combinatorial complexity at messy assembly regions. (A conceptual sketch follows this list.)
- Added a PartiallyDeterminedHaplotypeComputationEngine.java class for managing the construction of PDHaplotypes from the union (plus filtering) of both assembly alleles and pileup-detection alleles. 
- Added a series of extra debugging arguments to the HaplotypeCaller and PDHMM to facilitate future development.
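
As a rough illustration of the wildcard idea, here is a minimal, ungapped toy scorer; this is not the actual PDHMM (which is a full pair-HMM), and every name in it is invented:

// Toy illustration of "partially determined" scoring -- NOT the GATK PDHMM.
// Positions flagged as wildcards (undetermined) contribute no penalty, so a single
// pass scores a read against all combinations of the undetermined alleles at once.
final class PartiallyDeterminedSketch {
    static double logLikelihood(final byte[] read, final byte[] haplotype,
                                final boolean[] wildcard, final double logMismatchPenalty) {
        double logLik = 0.0;
        final int len = Math.min(read.length, haplotype.length);
        for (int i = 0; i < len; i++) {
            if (wildcard[i]) {
                continue;                        // undetermined position: free passage
            }
            if (read[i] != haplotype[i]) {
                logLik += logMismatchPenalty;    // determined position: normal mismatch cost
            }
        }
        return logLik;
    }
}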
jamesemery committed Mar 21, 2023
1 parent e68f066 commit 9afa998
Showing 36 changed files with 3,834 additions and 287 deletions.
@@ -7,6 +7,7 @@ public class AlignmentAndReferenceContext {

private final AlignmentContext alignmentContext;
private final ReferenceContext referenceContext;
private double activityScore = 0.0;

public AlignmentAndReferenceContext(final AlignmentContext alignmentContext,
final ReferenceContext referenceContext) {
@@ -29,4 +30,12 @@ public AlignmentContext getAlignmentContext() {
public ReferenceContext getReferenceContext() {
return referenceContext;
}

public double getActivityScore() {
return activityScore;
}

public void setActivityScore(double activityScore) {
this.activityScore = activityScore;
}
}
@@ -137,14 +137,18 @@ private AssemblyRegion loadNextAssemblyRegion() {
final SimpleInterval pileupInterval = new SimpleInterval(pileup);
final ReferenceContext pileupRefContext = new ReferenceContext(reference, pileupInterval);
final FeatureContext pileupFeatureContext = new FeatureContext(features, pileupInterval);
if (pendingAlignmentData!=null) {
pendingAlignmentData.add(new AlignmentAndReferenceContext(pileup, pileupRefContext));
}


final ActivityProfileState profile = evaluator.isActive(pileup, pileupRefContext, pileupFeatureContext);
logger.debug(() -> profile.toString());
activityProfile.add(profile);

if (pendingAlignmentData!=null) {
AlignmentAndReferenceContext alignmentAndRefContext = new AlignmentAndReferenceContext(pileup, pileupRefContext);
alignmentAndRefContext.setActivityScore(profile.getOriginalActiveProb());
pendingAlignmentData.add(alignmentAndRefContext);
}

// A pending region only becomes ready once our locus iterator has advanced beyond the end of its extended span
// (this ensures that we've loaded all reads that belong in the new region)
if ( ! pendingRegions.isEmpty() && IntervalUtils.isAfter(pileup.getLocation(), pendingRegions.peek().getPaddedSpan(), readHeader.getSequenceDictionary()) ) {
@@ -31,8 +31,11 @@
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter;
import org.broadinstitute.hellbender.utils.haplotype.PartiallyDeterminedHaplotype;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.pileup.PileupBasedAlleles;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.*;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
@@ -72,6 +75,8 @@ public final class AssemblyBasedCallerUtils {
// get realigned incorrectly. See https://github.com/broadinstitute/gatk/issues/5060
public static final int MINIMUM_READ_LENGTH_AFTER_TRIMMING = 10;

public static final OneShotLogger haplotypeDeletionWarningLogger = new OneShotLogger(AssemblyBasedCallerUtils.class);

// Phase group notation can be interpreted as a representation of the alleles present on the two phased haplotypes at the site:
// "0": REF or '*'; "1": site-specific alt allele
enum PhaseGroup {
@@ -263,8 +268,7 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine(
final boolean handleSoftclips,
final ReadLikelihoodCalculationEngine.Implementation implementation) {
//AlleleLikelihoods::normalizeLikelihoods uses Double.NEGATIVE_INFINITY as a flag to disable capping
final double log10GlobalReadMismappingRate = likelihoodArgs.phredScaledGlobalReadMismappingRate < 0 ? Double.NEGATIVE_INFINITY
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate);
final double log10GlobalReadMismappingRate = getGlobalMismatchingRateFromArgs(likelihoodArgs);

switch ( implementation) {
// TODO these constructors should eventually be matched so they both incorporate all the same ancillary arguments
@@ -284,6 +288,14 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine(
}
}

/**
* Exposed so that PDHMM can be constructed outside of this class
*/
public static double getGlobalMismatchingRateFromArgs(LikelihoodEngineArgumentCollection likelihoodArgs) {
return likelihoodArgs.phredScaledGlobalReadMismappingRate < 0 ? Double.NEGATIVE_INFINITY
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate);
}
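
For orientation, QualityUtils.qualToErrorProbLog10 follows the Phred scale, where the log10 error probability is -Q/10; a negative phredScaledGlobalReadMismappingRate therefore disables the cap entirely. Illustrative values:

// Worked example of the conversion above (values illustrative):
//   phredScaledGlobalReadMismappingRate = 45  ->  qualToErrorProbLog10(45) = -4.5
//   phredScaledGlobalReadMismappingRate < 0   ->  Double.NEGATIVE_INFINITY (capping disabled)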

public static Optional<HaplotypeBAMWriter> createBamWriter(final AssemblyBasedCallerArgumentCollection args,
final boolean createBamOutIndex,
final boolean createBamOutMD5,
@@ -320,7 +332,6 @@ public static VariantContext makeMergedVariantContext(final List<VariantContext>
* for further HC steps
*/
public static AssemblyResultSet assembleReads(final AssemblyRegion region,
final List<VariantContext> forcedPileupAlleles,
final AssemblyBasedCallerArgumentCollection argumentCollection,
final SAMFileHeader header,
final SampleList sampleList,
@@ -397,10 +408,6 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
haplotypeCollapsing);

assemblyResultSet.setHaplotypeCollapsingEngine(haplotypeCollapsing);

if (!forcedPileupAlleles.isEmpty()) {
processPileupAlleles(region, forcedPileupAlleles, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters());
}
assemblyResultSet.setDebug(argumentCollection.assemblerArgs.debugAssembly);
assemblyResultSet.debugDump(logger);
return assemblyResultSet;
@@ -429,15 +436,76 @@ private static int determineFlowAssemblyColapseHmer(SAMFileHeader readsHeader) {
return result;
}

/**
* Helper method that handles the actual "GGA-like" merging of haplotype alleles into an assembly result set.
*
* First this method filters out haplotypes that contain alleles that failed the pileup-calling filtering steps;
* then it attempts to poke artificial haplotypes carrying the detected alleles into the haplotype list.
*
* @param region the assembly region being processed
* @param argumentCollection the caller argument collection
* @param aligner SmithWaterman aligner used when constructing artificial haplotypes
* @param refHaplotype the reference haplotype for the region
* @param assemblyResultSet the assembly result set to be modified
* @param pileupAllelesFoundShouldFilter pileup alleles that failed the filtering heuristics; haplotypes supporting them are removed
* @param pileupAllelesPassingFilters pileup alleles that passed filtering and should be forced into haplotypes
* @param debug whether to print verbose debugging output
* @return the modified assembly result set
*/
public static AssemblyResultSet applyPileupEventsAsForcedAlleles(final AssemblyRegion region, final AssemblyBasedCallerArgumentCollection argumentCollection,
final SmithWatermanAligner aligner, final Haplotype refHaplotype,
final AssemblyResultSet assemblyResultSet, final List<VariantContext> pileupAllelesFoundShouldFilter,
final List<VariantContext> pileupAllelesPassingFilters, final boolean debug) {
List<Haplotype> haplotypesWithFilterAlleles = new ArrayList<>();
// IF we find pileup alleles that we want to filter... AND we are not generating the partially determined haplotypes,
// we resort to a messy approach where we filter alleles by throwing away every haplotype supporting them. This is
// very dangerous, since it can easily destroy phased variants along with the haplotype.
if (!pileupAllelesFoundShouldFilter.isEmpty() && !argumentCollection.pileupDetectionArgs.generatePDHaplotypes) {
// TODO this is a bad algorithm for bad people
for(VariantContext delVariant : pileupAllelesFoundShouldFilter) {
for (Haplotype hap : assemblyResultSet.getHaplotypeList()) {
if (hap.getEventMap()==null) {
// NOTE: This check should be a reasonable sanity check on the inputs. However, there are edge cases in the assembly engine + SW realignment that can cause
// haplotypes to re-merge into the reference and end up with an empty event map (also, the event map code explicitly expects to fail in some instances),
// so we can't enforce that the haplotypes have no variants.
// if (!hap.isReference()) {
// //throw new RuntimeException("empty event map for haplotype" + hap);
// }
} else {
if (hap.getEventMap().getVariantContexts().stream().anyMatch(v -> v.getStart() == delVariant.getStart()
&& delVariant.getReference().equals(v.getReference())
&& delVariant.getAlternateAllele(0).equals(v.getAlternateAllele(0)))) {
if (argumentCollection.pileupDetectionArgs.debugPileupStdout) System.err.println("Flagging hap " + hap + " for containing variant " + delVariant);
haplotypesWithFilterAlleles.add(hap);
}
}
}
}
}
// TODO removing haplotypes whole-cloth is messy and will likely lead to errors and dropped variants in this code
if (!haplotypesWithFilterAlleles.isEmpty()) {
if (debug) System.out.println("Found Assembly Haps with filtered Variants:\n"+haplotypesWithFilterAlleles.stream().map(Haplotype::toString).collect(Collectors.joining("\n")));

haplotypeDeletionWarningLogger.warn(() -> "Haplotypes from Assembly are being filtered by heuristics from the PileupCaller. This can lead to missing variants. See --"+PileupDetectionArgumentCollection.PILEUP_DETECTION_FILTER_ASSEMBLY_HAPS_THRESHOLD+" for more details");

for (Haplotype hap : haplotypesWithFilterAlleles) {
assemblyResultSet.removeHapltotype(hap);
}
}
if (!pileupAllelesPassingFilters.isEmpty()) {
processPileupAlleles(region, pileupAllelesPassingFilters, argumentCollection.maxMnpDistance, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters(), debug);
}
return assemblyResultSet;
}
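
A hypothetical call-site sketch for the method above; the wrapper, the failsPileupFilters predicate, and the partitioning step are illustrative assumptions, since the real HaplotypeCaller decides upstream which pileup alleles fail the DRAGEN-style filters:

import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
import java.util.stream.Collectors;

// Hypothetical wrapper showing how the two allele lists feed applyPileupEventsAsForcedAlleles.
static AssemblyResultSet forcePileupAlleles(final AssemblyRegion region,
                                            final AssemblyBasedCallerArgumentCollection args,
                                            final SmithWatermanAligner aligner, final Haplotype refHaplotype,
                                            final AssemblyResultSet assemblyResultSet,
                                            final List<VariantContext> pileupAlleles,
                                            final Predicate<VariantContext> failsPileupFilters) { // assumed predicate
    final Map<Boolean, List<VariantContext>> split = pileupAlleles.stream()
            .collect(Collectors.partitioningBy(failsPileupFilters));
    return AssemblyBasedCallerUtils.applyPileupEventsAsForcedAlleles(
            region, args, aligner, refHaplotype, assemblyResultSet,
            split.get(true),    // pileupAllelesFoundShouldFilter
            split.get(false),   // pileupAllelesPassingFilters
            args.pileupDetectionArgs.debugPileupStdout);
}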

/**
* Handle pileup detected alternate alleles.
*/
@VisibleForTesting
@SuppressWarnings("deprecation")
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> givenAlleles, final int maxMnpDistance,
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> pileupVC, final int maxMnpDistance,
final int snpAdjacentToIndelLimit, final SmithWatermanAligner aligner, final Haplotype refHaplotype,
final AssemblyResultSet assemblyResultSet, final int numHaplotypesPerIteration, final int hapFilteringKmerSize,
final SWParameters haplotypeToReferenceSWParameters) {
final SWParameters haplotypeToReferenceSWParameters, final boolean debug) {
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final Map<Integer, VariantContext> assembledVariants = assemblyResultSet.getVariationEvents(maxMnpDistance).stream()
.collect(Collectors.groupingBy(VariantContext::getStart, Collectors.collectingAndThen(Collectors.toList(), AssemblyBasedCallerUtils::makeMergedVariantContext)));
@@ -455,7 +523,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
Map<Kmer, Integer> kmerReadCounts = getKmerReadCounts(region.getHardClippedPileupReads(), hapFilteringKmerSize);

// Remove SNPs that are too close to assembled indels.
final List<VariantContext> givenAllelesFiltered = givenAlleles.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList());
final List<VariantContext> givenAllelesFiltered = pileupVC.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList());

for (final VariantContext givenVC : givenAllelesFiltered) {
final VariantContext assembledVC = assembledVariants.get(givenVC.getStart());
@@ -470,6 +538,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant

final List<Haplotype> newPileupHaplotypes = new ArrayList<>();
for (final Allele givenAllele : unassembledNonSymbolicAlleles) {
if (debug) System.out.println("Processing new Haplotypes for Pileup Allele that was not in the assembly: "+givenVC);
for (final Haplotype baseHaplotype : baseHaplotypes) {
// make sure this allele doesn't collide with a variant on the haplotype
if (baseHaplotype.getEventMap() == null || baseHaplotype.getEventMap().getVariantContexts().stream().noneMatch(vc -> vc.overlaps(givenVC))) {
@@ -490,7 +559,9 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
}
}

baseHaplotypes.addAll(filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize));
Set<Haplotype> refactoredHaps = filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize);
baseHaplotypes.addAll(refactoredHaps);
if (debug) System.out.println("Constructed the following new Pileup Haplotypes after filtering:\n"+refactoredHaps.stream().map(Haplotype::toString).collect(Collectors.joining("\n")));

}
baseHaplotypes.forEach(assemblyResultSet::add);
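
One plausible reading of the kmer-based filtering used just above, as a sketch rather than the actual filterPileupHaplotypes implementation (the Kmer byte[] constructor is an assumption):

import java.util.*;
import java.util.stream.Collectors;

// Sketch: score each candidate haplotype by the read support of its kmers and keep the top N.
static Set<Haplotype> keepBestSupportedHaplotypes(final List<Haplotype> candidates,
                                                  final Map<Kmer, Integer> kmerReadCounts,
                                                  final int numToKeep, final int kmerSize) {
    final Comparator<Haplotype> bySupport = Comparator.comparingLong(hap -> {
        long support = 0;
        final byte[] bases = hap.getBases();
        for (int i = 0; i + kmerSize <= bases.length; i++) {
            // sum the read counts observed for each kmer on the haplotype
            support += kmerReadCounts.getOrDefault(new Kmer(Arrays.copyOfRange(bases, i, i + kmerSize)), 0);
        }
        return support;
    });
    return candidates.stream()
            .sorted(bySupport.reversed())
            .limit(numToKeep)
            .collect(Collectors.toCollection(LinkedHashSet::new));
}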
@@ -556,7 +627,8 @@ private static List<Allele> getAllelesNotPresentInAssembly(VariantContext givenV
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList());
// We must filter out the ref alleles here to protect against edge cases and unexpected behavior from the pileup-calling code
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).filter(a -> !a.isReference()).collect(Collectors.toList());
}
return unassembledGivenAlleles;
}
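
A worked example of the remapping performed above, with illustrative alleles:

// Example: two variant contexts at the same start position.
//   assembled VC: ref = ACT, alt = A     (deletion of CT)
//   given VC:     ref = AC,  alt = A     (deletion of C)
// Both are remapped onto the longer reference ACT: the given alt A gains the
// trailing T and becomes AT. The comparison is then {A} (assembled) vs {AT} (given),
// so AT is returned as a given allele not present in the assembly.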
@@ -856,6 +928,10 @@ public static Map<Allele, List<Haplotype>> createAlleleMapper(final VariantConte

for (final Haplotype h : haplotypes) {

// Partially determined haplotypes know at what position they are determined; only haps determined at this position should be considered for genotyping
if (h.isPartiallyDetermined() && ((PartiallyDeterminedHaplotype) h).getDeterminedPosition() != loc) {
continue;
}
final List<VariantContext> spanningEvents = h.getEventMap().getOverlappingEvents(loc);

if (spanningEvents.isEmpty()) { //no events --> this haplotype supports the reference at this locus