Checkpoint of necessary changes for DRAGEN v3.7.8 concordance (#8083)
Added a new mode to HaplotypeCaller: --dragen-378-concordance-mode
- Rewrote the pileup-detection code to support DRAGEN-based heuristics for filtering out pileup alleles.
- Extended pileup-detection code to support both insertions and deletions.
- Added a new HMM model, the "PDHMM" or "Partially Determined" HMM. This model treats only one allele at a time as "determined"; everything else in the region is a wildcard that grants no-penalty passage through the HMM matrix to reads carrying any of the wildcard alleles, greatly reducing the combinatorial complexity at messy assembly regions. (A conceptual sketch follows this list.)
- Added a PartiallyDeterminedHaplotypeComputationEngine.java class for managing the construction of PDHaplotypes from the union (plus filtering) of both assembly alleles and pileup-detection alleles. 
- Added a series of extra debugging arguments to the HaplotypeCaller and PDHMM to facilitate future development.
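
As a rough illustration of the wildcard idea, here is a minimal, ungapped toy scorer; this is not the actual PDHMM (which is a full pair-HMM), and every name in it is invented:

// Toy illustration of "partially determined" scoring -- NOT the GATK PDHMM.
// Positions flagged as wildcards (undetermined) contribute no penalty, so a single
// pass scores a read against all combinations of the undetermined alleles at once.
final class PartiallyDeterminedSketch {
    static double logLikelihood(final byte[] read, final byte[] haplotype,
                                final boolean[] wildcard, final double logMismatchPenalty) {
        double logLik = 0.0;
        final int len = Math.min(read.length, haplotype.length);
        for (int i = 0; i < len; i++) {
            if (wildcard[i]) {
                continue;                        // undetermined position: free passage
            }
            if (read[i] != haplotype[i]) {
                logLik += logMismatchPenalty;    // determined position: normal mismatch cost
            }
        }
        return logLik;
    }
}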
jamesemery committed Mar 21, 2023
1 parent e68f066 commit 9afa998
Showing 36 changed files with 3,834 additions and 287 deletions.
@@ -7,6 +7,7 @@ public class AlignmentAndReferenceContext {

private final AlignmentContext alignmentContext;
private final ReferenceContext referenceContext;
private double activityScore = 0.0;

public AlignmentAndReferenceContext(final AlignmentContext alignmentContext,
final ReferenceContext referenceContext) {
@@ -29,4 +30,12 @@ public AlignmentContext getAlignmentContext() {
public ReferenceContext getReferenceContext() {
return referenceContext;
}

public double getActivityScore() {
return activityScore;
}

public void setActivityScore(double activityScore) {
this.activityScore = activityScore;
}
}
@@ -137,14 +137,18 @@ private AssemblyRegion loadNextAssemblyRegion() {
final SimpleInterval pileupInterval = new SimpleInterval(pileup);
final ReferenceContext pileupRefContext = new ReferenceContext(reference, pileupInterval);
final FeatureContext pileupFeatureContext = new FeatureContext(features, pileupInterval);
if (pendingAlignmentData!=null) {
pendingAlignmentData.add(new AlignmentAndReferenceContext(pileup, pileupRefContext));
}


final ActivityProfileState profile = evaluator.isActive(pileup, pileupRefContext, pileupFeatureContext);
logger.debug(() -> profile.toString());
activityProfile.add(profile);

if (pendingAlignmentData!=null) {
AlignmentAndReferenceContext alignmentAndRefContext = new AlignmentAndReferenceContext(pileup, pileupRefContext);
alignmentAndRefContext.setActivityScore(profile.getOriginalActiveProb());
pendingAlignmentData.add(alignmentAndRefContext);
}

// A pending region only becomes ready once our locus iterator has advanced beyond the end of its extended span
// (this ensures that we've loaded all reads that belong in the new region)
if ( ! pendingRegions.isEmpty() && IntervalUtils.isAfter(pileup.getLocation(), pendingRegions.peek().getPaddedSpan(), readHeader.getSequenceDictionary()) ) {
@@ -31,8 +31,11 @@
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter;
import org.broadinstitute.hellbender.utils.haplotype.PartiallyDeterminedHaplotype;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.pileup.PileupBasedAlleles;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.*;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
@@ -72,6 +75,8 @@ public final class AssemblyBasedCallerUtils {
// get realigned incorrectly. See https://github.com/broadinstitute/gatk/issues/5060
public static final int MINIMUM_READ_LENGTH_AFTER_TRIMMING = 10;

public static final OneShotLogger haplotypeDeletionWarningLogger = new OneShotLogger(AssemblyBasedCallerUtils.class);

// Phase group notation can be interpreted as a representation of the alleles present on the two phased haplotypes at the site:
// "0": REF or '*'; "1": site-specific alt allele
enum PhaseGroup {
@@ -263,8 +268,7 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine(
final boolean handleSoftclips,
final ReadLikelihoodCalculationEngine.Implementation implementation) {
//AlleleLikelihoods::normalizeLikelihoods uses Double.NEGATIVE_INFINITY as a flag to disable capping
final double log10GlobalReadMismappingRate = likelihoodArgs.phredScaledGlobalReadMismappingRate < 0 ? Double.NEGATIVE_INFINITY
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate);
final double log10GlobalReadMismappingRate = getGlobalMismatchingRateFromArgs(likelihoodArgs);

switch ( implementation) {
// TODO these constructors should eventually be matched so they both incorporate all the same ancillary arguments
@@ -284,6 +288,14 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine(
}
}

/**
* Exposed so that PDHMM can be constructed outside of this class
*/
public static double getGlobalMismatchingRateFromArgs(LikelihoodEngineArgumentCollection likelihoodArgs) {
return likelihoodArgs.phredScaledGlobalReadMismappingRate < 0 ? Double.NEGATIVE_INFINITY
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate);
}
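
For orientation, QualityUtils.qualToErrorProbLog10 follows the Phred scale, where the log10 error probability is -Q/10; a negative phredScaledGlobalReadMismappingRate therefore disables the cap entirely. Illustrative values:

// Worked example of the conversion above (values illustrative):
//   phredScaledGlobalReadMismappingRate = 45  ->  qualToErrorProbLog10(45) = -4.5
//   phredScaledGlobalReadMismappingRate < 0   ->  Double.NEGATIVE_INFINITY (capping disabled)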

public static Optional<HaplotypeBAMWriter> createBamWriter(final AssemblyBasedCallerArgumentCollection args,
final boolean createBamOutIndex,
final boolean createBamOutMD5,
@@ -320,7 +332,6 @@ public static VariantContext makeMergedVariantContext(final List<VariantContext>
* for further HC steps
*/
public static AssemblyResultSet assembleReads(final AssemblyRegion region,
final List<VariantContext> forcedPileupAlleles,
final AssemblyBasedCallerArgumentCollection argumentCollection,
final SAMFileHeader header,
final SampleList sampleList,
@@ -397,10 +408,6 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
haplotypeCollapsing);

assemblyResultSet.setHaplotypeCollapsingEngine(haplotypeCollapsing);

if (!forcedPileupAlleles.isEmpty()) {
processPileupAlleles(region, forcedPileupAlleles, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters());
}
assemblyResultSet.setDebug(argumentCollection.assemblerArgs.debugAssembly);
assemblyResultSet.debugDump(logger);
return assemblyResultSet;
@@ -429,15 +436,76 @@ private static int determineFlowAssemblyColapseHmer(SAMFileHeader readsHeader) {
return result;
}

/**
* Helper method that handles the actual "GGA-like" merging of haplotype alleles into an assembly result set.
*
* First this method filters out haplotypes that contain alleles that failed the pileup-calling filtering steps;
* then it attempts to poke artificial haplotypes carrying the detected alleles into the haplotype list.
*
* @param region the assembly region being processed
* @param argumentCollection the caller argument collection
* @param aligner SmithWaterman aligner used when constructing artificial haplotypes
* @param refHaplotype the reference haplotype for the region
* @param assemblyResultSet the assembly result set to be modified
* @param pileupAllelesFoundShouldFilter pileup alleles that failed the filtering heuristics; haplotypes supporting them are removed
* @param pileupAllelesPassingFilters pileup alleles that passed filtering and should be forced into haplotypes
* @param debug whether to print verbose debugging output
* @return the modified assembly result set
*/
public static AssemblyResultSet applyPileupEventsAsForcedAlleles(final AssemblyRegion region, final AssemblyBasedCallerArgumentCollection argumentCollection,
final SmithWatermanAligner aligner, final Haplotype refHaplotype,
final AssemblyResultSet assemblyResultSet, final List<VariantContext> pileupAllelesFoundShouldFilter,
final List<VariantContext> pileupAllelesPassingFilters, final boolean debug) {
List<Haplotype> haplotypesWithFilterAlleles = new ArrayList<>();
// IF we find pileup alleles that we want to filter... AND we are not generating the partially determined haplotypes,
// we resort to a messy approach where we filter alleles by throwing away every haplotype supporting them. This is
// very dangerous, since it can easily destroy phased variants along with the haplotype.
if (!pileupAllelesFoundShouldFilter.isEmpty() && !argumentCollection.pileupDetectionArgs.generatePDHaplotypes) {
// TODO this is a bad algorithm for bad people
for(VariantContext delVariant : pileupAllelesFoundShouldFilter) {
for (Haplotype hap : assemblyResultSet.getHaplotypeList()) {
if (hap.getEventMap()==null) {
// NOTE: This check should be a reasonable sanity check on the inputs. However, there are edge cases in the assembly engine + SW realignment that can cause
// haplotypes to re-merge into the reference and end up with an empty event map (also, the event map code explicitly expects to fail in some instances),
// so we can't enforce that the haplotypes have no variants.
// if (!hap.isReference()) {
// //throw new RuntimeException("empty event map for haplotype" + hap);
// }
} else {
if (hap.getEventMap().getVariantContexts().stream().anyMatch(v -> v.getStart() == delVariant.getStart()
&& delVariant.getReference().equals(v.getReference())
&& delVariant.getAlternateAllele(0).equals(v.getAlternateAllele(0)))) {
if (argumentCollection.pileupDetectionArgs.debugPileupStdout) System.err.println("Flagging hap " + hap + " for containing variant " + delVariant);
haplotypesWithFilterAlleles.add(hap);
}
}
}
}
}
// TODO removing haplotypes whole-cloth is messy and will likely lead to errors and dropped variants in this code
if (!haplotypesWithFilterAlleles.isEmpty()) {
if (debug) System.out.println("Found Assembly Haps with filtered Variants:\n"+haplotypesWithFilterAlleles.stream().map(Haplotype::toString).collect(Collectors.joining("\n")));

haplotypeDeletionWarningLogger.warn(() -> "Haplotypes from Assembly are being filtered by heuristics from the PileupCaller. This can lead to missing variants. See --"+PileupDetectionArgumentCollection.PILEUP_DETECTION_FILTER_ASSEMBLY_HAPS_THRESHOLD+" for more details");

for (Haplotype hap : haplotypesWithFilterAlleles) {
assemblyResultSet.removeHapltotype(hap);
}
}
if (!pileupAllelesPassingFilters.isEmpty()) {
processPileupAlleles(region, pileupAllelesPassingFilters, argumentCollection.maxMnpDistance, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters(), debug);
}
return assemblyResultSet;
}
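
A hypothetical call-site sketch for the method above; the wrapper, the failsPileupFilters predicate, and the partitioning step are illustrative assumptions, since the real HaplotypeCaller decides upstream which pileup alleles fail the DRAGEN-style filters:

import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
import java.util.stream.Collectors;

// Hypothetical wrapper showing how the two allele lists feed applyPileupEventsAsForcedAlleles.
static AssemblyResultSet forcePileupAlleles(final AssemblyRegion region,
                                            final AssemblyBasedCallerArgumentCollection args,
                                            final SmithWatermanAligner aligner, final Haplotype refHaplotype,
                                            final AssemblyResultSet assemblyResultSet,
                                            final List<VariantContext> pileupAlleles,
                                            final Predicate<VariantContext> failsPileupFilters) { // assumed predicate
    final Map<Boolean, List<VariantContext>> split = pileupAlleles.stream()
            .collect(Collectors.partitioningBy(failsPileupFilters));
    return AssemblyBasedCallerUtils.applyPileupEventsAsForcedAlleles(
            region, args, aligner, refHaplotype, assemblyResultSet,
            split.get(true),    // pileupAllelesFoundShouldFilter
            split.get(false),   // pileupAllelesPassingFilters
            args.pileupDetectionArgs.debugPileupStdout);
}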

/**
* Handle pileup detected alternate alleles.
*/
@VisibleForTesting
@SuppressWarnings("deprecation")
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> givenAlleles, final int maxMnpDistance,
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> pileupVC, final int maxMnpDistance,
final int snpAdjacentToIndelLimit, final SmithWatermanAligner aligner, final Haplotype refHaplotype,
final AssemblyResultSet assemblyResultSet, final int numHaplotypesPerIteration, final int hapFilteringKmerSize,
final SWParameters haplotypeToReferenceSWParameters) {
final SWParameters haplotypeToReferenceSWParameters, final boolean debug) {
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final Map<Integer, VariantContext> assembledVariants = assemblyResultSet.getVariationEvents(maxMnpDistance).stream()
.collect(Collectors.groupingBy(VariantContext::getStart, Collectors.collectingAndThen(Collectors.toList(), AssemblyBasedCallerUtils::makeMergedVariantContext)));
@@ -455,7 +523,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
Map<Kmer, Integer> kmerReadCounts = getKmerReadCounts(region.getHardClippedPileupReads(), hapFilteringKmerSize);

// Remove SNPs that are too close to assembled indels.
final List<VariantContext> givenAllelesFiltered = givenAlleles.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList());
final List<VariantContext> givenAllelesFiltered = pileupVC.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList());

for (final VariantContext givenVC : givenAllelesFiltered) {
final VariantContext assembledVC = assembledVariants.get(givenVC.getStart());
@@ -470,6 +538,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant

final List<Haplotype> newPileupHaplotypes = new ArrayList<>();
for (final Allele givenAllele : unassembledNonSymbolicAlleles) {
if (debug) System.out.println("Processing new Haplotypes for Pileup Allele that was not in the assembly: "+givenVC);
for (final Haplotype baseHaplotype : baseHaplotypes) {
// make sure this allele doesn't collide with a variant on the haplotype
if (baseHaplotype.getEventMap() == null || baseHaplotype.getEventMap().getVariantContexts().stream().noneMatch(vc -> vc.overlaps(givenVC))) {
@@ -490,7 +559,9 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
}
}

baseHaplotypes.addAll(filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize));
Set<Haplotype> refactoredHaps = filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize);
baseHaplotypes.addAll(refactoredHaps);
if (debug) System.out.println("Constructed the following new Pileup Haplotypes after filtering:\n"+refactoredHaps.stream().map(Haplotype::toString).collect(Collectors.joining("\n")));

}
baseHaplotypes.forEach(assemblyResultSet::add);
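
One plausible reading of the kmer-based filtering used just above, as a sketch rather than the actual filterPileupHaplotypes implementation (the Kmer byte[] constructor is an assumption):

import java.util.*;
import java.util.stream.Collectors;

// Sketch: score each candidate haplotype by the read support of its kmers and keep the top N.
static Set<Haplotype> keepBestSupportedHaplotypes(final List<Haplotype> candidates,
                                                  final Map<Kmer, Integer> kmerReadCounts,
                                                  final int numToKeep, final int kmerSize) {
    final Comparator<Haplotype> bySupport = Comparator.comparingLong(hap -> {
        long support = 0;
        final byte[] bases = hap.getBases();
        for (int i = 0; i + kmerSize <= bases.length; i++) {
            // sum the read counts observed for each kmer on the haplotype
            support += kmerReadCounts.getOrDefault(new Kmer(Arrays.copyOfRange(bases, i, i + kmerSize)), 0);
        }
        return support;
    });
    return candidates.stream()
            .sorted(bySupport.reversed())
            .limit(numToKeep)
            .collect(Collectors.toCollection(LinkedHashSet::new));
}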
@@ -556,7 +627,8 @@ private static List<Allele> getAllelesNotPresentInAssembly(VariantContext givenV
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList());
// We must filter out the ref alleles here to protect against edge cases and unexpected behavior from the pileup-calling code
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).filter(a -> !a.isReference()).collect(Collectors.toList());
}
return unassembledGivenAlleles;
}
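
A worked example of the remapping performed above, with illustrative alleles:

// Example: two variant contexts at the same start position.
//   assembled VC: ref = ACT, alt = A     (deletion of CT)
//   given VC:     ref = AC,  alt = A     (deletion of C)
// Both are remapped onto the longer reference ACT: the given alt A gains the
// trailing T and becomes AT. The comparison is then {A} (assembled) vs {AT} (given),
// so AT is returned as a given allele not present in the assembly.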
@@ -856,6 +928,10 @@ public static Map<Allele, List<Haplotype>> createAlleleMapper(final VariantConte

for (final Haplotype h : haplotypes) {

// Partially determined haplotypes know at what position they are determined; only haps determined at this position should be considered for genotyping
if (h.isPartiallyDetermined() && ((PartiallyDeterminedHaplotype) h).getDeterminedPosition() != loc) {
continue;
}
final List<VariantContext> spanningEvents = h.getEventMap().getOverlappingEvents(loc);

if (spanningEvents.isEmpty()) { //no events --> this haplotype supports the reference at this locus