Checkpoint of necessary changes for DRAGEN v3.7.8 concordance #8083

Merged: 9 commits, Mar 21, 2023
@@ -7,6 +7,7 @@ public class AlignmentAndReferenceContext {

private final AlignmentContext alignmentContext;
private final ReferenceContext referenceContext;
private double activityScore = 0.0;

public AlignmentAndReferenceContext(final AlignmentContext alignmentContext,
final ReferenceContext referenceContext) {
@@ -29,4 +30,12 @@ public AlignmentContext getAlignmentContext() {
public ReferenceContext getReferenceContext() {
return referenceContext;
}

public double getActivityScore() {
return activityScore;
}

public void setActivityScore(double activityScore) {
this.activityScore = activityScore;
}
}
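A minimal sketch of the round trip the new field enables, assuming an alignmentContext and referenceContext are already in scope (the score value here is illustrative):

    // Tag a context pair with an activity score computed elsewhere.
    AlignmentAndReferenceContext pair =
            new AlignmentAndReferenceContext(alignmentContext, referenceContext);
    pair.setActivityScore(0.92);            // e.g. from ActivityProfileState#getOriginalActiveProb()
    double score = pair.getActivityScore(); // 0.92; defaults to 0.0 if never set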
@@ -137,14 +137,18 @@ private AssemblyRegion loadNextAssemblyRegion() {
final SimpleInterval pileupInterval = new SimpleInterval(pileup);
final ReferenceContext pileupRefContext = new ReferenceContext(reference, pileupInterval);
final FeatureContext pileupFeatureContext = new FeatureContext(features, pileupInterval);
if (pendingAlignmentData!=null) {
pendingAlignmentData.add(new AlignmentAndReferenceContext(pileup, pileupRefContext));
}


final ActivityProfileState profile = evaluator.isActive(pileup, pileupRefContext, pileupFeatureContext);
logger.debug(() -> profile.toString());
activityProfile.add(profile);

if (pendingAlignmentData!=null) {
AlignmentAndReferenceContext alignmentAndRefContext = new AlignmentAndReferenceContext(pileup, pileupRefContext);
alignmentAndRefContext.setActivityScore(profile.getOriginalActiveProb());
pendingAlignmentData.add(alignmentAndRefContext);
}

// A pending region only becomes ready once our locus iterator has advanced beyond the end of its extended span
// (this ensures that we've loaded all reads that belong in the new region)
if ( ! pendingRegions.isEmpty() && IntervalUtils.isAfter(pileup.getLocation(), pendingRegions.peek().getPaddedSpan(), readHeader.getSequenceDictionary()) ) {
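The net effect of this hunk is a reordering: the pending alignment data is now queued after evaluator.isActive() runs, so the freshly computed original active probability can be attached to each context pair before it is stored. Condensed, the new flow is (names as in the diff above):

    final ActivityProfileState profile = evaluator.isActive(pileup, pileupRefContext, pileupFeatureContext);
    if (pendingAlignmentData != null) {
        // Score is known at this point, so the queued pair carries it along.
        final AlignmentAndReferenceContext pair = new AlignmentAndReferenceContext(pileup, pileupRefContext);
        pair.setActivityScore(profile.getOriginalActiveProb());
        pendingAlignmentData.add(pair);
    }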
@@ -31,8 +31,11 @@
import org.broadinstitute.hellbender.utils.genotyper.SampleList;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter;
import org.broadinstitute.hellbender.utils.haplotype.PartiallyDeterminedHaplotype;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.pileup.PileupBasedAlleles;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.*;
import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner;
@@ -72,6 +75,8 @@ public final class AssemblyBasedCallerUtils {
// get realigned incorrectly. See https://github.com/broadinstitute/gatk/issues/5060
public static final int MINIMUM_READ_LENGTH_AFTER_TRIMMING = 10;

public static final OneShotLogger haplotypeDeletionWarningLogger = new OneShotLogger(AssemblyBasedCallerUtils.class);

// Phase group notation can be interpreted as a representation of the alleles present on the two phased haplotypes at the site:
// "0": REF or '*'; "1": site-specific alt allele
enum PhaseGroup {
@@ -263,8 +268,7 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine(
final boolean handleSoftclips,
final ReadLikelihoodCalculationEngine.Implementation implementation) {
//AlleleLikelihoods::normalizeLikelihoods uses Double.NEGATIVE_INFINITY as a flag to disable capping
final double log10GlobalReadMismappingRate = likelihoodArgs.phredScaledGlobalReadMismappingRate < 0 ? Double.NEGATIVE_INFINITY
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate);
final double log10GlobalReadMismappingRate = getGlobalMismatchingRateFromArgs(likelihoodArgs);

switch ( implementation) {
// TODO these constructors should eventually be matched so they both incorporate all the same ancilliary arguments
@@ -284,6 +288,14 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine(
}
}

/**
* Exposed so that PDHMM can be constructed outside of this class
*/
public static double getGlobalMismatchingRateFromArgs(LikelihoodEngineArgumentCollection likelihoodArgs) {
return likelihoodArgs.phredScaledGlobalReadMismappingRate < 0 ? Double.NEGATIVE_INFINITY
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate);
}
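A hedged sketch of the external use this enables; the PDHMM-side constructor is hypothetical, and only the helper and LikelihoodEngineArgumentCollection appear in this diff:

    // Reuse the conversion when building a likelihood engine outside this class.
    final LikelihoodEngineArgumentCollection likelihoodArgs = new LikelihoodEngineArgumentCollection();
    final double log10GlobalReadMismappingRate =
            AssemblyBasedCallerUtils.getGlobalMismatchingRateFromArgs(likelihoodArgs);
    // ...pass log10GlobalReadMismappingRate into the PDHMM-based engine's constructor (hypothetical caller).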

public static Optional<HaplotypeBAMWriter> createBamWriter(final AssemblyBasedCallerArgumentCollection args,
final boolean createBamOutIndex,
final boolean createBamOutMD5,
@@ -320,7 +332,6 @@ public static VariantContext makeMergedVariantContext(final List<VariantContext>
* for further HC steps
*/
public static AssemblyResultSet assembleReads(final AssemblyRegion region,
final List<VariantContext> forcedPileupAlleles,
final AssemblyBasedCallerArgumentCollection argumentCollection,
final SAMFileHeader header,
final SampleList sampleList,
@@ -397,10 +408,6 @@ public static AssemblyResultSet assembleReads(final AssemblyRegion region,
haplotypeCollapsing);

assemblyResultSet.setHaplotypeCollapsingEngine(haplotypeCollapsing);

if (!forcedPileupAlleles.isEmpty()) {
processPileupAlleles(region, forcedPileupAlleles, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, argumentCollection.maxMnpDistance, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters());
}
assemblyResultSet.setDebug(argumentCollection.assemblerArgs.debugAssembly);
assemblyResultSet.debugDump(logger);
return assemblyResultSet;
@@ -429,15 +436,76 @@ private static int determineFlowAssemblyColapseHmer(SAMFileHeader readsHeader) {
return result;
}

/**
 * Helper method that handles the actual "GGA-like" merging of pileup alleles into an assembly result set.
 *
 * First it filters out haplotypes that contain alleles which failed the pileup-calling filtering steps;
 * then it attempts to inject into the haplotype list artificial haplotypes that carry the detected alleles.
 *
 * @param region
 * @param argumentCollection
 * @param aligner
 * @param refHaplotype
 * @param assemblyResultSet
 * @param pileupAllelesFoundShouldFilter
 * @param pileupAllelesPassingFilters
 * @return
 */
public static AssemblyResultSet applyPileupEventsAsForcedAlleles(final AssemblyRegion region, final AssemblyBasedCallerArgumentCollection argumentCollection,
Collaborator: Does this method have a unit test? Or is there any code path covered by integration tests that exercises it? If it's unused in the pdhmm codepath, can we just remove it? Or do we actually need it for some reason?

Collaborator (Author): It's captured by some HaplotypeCallerIntegrationTest consistent-with-past-results tests. It's the old method for pileup calling that Bhanu added for the bacterial pipeline. I leave it in for legacy reasons, as it's a reasonable alternative to the PDHMM-dependent approach and more usable by users other than DRAGEN-GATK.

final SmithWatermanAligner aligner, final Haplotype refHaplotype,
final AssemblyResultSet assemblyResultSet, final List<VariantContext> pileupAllelesFoundShouldFilter,
final List<VariantContext> pileupAllelesPassingFilters, final boolean debug) {
List<Haplotype> haplotypesWithFilterAlleles = new ArrayList<>();
// IF we find pileup alleles that we want to filter... AND we are not generating the partially determined haplotypes,
// we resort to a messy approach where we filter alleles by throwing away every haplotype supporting an allele. This is
// very dangerous since this could easily destroy phased variants with the haplotype.
if (!pileupAllelesFoundShouldFilter.isEmpty() && !argumentCollection.pileupDetectionArgs.generatePDHaplotypes) {
Collaborator (Author): This logic is messy and dangerous. There should be an explicit opt-in, and probably a user warning printed, if we are actually throwing away haplotypes here. The non-pdhmm version of this code should really be considered an off-label approach; it wasn't extensively tested except as a stepping stone to the full PDHmm implementation, so maybe we should re-evaluate whether we want this at all.

// TODO this is a bad algorithm for bad people
Collaborator: But is it faster than the Java version of the pdhmm? Does it do effectively the same thing?

Collaborator (Author): It's slow, but faster than the PDHMM by a long shot, because currently the JavaPDHMM is very slow.

for(VariantContext delVariant : pileupAllelesFoundShouldFilter) {
for (Haplotype hap : assemblyResultSet.getHaplotypeList()) {
if (hap.getEventMap()==null) {
// NOTE: This check should be a reasonable sanity check on the inputs. However, there are edge cases in the assembly engine + SW realignment that can cause
// haplotypes to re-merge into the reference and end up with an empty event map. (Also the event map code explicitly expects to fail in some instances,)
// thus we can't enforce that the haplotypes have no variants.
// if (!hap.isReference()) {
// //throw new RuntimeException("empty event map for haplotype" + hap);
// }
} else {
if (hap.getEventMap().getVariantContexts().stream().anyMatch(v -> v.getStart() == delVariant.getStart()
&& delVariant.getReference().equals(v.getReference())
&& delVariant.getAlternateAllele(0).equals(v.getAlternateAllele(0)))) {
if (argumentCollection.pileupDetectionArgs.debugPileupStdout) System.err.println("Flagging hap " + hap + " for containing variant " + delVariant);
haplotypesWithFilterAlleles.add(hap);
}
}
}
}
}
// TODO removing haplotypes whole-cloth is messy and will likely lead to errors and dropped variants in this code
if (!haplotypesWithFilterAlleles.isEmpty()) {
if (debug) System.out.println("Found Assembly Haps with filtered Variants:\n"+haplotypesWithFilterAlleles.stream().map(Haplotype::toString).collect(Collectors.joining("\n")));
Collaborator: Why not use logger.debug() for this sort of thing instead of passing debug booleans around?

Collaborator (Author): Logger.debug is polluted with a bunch of other garbage that I don't want to see when I'm debugging these things... I agree this is inelegant.


haplotypeDeletionWarningLogger.warn(() -> "Haplotypes from Assembly are being filtered by heuristics from the PileupCaller. This can lead to missing variants. See --"+PileupDetectionArgumentCollection.PILEUP_DETECTION_FILTER_ASSEMBLY_HAPS_THRESHOLD+" for more details");

for (Haplotype hap : haplotypesWithFilterAlleles) {
assemblyResultSet.removeHapltotype(hap);
}
}
if (!pileupAllelesPassingFilters.isEmpty()) {
processPileupAlleles(region, pileupAllelesPassingFilters, argumentCollection.maxMnpDistance, argumentCollection.pileupDetectionArgs.snpAdajacentToAssemblyIndel, aligner, refHaplotype, assemblyResultSet, argumentCollection.pileupDetectionArgs.numHaplotypesToIterate, argumentCollection.pileupDetectionArgs.filteringKmerSize, argumentCollection.getHaplotypeToReferenceSWParameters(), debug);
}
return assemblyResultSet;
}
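A sketch of the calling contract, splitting pileup alleles into the two lists the method expects; the predicate and surrounding variables are illustrative, not from this diff:

    // Split pileup-detected alleles into those flagged for removal and those to force in.
    final List<VariantContext> shouldFilter = pileupCalledAlleles.stream()
            .filter(vc -> failsPileupFilters(vc))              // hypothetical filter predicate
            .collect(Collectors.toList());
    final List<VariantContext> passing = pileupCalledAlleles.stream()
            .filter(vc -> !failsPileupFilters(vc))
            .collect(Collectors.toList());
    AssemblyBasedCallerUtils.applyPileupEventsAsForcedAlleles(
            region, argumentCollection, aligner, refHaplotype, assemblyResultSet,
            shouldFilter, passing, /*debug*/ false);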

/**
* Handle pileup detected alternate alleles.
*/
@VisibleForTesting
@SuppressWarnings("deprecation")
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> givenAlleles, final int maxMnpDistance,
static void processPileupAlleles(final AssemblyRegion region, final List<VariantContext> pileupVC, final int maxMnpDistance,
final int snpAdjacentToIndelLimit, final SmithWatermanAligner aligner, final Haplotype refHaplotype,
final AssemblyResultSet assemblyResultSet, final int numHaplotypesPerIteration, final int hapFilteringKmerSize,
final SWParameters haplotypeToReferenceSWParameters) {
final SWParameters haplotypeToReferenceSWParameters, final boolean debug) {
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final Map<Integer, VariantContext> assembledVariants = assemblyResultSet.getVariationEvents(maxMnpDistance).stream()
.collect(Collectors.groupingBy(VariantContext::getStart, Collectors.collectingAndThen(Collectors.toList(), AssemblyBasedCallerUtils::makeMergedVariantContext)));
@@ -455,7 +523,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
Map<Kmer, Integer> kmerReadCounts = getKmerReadCounts(region.getHardClippedPileupReads(), hapFilteringKmerSize);

// Remove SNPs that are too close to assembled indels.
final List<VariantContext> givenAllelesFiltered = givenAlleles.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList());
final List<VariantContext> givenAllelesFiltered = pileupVC.stream().filter(vc -> vc.isIndel() || assembledIndels.stream().noneMatch(indel -> vc.withinDistanceOf(indel, snpAdjacentToIndelLimit))).collect(Collectors.toList());

for (final VariantContext givenVC : givenAllelesFiltered) {
final VariantContext assembledVC = assembledVariants.get(givenVC.getStart());
Expand All @@ -470,6 +538,7 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant

final List<Haplotype> newPileupHaplotypes = new ArrayList<>();
for (final Allele givenAllele : unassembledNonSymbolicAlleles) {
if (debug) System.out.println("Processing new Haplotypes for Pileup Allele that was not in the assembly: "+givenVC);
Collaborator: logger.debug()?

for (final Haplotype baseHaplotype : baseHaplotypes) {
// make sure this allele doesn't collide with a variant on the haplotype
if (baseHaplotype.getEventMap() == null || baseHaplotype.getEventMap().getVariantContexts().stream().noneMatch(vc -> vc.overlaps(givenVC))) {
Expand All @@ -490,7 +559,9 @@ static void processPileupAlleles(final AssemblyRegion region, final List<Variant
}
}

baseHaplotypes.addAll(filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize));
Set<Haplotype> refactoredHaps = filterPileupHaplotypes(newPileupHaplotypes, kmerReadCounts, numHaplotypesPerIteration, hapFilteringKmerSize);
baseHaplotypes.addAll(refactoredHaps);
if (debug) System.out.println("Constructed the following new Pileup Haplotypes after filtering:\n"+refactoredHaps.stream().map(Haplotype::toString).collect(Collectors.joining("\n")));

}
baseHaplotypes.forEach(assemblyResultSet::add);
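The kmer-based filtering above (getKmerReadCounts plus filterPileupHaplotypes) amounts to ranking candidate haplotypes by how well their kmers are supported by the reads and keeping only the top few per iteration. A generic, self-contained sketch of that idea, with all names illustrative rather than taken from GATK:

    import java.util.*;
    import java.util.stream.Collectors;

    final class KmerSupportFilter {
        // Keep the numToKeep candidates whose kmers are best supported by read kmer counts.
        static List<String> filterBySupport(List<String> candidates, Map<String, Integer> kmerReadCounts,
                                            int numToKeep, int kmerSize) {
            return candidates.stream()
                    .sorted(Comparator.comparingLong((String hap) -> -kmerSupport(hap, kmerReadCounts, kmerSize)))
                    .limit(numToKeep)
                    .collect(Collectors.toList());
        }

        // Sum the read-derived counts of every kmer appearing in the haplotype sequence.
        static long kmerSupport(String hap, Map<String, Integer> kmerReadCounts, int kmerSize) {
            long support = 0;
            for (int i = 0; i + kmerSize <= hap.length(); i++) {
                support += kmerReadCounts.getOrDefault(hap.substring(i, i + kmerSize), 0);
            }
            return support;
        }
    }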
@@ -556,7 +627,8 @@ private static List<Allele> getAllelesNotPresentInAssembly(VariantContext givenV
ReferenceConfidenceVariantContextMerger.remapAlleles(assembledVC, longerRef));
final Set<Allele> givenAlleleSet = new HashSet<>(longerRef.length() == givenVCRefLength ? givenVC.getAlternateAlleles() :
ReferenceConfidenceVariantContextMerger.remapAlleles(givenVC, longerRef));
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).collect(Collectors.toList());
// We must filter out the ref alleles here to protect against edge cases and unexpected behavior from the pileup-calling code
unassembledGivenAlleles = givenAlleleSet.stream().filter(a -> !assembledAlleleSet.contains(a)).filter(a -> !a.isReference()).collect(Collectors.toList());
}
return unassembledGivenAlleles;
}
@@ -856,6 +928,10 @@ public static Map<Allele, List<Haplotype>> createAlleleMapper(final VariantConte

for (final Haplotype h : haplotypes) {

// Partially determined haplotypes know the position at which they are determined; only haplotypes determined at this position should be considered for genotyping
if (h.isPartiallyDetermined() && ((PartiallyDeterminedHaplotype) h).getDeterminedPosition() != loc) {
continue;
}
final List<VariantContext> spanningEvents = h.getEventMap().getOverlappingEvents(loc);

if (spanningEvents.isEmpty()) { //no events --> this haplotype supports the reference at this locus