Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make M2 haplotype and clustered events filters smarter about germline events #8717

Merged
merged 13 commits into from
Mar 25, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.collections4.ListUtils;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
Expand Down Expand Up @@ -113,12 +114,14 @@ public CalledHaplotypes callMutations(
}
final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods = logReadLikelihoods.groupEvidence(MTAC.independentMates ? read -> read : GATKRead::getName, Fragment::createAndAvoidFailure);

final Set<Event> somaticEventsInRegion = new HashSet<>();
for( final int loc : eventStarts ) {
final List<VariantContext> eventsAtThisLoc = AssemblyBasedCallerUtils.getVariantsFromActiveHaplotypes(loc, haplotypes, false);
VariantContext mergedVC = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
if( mergedVC == null ) {
VariantContext merged = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
davidbenjamin marked this conversation as resolved.
Show resolved Hide resolved
if( merged == null ) {
continue;
}
final VariantContext mergedVC = emitRefConf ? ReferenceConfidenceUtils.addNonRefSymbolicAllele(merged) : merged;

// converting haplotype likelihoods to allele likelihoods
final Map<Allele, List<Haplotype>> alleleMapper = AssemblyBasedCallerUtils.createAlleleMapper(mergedVC, loc, haplotypes, true);
Expand All @@ -127,7 +130,6 @@ public CalledHaplotypes callMutations(
logLikelihoods.retainEvidence(variantCallingRelevantFragmentOverlap::overlaps);

if (emitRefConf) {
mergedVC = ReferenceConfidenceUtils.addNonRefSymbolicAllele(mergedVC);
logLikelihoods.addNonReferenceAllele(Allele.NON_REF_ALLELE);
}
final List<LikelihoodMatrix<Fragment, Allele>> tumorMatrices = IntStream.range(0, logLikelihoods.numberOfSamples())
Expand All @@ -152,13 +154,21 @@ public CalledHaplotypes callMutations(
.filter(allele -> forcedAlleles.contains(allele) || tumorLogOdds.getAlt(allele) > MTAC.getEmissionLogOdds())
.collect(Collectors.toList());

final long somaticAltCount = tumorAltAlleles.stream()
final List<Allele> allelesToGenotype = tumorAltAlleles.stream()
.filter(allele -> forcedAlleles.contains(allele) || !hasNormal || MTAC.genotypeGermlineSites || normalLogOdds.getAlt(allele) > MathUtils.log10ToLog(MTAC.normalLog10Odds))
.count();
.toList();

// record somatic alleles for later use in the Event Count annotation
// note that in tumor-only calling it does not attempt to detect germline events
mergedVC.getAlternateAlleles().stream()
.filter(allele -> tumorLogOdds.getAlt(allele) > MTAC.getEmissionLogOdds())
.filter(allele -> !hasNormal || normalLogOdds.getAlt(allele) > MathUtils.log10ToLog(MTAC.normalLog10Odds))
.map(allele -> new Event(mergedVC.getContig(), mergedVC.getStart(), mergedVC.getReference(), allele))
.forEach(somaticEventsInRegion::add);

// if every alt allele is germline, skip this variant. However, if some alt alleles are germline and others
// are not we emit them all so that the filtering engine can see them
if (somaticAltCount == 0) {
if (allelesToGenotype.isEmpty()) {
continue;
}

Expand Down Expand Up @@ -222,8 +232,39 @@ public CalledHaplotypes callMutations(

final List<VariantContext> outputCalls = AssemblyBasedCallerUtils.phaseCalls(returnCalls, calledHaplotypes);
final int eventCount = outputCalls.size();

// calculate the number of somatic events in the best haplotype of each called variant
final Map<Haplotype, MutableInt> haplotypeSupportCounts = logReadLikelihoods.alleles().stream()
.collect(Collectors.toMap(hap -> hap, label -> new MutableInt(0)));
logReadLikelihoods.bestAllelesBreakingTies()
.forEach(bestHaplotype -> haplotypeSupportCounts.get(bestHaplotype.allele).increment());

final Map<Event, List<Haplotype>> haplotypesByEvent= new HashMap<>();
for (final Haplotype haplotype : logReadLikelihoods.alleles()) {
for (final Event event : haplotype.getEventMap().getEvents()) {
haplotypesByEvent.computeIfAbsent(event, e -> new ArrayList<>()).add(haplotype);
}
}
final Map<VariantContext, List<Integer>> eventCountAnnotations = new HashMap<>();
for (final VariantContext outputCall : outputCalls) {
for (final Allele allele : outputCall.getAlternateAlleles()) {
// note: this creates the minimal representation behind the scenes
final Event event = new Event(outputCall.getContig(), outputCall.getStart(), outputCall.getReference(), allele);
// haplotypesByEvent contains every *assembled* event, including events injected into the original assembly,
// but there are some modes where we genotype events that were never in an assembly graph, in which case
// this annotation is irrelevant
if (haplotypesByEvent.containsKey(event)) {
final Haplotype bestHaplotype = haplotypesByEvent.get(event).stream()
.sorted(Comparator.comparingInt(h -> haplotypeSupportCounts.getOrDefault(h, new MutableInt(0)).intValue()).reversed())
.findFirst().get();

eventCountAnnotations.computeIfAbsent(outputCall, vc -> new ArrayList<>())
.add(bestHaplotype.getEventMap().getNumberOfEvents());
}
}
}
final List<VariantContext> outputCallsWithEventCountAnnotation = outputCalls.stream()
.map(vc -> new VariantContextBuilder(vc).attribute(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount).make())
.map(vc -> new VariantContextBuilder(vc).attribute(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCountAnnotations.get(vc)).make())
.collect(Collectors.toList());
return new CalledHaplotypes(outputCallsWithEventCountAnnotation, calledHaplotypes);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ public ClusteredEventsFilter(final int maxEventsInRegion) {

@Override
public boolean isArtifact(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
    // EVENT_COUNT_IN_HAPLOTYPE_KEY is a per-alt-allele list: the number of somatic events on the
    // best supporting haplotype for each alt allele (see the VCF header line for this key).
    final List<Integer> eventCounts = vc.getAttributeAsIntList(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 0);
    // The call is an artifact if any alt allele's best haplotype carries more events than allowed.
    // orElse(0) guards against a missing or empty annotation, where a bare getAsInt() would throw
    // NoSuchElementException; an absent annotation should simply not trigger the filter.
    return eventCounts.stream().mapToInt(n -> n).max().orElse(0) > maxEventsInRegion;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ public double calculateErrorProbability(final VariantContext vc, final Mutect2Fi
protected void accumulateDataForLearning(final VariantContext vc, final ErrorProbabilities errorProbabilities, final Mutect2FilteringEngine filteringEngine) {
// we record the maximum non-sequencing artifact that is not this filter itself
davidbenjamin marked this conversation as resolved.
Show resolved Hide resolved
final double artifactProbability = errorProbabilities.getProbabilitiesByFilter().entrySet().stream()
.filter(e -> e.getKey().errorType() != ErrorType.SEQUENCING)
.filter(e -> e.getKey().errorType() == ErrorType.ARTIFACT)
.filter(e -> !e.getKey().filterName().equals(filterName()))
.flatMap(e -> e.getValue().stream()) // the value is a list of double, we need the max of all the lists
.max(Double::compareTo).orElse(0.0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
addInfoLine(new VCFInfoHeaderLine(TREE_SCORE, 1, VCFHeaderLineType.Float, "Score from single sample filtering with random forest model."));

// M2-related info lines
addInfoLine(new VCFInfoHeaderLine(EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.Integer, "Number of events in this haplotype"));
addInfoLine(new VCFInfoHeaderLine(EVENT_COUNT_IN_HAPLOTYPE_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of somatic events in best supporting haplotype for each alt allele"));
addInfoLine(new VCFInfoHeaderLine(NORMAL_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Normal log 10 likelihood ratio of diploid het or hom alt genotypes"));
addInfoLine(new VCFInfoHeaderLine(TUMOR_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Log 10 likelihood ratio score of variant existing versus not existing"));
addFormatLine(new VCFFormatHeaderLine(TUMOR_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Log 10 likelihood ratio score of variant existing versus not existing"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFUtils;
import org.apache.commons.collections4.SetUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.CommandLineProgramTest;
Expand Down Expand Up @@ -528,7 +529,7 @@ public void testContaminationFilter() {
filteredVariants.get(10).stream().filter(vc -> vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME)).count());

final List<VariantContext> missedObviousVariantsAtTenPercent = filteredVariants.get(10).stream()
.filter(vc -> !vc.getFilters().contains(GATKVCFConstants.CONTAMINATION_FILTER_NAME))
.filter(vc -> !vc.isFiltered())
.filter(VariantContext::isBiallelic)
.filter(vc -> {
final int[] AD = vc.getGenotype(0).getAD();
Expand Down Expand Up @@ -1088,7 +1089,18 @@ public void testExposureOfSmithWatermanParametersIsConsistentWithPastResults(fin
};

runCommandLine(args);
IntegrationTestSpec.assertEqualTextFiles(output, expected);

// This used to be an exact text match, which caused hours of aggravation when the test passed locally and
// failed on the cloud.
final double[] outputTlods = VariantContextTestUtils.streamVcf(output)
.flatMap(vc -> vc.getAttributeAsDoubleList(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, 0).stream())
.mapToDouble(x -> x).toArray();

final double[] expectedTlods = VariantContextTestUtils.streamVcf(expected)
.flatMap(vc -> vc.getAttributeAsDoubleList(GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, 0).stream())
.mapToDouble(x -> x).toArray();

Assert.assertEquals(outputTlods, expectedTlods);
davidbenjamin marked this conversation as resolved.
Show resolved Hide resolved
}

@SafeVarargs
Expand Down
Loading
Loading