Make M2 haplotype and clustered events filters smarter about germline events (#8717)

* M2 bad haplotype filter does not filter variants that share a haplotype with a germline event

* two ECNT annotations -- haplotype and region -- and the clustered events filter looks at both
davidbenjamin committed Mar 25, 2024
1 parent 47a97ae commit 105b63e
Showing 11 changed files with 243 additions and 142 deletions.
@@ -439,7 +439,8 @@ protected static void removeStaleAttributesAfterMerge(final Map<String, Object>
attributes.remove(GATKVCFConstants.MLE_ALLELE_COUNT_KEY);
attributes.remove(GATKVCFConstants.MLE_ALLELE_FREQUENCY_KEY);
attributes.remove(VCFConstants.END_KEY);
-attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); //median doesn't make sense here so drop it; used for ClusteredEventFilter, which doesn't apply to MT
+attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY);
+attributes.remove(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY); //median doesn't make sense here so drop it; used for ClusteredEventFilter, which doesn't apply to MT
}

/**
@@ -72,7 +72,7 @@
public final class Mutect2Engine implements AssemblyRegionEvaluator, AutoCloseable {

private static final List<String> STANDARD_MUTECT_INFO_FIELDS = Arrays.asList(GATKVCFConstants.NORMAL_LOG_10_ODDS_KEY, GATKVCFConstants.TUMOR_LOG_10_ODDS_KEY, GATKVCFConstants.NORMAL_ARTIFACT_LOG_10_ODDS_KEY,
-GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, GATKVCFConstants.IN_PON_KEY, GATKVCFConstants.POPULATION_AF_KEY,
+GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, GATKVCFConstants.IN_PON_KEY, GATKVCFConstants.POPULATION_AF_KEY,
GATKVCFConstants.GERMLINE_QUAL_KEY, GATKVCFConstants.CONTAMINATION_QUAL_KEY, GATKVCFConstants.SEQUENCING_QUAL_KEY,
GATKVCFConstants.POLYMERASE_SLIPPAGE_QUAL_KEY, GATKVCFConstants.READ_ORIENTATION_QUAL_KEY,
GATKVCFConstants.STRAND_QUAL_KEY, GATKVCFConstants.ORIGINAL_CONTIG_MISMATCH_KEY, GATKVCFConstants.N_COUNT_KEY, GATKVCFConstants.AS_UNIQUE_ALT_READ_SET_COUNT_KEY);
@@ -8,6 +8,7 @@
import htsjdk.variant.variantcontext.*;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.collections4.ListUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
@@ -113,12 +114,14 @@ public CalledHaplotypes callMutations(
}
final AlleleLikelihoods<Fragment, Haplotype> logFragmentLikelihoods = logReadLikelihoods.groupEvidence(MTAC.independentMates ? read -> read : GATKRead::getName, Fragment::createAndAvoidFailure);

+final Set<Event> potentialSomaticEventsInRegion = new HashSet<>();
for( final int loc : eventStarts ) {
final List<VariantContext> eventsAtThisLoc = AssemblyBasedCallerUtils.getVariantsFromActiveHaplotypes(loc, haplotypes, false);
-VariantContext mergedVC = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
-if( mergedVC == null ) {
+VariantContext merged = AssemblyBasedCallerUtils.makeMergedVariantContext(eventsAtThisLoc);
+if( merged == null ) {
continue;
}
+final VariantContext mergedVC = emitRefConf ? ReferenceConfidenceUtils.addNonRefSymbolicAllele(merged) : merged;

// converting haplotype likelihoods to allele likelihoods
final Map<Allele, List<Haplotype>> alleleMapper = AssemblyBasedCallerUtils.createAlleleMapper(mergedVC, loc, haplotypes, true);
@@ -127,7 +130,6 @@ public CalledHaplotypes callMutations(
logLikelihoods.retainEvidence(variantCallingRelevantFragmentOverlap::overlaps);

if (emitRefConf) {
-mergedVC = ReferenceConfidenceUtils.addNonRefSymbolicAllele(mergedVC);
logLikelihoods.addNonReferenceAllele(Allele.NON_REF_ALLELE);
}
final List<LikelihoodMatrix<Fragment, Allele>> tumorMatrices = IntStream.range(0, logLikelihoods.numberOfSamples())
@@ -152,13 +154,21 @@ public CalledHaplotypes callMutations(
.filter(allele -> forcedAlleles.contains(allele) || tumorLogOdds.getAlt(allele) > MTAC.getEmissionLogOdds())
.collect(Collectors.toList());

-final long somaticAltCount = tumorAltAlleles.stream()
+final List<Allele> allelesToGenotype = tumorAltAlleles.stream()
.filter(allele -> forcedAlleles.contains(allele) || !hasNormal || MTAC.genotypeGermlineSites || normalLogOdds.getAlt(allele) > MathUtils.log10ToLog(MTAC.normalLog10Odds))
-.count();
+.toList();

+// record somatic alleles for later use in the Event Count annotation
+// note that in tumor-only calling it does not attempt to detect germline events
+mergedVC.getAlternateAlleles().stream()
+.filter(allele -> tumorLogOdds.getAlt(allele) > MTAC.getEmissionLogOdds())
+.filter(allele -> !hasNormal || normalLogOdds.getAlt(allele) > MathUtils.log10ToLog(MTAC.normalLog10Odds))
+.map(allele -> new Event(mergedVC.getContig(), mergedVC.getStart(), mergedVC.getReference(), allele))
+.forEach(potentialSomaticEventsInRegion::add);

// if every alt allele is germline, skip this variant. However, if some alt alleles are germline and others
// are not we emit them all so that the filtering engine can see them
-if (somaticAltCount == 0) {
+if (allelesToGenotype.isEmpty()) {
continue;
}

@@ -222,8 +232,41 @@ public CalledHaplotypes callMutations(

final List<VariantContext> outputCalls = AssemblyBasedCallerUtils.phaseCalls(returnCalls, calledHaplotypes);
final int eventCount = outputCalls.size();

+// calculate the number of somatic events in the best haplotype of each called variant
+final Map<Haplotype, MutableInt> haplotypeSupportCounts = logReadLikelihoods.alleles().stream()
+.collect(Collectors.toMap(hap -> hap, label -> new MutableInt(0)));
+logReadLikelihoods.bestAllelesBreakingTies()
+.forEach(bestHaplotype -> haplotypeSupportCounts.get(bestHaplotype.allele).increment());

+final Map<Event, List<Haplotype>> haplotypesByEvent= new HashMap<>();
+for (final Haplotype haplotype : logReadLikelihoods.alleles()) {
+for (final Event event : haplotype.getEventMap().getEvents()) {
+haplotypesByEvent.computeIfAbsent(event, e -> new ArrayList<>()).add(haplotype);
+}
+}
+final Map<VariantContext, List<Integer>> eventCountAnnotations = new HashMap<>();
+for (final VariantContext outputCall : outputCalls) {
+for (final Allele allele : outputCall.getAlternateAlleles()) {
+// note: this creates the minimal representation behind the scenes
+final Event event = new Event(outputCall.getContig(), outputCall.getStart(), outputCall.getReference(), allele);
+// haplotypesByEvent contains every *assembled* event, including events injected into the original assembly,
+// but there are some modes where we genotype events that were never in an assembly graph, in which case
+// this annotation is irrelevant
+if (haplotypesByEvent.containsKey(event)) {
+final Haplotype bestHaplotype = haplotypesByEvent.get(event).stream()
+.sorted(Comparator.comparingInt(h -> haplotypeSupportCounts.getOrDefault(h, new MutableInt(0)).intValue()).reversed())
+.findFirst().get();

+eventCountAnnotations.computeIfAbsent(outputCall, vc -> new ArrayList<>())
+.add((int) bestHaplotype.getEventMap().getEvents().stream().filter(potentialSomaticEventsInRegion::contains).count());
+}
+}
+}
final List<VariantContext> outputCallsWithEventCountAnnotation = outputCalls.stream()
-.map(vc -> new VariantContextBuilder(vc).attribute(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCount).make())
+.map(vc -> new VariantContextBuilder(vc)
+.attribute(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, eventCountAnnotations.get(vc))
+.attribute(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, potentialSomaticEventsInRegion.size()).make())
.collect(Collectors.toList());
return new CalledHaplotypes(outputCallsWithEventCountAnnotation, calledHaplotypes);
}
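The added block above is the heart of the new per-allele annotation: reads vote for their best haplotype, each assembled event is mapped to the haplotypes that carry it, and every called alt allele is annotated with the number of potential somatic events on its best-supported haplotype, so nearby germline events no longer inflate the count. A simplified, self-contained sketch of that counting scheme, with toy types and data standing in for GATK's Haplotype, Event, and read-likelihood classes (everything named here is illustrative only):

    import java.util.*;
    import java.util.stream.*;

    public class BestHaplotypeEventCountSketch {
        // toy haplotype: an id plus the set of events (variants) it carries
        record Haplotype(String id, Set<String> events) {}

        public static void main(String[] args) {
            final Haplotype h1 = new Haplotype("hap1", Set.of("chr1:100A>T", "chr1:150C>G"));
            final Haplotype h2 = new Haplotype("hap2", Set.of("chr1:100A>T"));

            // each read votes for its best haplotype (GATK gets this from bestAllelesBreakingTies)
            final List<Haplotype> bestHaplotypePerRead = List.of(h1, h1, h2, h1);
            final Map<Haplotype, Long> support = bestHaplotypePerRead.stream()
                    .collect(Collectors.groupingBy(h -> h, Collectors.counting()));

            // events judged potentially somatic earlier in the region; the germline chr1:150C>G is absent
            final Set<String> potentialSomaticEvents = Set.of("chr1:100A>T");

            // for a called event, pick its best-supported haplotype and count the somatic events it carries
            final String calledEvent = "chr1:100A>T";
            final Haplotype bestHaplotype = Stream.of(h1, h2)
                    .filter(h -> h.events().contains(calledEvent))
                    .max(Comparator.comparingLong((Haplotype h) -> support.getOrDefault(h, 0L)))
                    .orElseThrow();
            final long ecnth = bestHaplotype.events().stream().filter(potentialSomaticEvents::contains).count();
            System.out.println("ECNTH for " + calledEvent + " = " + ecnth);  // 1 -- the germline event is not counted
        }
    }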
@@ -8,18 +8,21 @@

public class ClusteredEventsFilter extends HardFilter {
private final int maxEventsInRegion;
+private final int maxEventsInHaplotype;

-public ClusteredEventsFilter(final int maxEventsInRegion) {
+public ClusteredEventsFilter(final int maxEventsInRegion, final int maxEventsInHaplotype) {
this.maxEventsInRegion = maxEventsInRegion;
+this.maxEventsInHaplotype = maxEventsInHaplotype;
}

@Override
public ErrorType errorType() { return ErrorType.ARTIFACT; }

@Override
public boolean isArtifact(final VariantContext vc, final Mutect2FilteringEngine filteringEngine) {
-final Integer eventCount = vc.getAttributeAsInt(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, -1);
-return eventCount > maxEventsInRegion;
+final List<Integer> haplotypeEventCounts = vc.getAttributeAsIntList(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY, 0);
+final int regionEventCounts = vc.getAttributeAsInt(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, 0);
+return haplotypeEventCounts.stream().mapToInt(n -> n).max().getAsInt() > maxEventsInHaplotype || regionEventCounts > maxEventsInRegion;
}

@Override
@@ -28,5 +31,5 @@ public String filterName() {
}

@Override
-protected List<String> requiredInfoAnnotations() { return Collections.singletonList(GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); }
+protected List<String> requiredInfoAnnotations() { return List.of(GATKVCFConstants.EVENT_COUNT_IN_REGION_KEY, GATKVCFConstants.EVENT_COUNT_IN_HAPLOTYPE_KEY); }
}
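The filter now reads both annotations -- the per-allele ECNTH values and the per-site ECNT -- and trips if either exceeds its threshold (the defaults further down are 2 events per haplotype and 3 per region). A minimal sketch of that decision on plain values, without a VariantContext; the helper below is a hypothetical stand-in, not GATK API:

    import java.util.List;

    public class ClusteredEventsDecisionSketch {
        // hypothetical stand-in for ClusteredEventsFilter.isArtifact, taking raw annotation values
        static boolean isArtifact(final List<Integer> ecnthPerAltAllele, final int ecnt,
                                  final int maxEventsInHaplotype, final int maxEventsInRegion) {
            final int worstHaplotype = ecnthPerAltAllele.stream().mapToInt(Integer::intValue).max().orElse(0);
            return worstHaplotype > maxEventsInHaplotype || ecnt > maxEventsInRegion;
        }

        public static void main(String[] args) {
            System.out.println(isArtifact(List.of(1), 2, 2, 3));  // false: one somatic event on its haplotype, two in the region
            System.out.println(isArtifact(List.of(3), 2, 2, 3));  // true: the allele's best haplotype carries three somatic events
            System.out.println(isArtifact(List.of(1), 4, 2, 3));  // true: four somatic events clustered in the assembly region
        }
    }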
@@ -13,6 +13,7 @@
import java.util.*;

public class FilteredHaplotypeFilter extends Mutect2VariantFilter {
+private static final double GERMLINE_PROBABILITY_TO_IGNORE_NORMAL_ARTIFACT = 0.25;
private final double maxIntraHaplotypeDistance;

// for each pgt + pid phasing string, a list of loci-error probability pairs
@@ -54,10 +55,21 @@ public double calculateErrorProbability(final VariantContext vc, final Mutect2Fi

@Override
protected void accumulateDataForLearning(final VariantContext vc, final ErrorProbabilities errorProbabilities, final Mutect2FilteringEngine filteringEngine) {
-// we record the maximum non-sequencing artifact that is not this filter itself
-final double artifactProbability = errorProbabilities.getProbabilitiesByFilter().entrySet().stream()
-.filter(e -> e.getKey().errorType() != ErrorType.SEQUENCING)
-.filter(e -> !e.getKey().filterName().equals(filterName()))
+// we record the maximum non-sequencing, non-germline, artifact probability that is not from this filter itself
+final Map<Mutect2Filter, List<Double>> probabilitiesByFilter = errorProbabilities.getProbabilitiesByFilter();

+final double germlineProbability = probabilitiesByFilter.entrySet().stream()
+.filter(e -> e.getKey().filterName() == GATKVCFConstants.GERMLINE_RISK_FILTER_NAME)
+.flatMap(e -> e.getValue().stream()) // the value is a list of double, we need the max of all the lists
+.max(Double::compareTo).orElse(0.0);

+// the normal artifact filter often lights up when there's a non-artifactual germline event, which we don't want here
+final boolean ignoreNormalArtifact = germlineProbability > GERMLINE_PROBABILITY_TO_IGNORE_NORMAL_ARTIFACT;

+final double artifactProbability = probabilitiesByFilter.entrySet().stream()
+.filter(e -> e.getKey().errorType() != ErrorType.NON_SOMATIC)
+.filter(e -> !(ignoreNormalArtifact && e.getKey().filterName() == GATKVCFConstants.ARTIFACT_IN_NORMAL_FILTER_NAME))
+.filter(e -> !e.getKey().filterName().equals(filterName())) // exclude the haplotype filter itself, which would be circular
.flatMap(e -> e.getValue().stream()) // the value is a list of double, we need the max of all the lists
.max(Double::compareTo).orElse(0.0);
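The learning step now discards germline-type probabilities outright and, once the germline probability exceeds 0.25, also ignores the normal-artifact filter, since a genuine germline event routinely triggers that filter without implying a bad haplotype. A rough sketch of the gating with filter names as plain strings; the names and map layout below are illustrative, not the Mutect2Filter API:

    import java.util.List;
    import java.util.Map;

    public class HaplotypeFilterGatingSketch {
        static final double GERMLINE_PROB_TO_IGNORE_NORMAL_ARTIFACT = 0.25;

        // largest artifact probability from the other filters, after the germline-aware gating
        static double maxRelevantArtifactProbability(final Map<String, List<Double>> probsByFilter) {
            final double germlineProb = probsByFilter.getOrDefault("germline", List.of(0.0)).stream()
                    .mapToDouble(Double::doubleValue).max().orElse(0.0);
            final boolean ignoreNormalArtifact = germlineProb > GERMLINE_PROB_TO_IGNORE_NORMAL_ARTIFACT;

            return probsByFilter.entrySet().stream()
                    .filter(e -> !e.getKey().equals("germline"))        // germline risk is not an artifact
                    .filter(e -> !(ignoreNormalArtifact && e.getKey().equals("normal_artifact")))
                    .filter(e -> !e.getKey().equals("haplotype"))       // never learn from this filter itself
                    .flatMap(e -> e.getValue().stream())
                    .mapToDouble(Double::doubleValue).max().orElse(0.0);
        }

        public static void main(String[] args) {
            // probable germline site: the normal-artifact signal is ignored, leaving only a weak artifact probability
            final Map<String, List<Double>> probs = Map.of(
                    "germline", List.of(0.9),
                    "normal_artifact", List.of(0.8),
                    "strand_bias", List.of(0.1));
            System.out.println(maxRelevantArtifactProbability(probs));  // 0.1
        }
    }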

@@ -55,6 +55,7 @@ public class M2FiltersArgumentCollection {
* Hard filter thresholds
*/
public static final String MAX_EVENTS_IN_REGION_LONG_NAME = "max-events-in-region";
+public static final String MAX_EVENTS_IN_HAPLOTYPE_LONG_NAME = "max-events-in-haplotype";
public static final String MAX_ALT_ALLELE_COUNT_LONG_NAME = "max-alt-allele-count";
public static final String UNIQUE_ALT_READ_COUNT_LONG_NAME = "unique-alt-read-count";
public static final String MIN_MEDIAN_MAPPING_QUALITY_LONG_NAME = "min-median-mapping-quality";
@@ -65,7 +66,8 @@
public static final String MIN_READS_ON_EACH_STRAND_LONG_NAME = "min-reads-per-strand";
public static final String MIN_AF_LONG_NAME = "min-allele-fraction";

-private static final int DEFAULT_MAX_EVENTS_IN_REGION = 2;
+private static final int DEFAULT_MAX_EVENTS_IN_REGION = 3;
+private static final int DEFAULT_MAX_EVENTS_IN_HAPLOTYPE = 2;
private static final int DEFAULT_MAX_ALT_ALLELES = 1;
private static final int DEFAULT_MIN_UNIQUE_ALT_READS = 0;
private static final int DEFAULT_MIN_MEDIAN_MAPPING_QUALITY = 30;
@@ -77,9 +79,12 @@
private static final int DEFAULT_MIN_READS_ON_EACH_STRAND = 0;
private static final double DEFAULT_MIN_AF = 0;

-@Argument(fullName = MAX_EVENTS_IN_REGION_LONG_NAME, optional = true, doc = "Maximum events in a single assembly region. Filter all variants if exceeded.")
+@Argument(fullName = MAX_EVENTS_IN_REGION_LONG_NAME, optional = true, doc = "Maximum number of non-germline events in a single assembly region. Filter all variants if exceeded.")
public int maxEventsInRegion = DEFAULT_MAX_EVENTS_IN_REGION;

+@Argument(fullName = MAX_EVENTS_IN_HAPLOTYPE_LONG_NAME, optional = true, doc = "Maximum number of non-germline events in a variant allele's best haplotype.")
+public int maxEventsInHaplotype = DEFAULT_MAX_EVENTS_IN_HAPLOTYPE;

@Argument(fullName = MAX_ALT_ALLELE_COUNT_LONG_NAME, optional = true, doc = "Maximum alt alleles per site.")
public int numAltAllelesThreshold = DEFAULT_MAX_ALT_ALLELES;

@@ -312,7 +312,7 @@ private void buildFiltersList(final M2FiltersArgumentCollection MTFAC) {
}

if (!MTFAC.mitochondria && !MTFAC.microbial) {
-filters.add(new ClusteredEventsFilter(MTFAC.maxEventsInRegion));
+filters.add(new ClusteredEventsFilter(MTFAC.maxEventsInRegion, MTFAC.maxEventsInHaplotype));
filters.add(new MultiallelicFilter(MTFAC.numAltAllelesThreshold));
filters.add(new FragmentLengthFilter(MTFAC.maxMedianFragmentLengthDifference));
filters.add(new PolymeraseSlippageFilter(MTFAC.minSlippageLength, MTFAC.slippageRate));
@@ -42,7 +42,8 @@ public final class GATKVCFConstants {
public static final String CULPRIT_KEY = "culprit";
public static final String ORIGINAL_DP_KEY = "DP_Orig"; //SelectVariants
public static final String DOWNSAMPLED_KEY = "DS";
-public static final String EVENT_COUNT_IN_HAPLOTYPE_KEY = "ECNT"; //M2
+public static final String EVENT_COUNT_IN_REGION_KEY = "ECNT"; //M2
+public static final String EVENT_COUNT_IN_HAPLOTYPE_KEY = "ECNTH"; //M2
public static final String FISHER_STRAND_KEY = "FS";
public static final String AS_FISHER_STRAND_KEY = "AS_FS";
public static final String AS_SB_TABLE_KEY = "AS_SB_TABLE";
@@ -223,7 +223,8 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
addInfoLine(new VCFInfoHeaderLine(TREE_SCORE, 1, VCFHeaderLineType.Float, "Score from single sample filtering with random forest model."));

// M2-related info lines
-addInfoLine(new VCFInfoHeaderLine(EVENT_COUNT_IN_HAPLOTYPE_KEY, 1, VCFHeaderLineType.Integer, "Number of events in this haplotype"));
+addInfoLine(new VCFInfoHeaderLine(EVENT_COUNT_IN_HAPLOTYPE_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Integer, "Number of somatic events in best supporting haplotype for each alt allele"));
+addInfoLine(new VCFInfoHeaderLine(EVENT_COUNT_IN_REGION_KEY, 1, VCFHeaderLineType.Integer, "Number of potential somatic events in the assembly region"));
addInfoLine(new VCFInfoHeaderLine(NORMAL_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Normal log 10 likelihood ratio of diploid het or hom alt genotypes"));
addInfoLine(new VCFInfoHeaderLine(TUMOR_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Log 10 likelihood ratio score of variant existing versus not existing"));
addFormatLine(new VCFFormatHeaderLine(TUMOR_LOG_10_ODDS_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Log 10 likelihood ratio score of variant existing versus not existing"));
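For downstream parsers the header change is the important part: ECNT stays a single per-site integer while ECNTH is Number=A, one value per alt allele. A hypothetical rendering of the two header lines and a multiallelic record, wrapped in a small runnable snippet (the record values are invented for illustration):

    public class EcntHeaderExample {
        static final String EXAMPLE = """
                ##INFO=<ID=ECNTH,Number=A,Type=Integer,Description="Number of somatic events in best supporting haplotype for each alt allele">
                ##INFO=<ID=ECNT,Number=1,Type=Integer,Description="Number of potential somatic events in the assembly region">
                chr1\t1000\t.\tA\tT,C\t.\t.\tECNT=3;ECNTH=2,1
                """;

        public static void main(String[] args) {
            System.out.print(EXAMPLE);  // two alt alleles, so ECNTH carries two values; ECNT is one count for the whole region
        }
    }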