Skip to content

Commit

Permalink
Improve consistency for heterozygosity annotations on reblocked inputs (
Browse files Browse the repository at this point in the history
#7471)

Modify the new reblocked GT calling behavior a little bit more so annotations are
more accurate compared to non-reblocked GVCFs
Fix bug due to InbreedingCoeff vs. AS_InbreedingCoeff code duplication
  • Loading branch information
ldgauthier committed Sep 25, 2021
1 parent 0f89f46 commit c1f3737
Show file tree
Hide file tree
Showing 13 changed files with 258 additions and 70 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RankSumTest;
import org.broadinstitute.hellbender.utils.dragstr.DragstrParams;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RMSMappingQuality;
Expand Down Expand Up @@ -179,13 +178,15 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
// We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes.
if (result.isPolymorphicInSamples()) {
// For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later.
final VariantContext reannotated = annotationEngine.annotateContext(result, features, ref, null, a -> true);
return new VariantContextBuilder(reannotated).genotypes(cleanupGenotypeAnnotations(reannotated, false)).make();
final VariantContextBuilder vcBuilder = new VariantContextBuilder(result);
//don't count sites with no depth and no confidence towards things like AN and InbreedingCoeff
vcBuilder.genotypes(assignNoCallsAnnotationExcludedGenotypes(result.getGenotypes()));
VariantContext annotated = annotationEngine.annotateContext(vcBuilder.make(), features, ref, null, a -> true);
return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(result, false)).make();
} else if (includeNonVariants) {
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
VariantContext reannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
reannotated = annotationEngine.annotateContext(reannotated, features, ref, null, GenotypeGVCFsEngine::annotationShouldBeSkippedForHomRefSites);
return reannotated;
VariantContext preannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
return annotationEngine.annotateContext(preannotated, features, ref, null, GenotypeGVCFsEngine::annotationShouldBeSkippedForHomRefSites);
} else {
return null;
}
Expand Down Expand Up @@ -482,6 +483,28 @@ static List<Genotype> cleanupGenotypeAnnotations(final VariantContext vc, final
return recoveredGs;
}

static boolean excludeFromAnnotations(Genotype oldGT) {
return oldGT.isHomRef() && !oldGT.hasPL()
&& ((oldGT.hasDP() && oldGT.getDP() == 0) || !oldGT.hasDP())
&& oldGT.hasGQ() && oldGT.getGQ() == 0;
}

private List<Genotype> assignNoCallsAnnotationExcludedGenotypes(final GenotypesContext genotypes) {
final List<Genotype> returnList = new ArrayList<>();
for (final Genotype oldGT : genotypes) {
//convert GQ0s that were reblocked back to no-calls for better AN and InbreedingCoeff annotations
if (excludeFromAnnotations(oldGT)) {
final GenotypeBuilder builder = new GenotypeBuilder(oldGT);
builder.alleles(Collections.nCopies(oldGT.getPloidy(), Allele.NO_CALL));
returnList.add(builder.make());
} else {
returnList.add(oldGT);
}

}
return returnList;
}


private static int parseInt(Object attribute){
if( attribute instanceof String) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
import org.broadinstitute.hellbender.utils.GenotypeUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
Expand Down Expand Up @@ -546,6 +547,9 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
final int[] AD = g.hasAD() ? AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
genotypeBuilder.PL(PLs).AD(AD);
//clean up low confidence hom refs for better annotations later
} else if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
genotypeBuilder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
}
}
else { //doSomaticMerge
Expand Down Expand Up @@ -598,8 +602,12 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
}
}
genotypeBuilder.name(name);
final GenotypeAssignmentMethod assignmentMethod = callGTAlleles ? GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL :
GenotypeAssignmentMethod.SET_TO_NO_CALL;
final GenotypeAssignmentMethod assignmentMethod;
if (callGTAlleles && GenotypeUtils.shouldBeCalled(g)) {
assignmentMethod = GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL;
} else {
assignmentMethod = GenotypeAssignmentMethod.SET_TO_NO_CALL;
}
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
genotypeBuilder, assignmentMethod,
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public Map<String, Object> annotate(final ReferenceContext ref,
static Pair<Integer, Double> calculateEH(final VariantContext vc, final GenotypesContext genotypes) {
final GenotypeCounts t = GenotypeUtils.computeDiploidGenotypeCounts(vc, genotypes, ROUND_GENOTYPE_COUNTS);
// number of samples that have likelihoods
final int sampleCount = (int) genotypes.stream().filter(g->GenotypeUtils.isDiploidWithLikelihoodsOrCalledWithGQ(g)).count();
final int sampleCount = (int) genotypes.stream().filter(g->GenotypeUtils.isCalledAndDiploidWithLikelihoodsOrWithGQ(g)).count();

return calculateEH(vc, t, sampleCount);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers.annotator;

import htsjdk.variant.variantcontext.*;
import org.broadinstitute.hellbender.utils.GenotypeUtils;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;

Expand Down Expand Up @@ -56,7 +57,7 @@ private void doGenotypeCalculations() {

//for each sample
for (final Genotype g : genotypes) {
if (g.isCalled() && g.hasLikelihoods() && g.getPloidy() == 2){ // only work for diploid samples
if (GenotypeUtils.isCalledAndDiploidWithLikelihoodsOrWithGQ(g)){ // only work for diploid samples
sampleCount++;
} else {
continue;
Expand All @@ -66,8 +67,13 @@ private void doGenotypeCalculations() {
for(final Allele a : vc.getAlternateAlleles()) {
//for each alt allele index from 1 to N
altIndex++;

final double[] normalizedLikelihoods = MathUtils.normalizeFromLog10ToLinearSpace(g.getLikelihoods().getAsVector());
final double[] normalizedLikelihoods;
if (g.hasLikelihoods()) {
normalizedLikelihoods = MathUtils.normalizeFromLog10ToLinearSpace(g.getLikelihoods().getAsVector());
} else {
final double[] log10Likelihoods = GenotypeUtils.makeApproximateDiploidLog10LikelihoodsFromGQ(g, vc.getNAlleles());
normalizedLikelihoods = MathUtils.normalizeFromLog10ToLinearSpace(log10Likelihoods);
}
if (returnRounded) {
MathUtils.applyToArrayInPlace(normalizedLikelihoods, Math::round);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,17 @@
*
* <h3>Details</h3>
* <p>The output is the inbreeding coefficient 'F' (fixation) statistic, which for large sample sizes converges to the probability
* that an individual's two alleles are identical by descent, provided that cosanguinity is the only source of deviation from Hardy-Weinberg equilibrium.
* that an individual's two alleles are identical by descent, provided that consanguinity is the only source of deviation from Hardy-Weinberg equilibrium.
* If this assumption is not true F may be negative and the excess heterozygosity often indicates an artifactual variant.
* It is calculated as F = 1 - (# of het genotypes)/(# of het genotypes expected under Hardy-Weinberg equilibrium). The number of het genotypes expeced under Hardy-Weinberg equilibrium
* It is calculated as F = 1 - (# of het genotypes)/(# of het genotypes expected under Hardy-Weinberg equilibrium). The number of het genotypes expected under Hardy-Weinberg equilibrium
* is 2*(# of samples)*(ref allele frequency)*(alt allele frequency), where allele frequencies are calculated from the samples' genotypes.</p>
*
* <h3>Caveats</h3>
* <ul>
* <li>The Inbreeding Coefficient annotation can only be calculated for cohorts containing at least 10 founder samples.</li>
* <li>The Inbreeding Coefficient annotation can only be calculated for diploid samples.</li>
* <li>The implementation here uses likelihoods rather than counts, so computed InbreedingCoeff values can be sensitive to small
* changes in the low GQ range, as might happen after "reblocking" low coverage sites, as well as GQ0 versus no-call changes</li>
* </ul>
*
* <h3>Related annotations</h3>
Expand All @@ -51,7 +53,7 @@ public final class InbreedingCoeff extends PedigreeAnnotation implements InfoFie

private static final OneShotLogger oneShotLogger = new OneShotLogger(InbreedingCoeff.class);
private static final int MIN_SAMPLES = 10;
private static final boolean ROUND_GENOTYPE_COUNTS = false;
private static final boolean ROUND_GENOTYPE_COUNTS = false; //if this changes update the caveats above

public InbreedingCoeff(){
super((Set<String>) null);
Expand Down Expand Up @@ -95,11 +97,11 @@ static Pair<Integer, Double> calculateIC(final VariantContext vc, final Genotype
final double hetCount = t.getHets();
final double homCount = t.getHoms();
// number of samples that have likelihoods
final int sampleCount = (int) genotypes.stream().filter(g-> GenotypeUtils.isDiploidWithLikelihoodsOrCalledWithGQ(g)).count();
final int sampleCount = (int) genotypes.stream().filter(g-> GenotypeUtils.isCalledAndDiploidWithLikelihoodsOrWithGQ(g) || GenotypeUtils.isDiploidWithLikelihoods(g)).count();

final double p = ( 2.0 * refCount + hetCount ) / ( 2.0 * (refCount + hetCount + homCount) ); // expected reference allele frequency
final double q = 1.0 - p; // expected alternative allele frequency
final double expectedHets = 2.0 * p * q * sampleCount; //numbers of hets that would be expected based on the allele frequency (asuming Hardy Weinberg Equilibrium)
final double expectedHets = 2.0 * p * q * sampleCount; //numbers of hets that would be expected based on the allele frequency (assuming Hardy Weinberg Equilibrium)
final double F = 1.0 - ( hetCount / expectedHets ); // inbreeding coefficient

return Pair.of(sampleCount, F);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeLikelihoods;
import htsjdk.variant.variantcontext.VariantContext;
import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
Expand All @@ -29,11 +28,6 @@ public final class AlleleFrequencyCalculator {
private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();
private static final double THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE = 0.1;
private static final int HOM_REF_GENOTYPE_INDEX = 0;
private static final int TYPICAL_BASE_QUALITY = 30;
//from the genotype likelihoods equations assuming the SNP ref conf model with no mismatches
//PL[2] = GQ; scaleFactor = PL[3]/GQ ~ -10 * DP * log10(P_error) / (-10 * DP * log10(1/ploidy)) where BASE_QUALITY = -10 * log10(P_error)
private static final int PLOIDY_2_HOM_VAR_SCALE_FACTOR = (int)Math.round(TYPICAL_BASE_QUALITY/-10.0/Math.log10(.5));


private final double refPseudocount;
private final double snpPseudocount;
Expand Down Expand Up @@ -87,22 +81,7 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
"but were not found for genotype " + g + " with ploidy " + g.getPloidy());
}
if (g.hasGQ()) {
//for a hom-ref, as long as we have GQ we can make a very accurate QUAL calculation
// since the hom-var likelihood should make a minuscule contribution
final int[] perSampleIndexesOfRelevantAlleles = new int[log10AlleleFrequencies.length];
Arrays.fill(perSampleIndexesOfRelevantAlleles, 1);
perSampleIndexesOfRelevantAlleles[0] = 0; //ref still maps to ref
final int gq = g.getGQ();
final int ploidy = g.getPloidy();
//use these values for diploid ref/ref, ref/alt, alt/alt likelihoods
final int[] approxLikelihoods = {0, gq, PLOIDY_2_HOM_VAR_SCALE_FACTOR*gq};
//map likelihoods for any other alts to biallelic ref/alt likelihoods above
final int[] genotypeIndexMapByPloidy = GL_CALCS.getInstance(ploidy, log10AlleleFrequencies.length).genotypeIndexMap(perSampleIndexesOfRelevantAlleles, GL_CALCS); //probably horribly slow
final int[] PLs = new int[genotypeIndexMapByPloidy.length];
for (int i = 0; i < PLs.length; i++) {
PLs[i] = approxLikelihoods[genotypeIndexMapByPloidy[i]];
}
log10Likelihoods = GenotypeLikelihoods.fromPLs(PLs).getAsVector();
log10Likelihoods = GenotypeUtils.makeApproximateDiploidLog10LikelihoodsFromGQ(g, log10AlleleFrequencies.length);
} else {
throw new IllegalStateException("Genotype " + g + " does not contain GQ necessary to calculate posteriors.");
}
Expand All @@ -117,16 +96,6 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
return MathUtils.normalizeLog10(log10Posteriors);
}

private static double[] approximateHomRefPLsFromGQ(final int genotypeCount, final int gq, final GenotypeLikelihoodCalculator glCalc) {
final int[] newPLs = new int[genotypeCount];
for (int i = 0; i < genotypeCount; i++) {
final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(i);
final int refCount = gac.alleleCountFor(0);
newPLs[i] = (glCalc.ploidy() - refCount) * gq;
}
return GenotypeLikelihoods.fromPLs(newPLs).getAsVector();
}

private static int[] genotypeIndicesWithOnlyRefAndSpanDel(final int ploidy, final List<Allele> alleles) {
final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, alleles.size());
final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);
Expand Down
Loading

0 comments on commit c1f3737

Please sign in to comment.