Improve consistency for heterozygosity annotations on reblocked inputs (

#7471) Modify the new reblocked GT calling behavior a little bit more so annotations are more accurate compared to non-reblocked GVCFs Fix bug due to InbreedingCoeff vs. AS_InbreedingCoeff code duplication
broadinstitute · Sep 25, 2021 · c1f3737 · c1f3737
1 parent 0f89f46
commit c1f3737
Show file tree

Hide file tree

Showing 13 changed files with 258 additions and 70 deletions.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFsEngine.java
@@ -11,7 +11,6 @@
 import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
 import org.broadinstitute.hellbender.engine.FeatureContext;
 import org.broadinstitute.hellbender.engine.ReferenceContext;
-import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RankSumTest;
 import org.broadinstitute.hellbender.utils.dragstr.DragstrParams;
 import org.broadinstitute.hellbender.tools.walkers.annotator.*;
 import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RMSMappingQuality;
@@ -179,13 +178,15 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
         // We could theoretically make 2 passes to re-create the genotypes, but that gets extremely expensive with large sample sizes.
         if (result.isPolymorphicInSamples()) {
             // For polymorphic sites we need to make sure e.g. the SB tag is sent to the annotation engine and then removed later.
-            final VariantContext reannotated = annotationEngine.annotateContext(result, features, ref, null, a -> true);
-            return new VariantContextBuilder(reannotated).genotypes(cleanupGenotypeAnnotations(reannotated, false)).make();
+            final VariantContextBuilder vcBuilder = new VariantContextBuilder(result);
+            //don't count sites with no depth and no confidence towards things like AN and InbreedingCoeff
+            vcBuilder.genotypes(assignNoCallsAnnotationExcludedGenotypes(result.getGenotypes()));
+            VariantContext annotated = annotationEngine.annotateContext(vcBuilder.make(), features, ref, null, a -> true);
+            return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(result, false)).make();
         } else if (includeNonVariants) {
             // For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
-            VariantContext reannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
-            reannotated = annotationEngine.annotateContext(reannotated, features, ref, null, GenotypeGVCFsEngine::annotationShouldBeSkippedForHomRefSites);
-            return reannotated;
+            VariantContext preannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true)).make();
+            return annotationEngine.annotateContext(preannotated, features, ref, null, GenotypeGVCFsEngine::annotationShouldBeSkippedForHomRefSites);
         } else {
             return null;
         }
@@ -482,6 +483,28 @@ static List<Genotype> cleanupGenotypeAnnotations(final VariantContext vc, final
         return recoveredGs;
     }
 
+    static boolean excludeFromAnnotations(Genotype oldGT) {
+        return oldGT.isHomRef() && !oldGT.hasPL()
+                && ((oldGT.hasDP() && oldGT.getDP() == 0) || !oldGT.hasDP())
+                && oldGT.hasGQ() && oldGT.getGQ() == 0;
+    }
+
+    private List<Genotype> assignNoCallsAnnotationExcludedGenotypes(final GenotypesContext genotypes) {
+        final List<Genotype> returnList = new ArrayList<>();
+        for (final Genotype oldGT : genotypes) {
+            //convert GQ0s that were reblocked back to no-calls for better AN and InbreedingCoeff annotations
+            if (excludeFromAnnotations(oldGT)) {
+                final GenotypeBuilder builder = new GenotypeBuilder(oldGT);
+                builder.alleles(Collections.nCopies(oldGT.getPloidy(), Allele.NO_CALL));
+                returnList.add(builder.make());
+            } else {
+                returnList.add(oldGT);
+            }
+
+        }
+        return returnList;
+    }
+
 
     private static int parseInt(Object attribute){
         if( attribute instanceof String) {

diff --git a/.../org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java b/.../org/broadinstitute/hellbender/tools/walkers/ReferenceConfidenceVariantContextMerger.java
@@ -15,6 +15,7 @@
 import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
 import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeLikelihoodCalculators;
 import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.Mutect2FilteringEngine;
+import org.broadinstitute.hellbender.utils.GenotypeUtils;
 import org.broadinstitute.hellbender.utils.Utils;
 import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
 import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
@@ -546,6 +547,9 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
                     final int[] PLs = generatePL(g, genotypeIndexMapByPloidy);
                     final int[] AD = g.hasAD() ? AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles) : null;
                     genotypeBuilder.PL(PLs).AD(AD);
+                //clean up low confidence hom refs for better annotations later
+                } else if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
+                    genotypeBuilder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
                 }
             }
             else {  //doSomaticMerge
@@ -598,8 +602,12 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
                 }
             }
             genotypeBuilder.name(name);
-            final GenotypeAssignmentMethod assignmentMethod = callGTAlleles ? GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL :
-                    GenotypeAssignmentMethod.SET_TO_NO_CALL;
+            final GenotypeAssignmentMethod assignmentMethod;
+            if (callGTAlleles && GenotypeUtils.shouldBeCalled(g)) {
+                assignmentMethod = GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL;
+            } else {
+                assignmentMethod = GenotypeAssignmentMethod.SET_TO_NO_CALL;
+            }
             GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
                     genotypeBuilder, assignmentMethod,
                     g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null,

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHet.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/ExcessHet.java
@@ -82,7 +82,7 @@ public Map<String, Object> annotate(final ReferenceContext ref,
     static Pair<Integer, Double> calculateEH(final VariantContext vc, final GenotypesContext genotypes) {
         final GenotypeCounts t = GenotypeUtils.computeDiploidGenotypeCounts(vc, genotypes, ROUND_GENOTYPE_COUNTS);
         // number of samples that have likelihoods
-        final int sampleCount = (int) genotypes.stream().filter(g->GenotypeUtils.isDiploidWithLikelihoodsOrCalledWithGQ(g)).count();
+        final int sampleCount = (int) genotypes.stream().filter(g->GenotypeUtils.isCalledAndDiploidWithLikelihoodsOrWithGQ(g)).count();
 
         return calculateEH(vc, t, sampleCount);
     }

diff --git a/.../java/org/broadinstitute/hellbender/tools/walkers/annotator/HeterozygosityCalculator.java b/.../java/org/broadinstitute/hellbender/tools/walkers/annotator/HeterozygosityCalculator.java
@@ -1,6 +1,7 @@
 package org.broadinstitute.hellbender.tools.walkers.annotator;
 
 import htsjdk.variant.variantcontext.*;
+import org.broadinstitute.hellbender.utils.GenotypeUtils;
 import org.broadinstitute.hellbender.utils.MathUtils;
 import org.broadinstitute.hellbender.utils.Utils;
 
@@ -56,7 +57,7 @@ private void doGenotypeCalculations() {
 
             //for each sample
             for (final Genotype g : genotypes) {
-                if (g.isCalled() && g.hasLikelihoods() && g.getPloidy() == 2){  // only work for diploid samples
+                if (GenotypeUtils.isCalledAndDiploidWithLikelihoodsOrWithGQ(g)){  // only work for diploid samples
                     sampleCount++;
                 } else {
                     continue;
@@ -66,8 +67,13 @@ private void doGenotypeCalculations() {
                 for(final Allele a : vc.getAlternateAlleles()) {
                     //for each alt allele index from 1 to N
                     altIndex++;
-
-                        final double[] normalizedLikelihoods = MathUtils.normalizeFromLog10ToLinearSpace(g.getLikelihoods().getAsVector());
+                    final double[] normalizedLikelihoods;
+                    if (g.hasLikelihoods()) {
+                        normalizedLikelihoods = MathUtils.normalizeFromLog10ToLinearSpace(g.getLikelihoods().getAsVector());
+                    } else {
+                        final double[] log10Likelihoods = GenotypeUtils.makeApproximateDiploidLog10LikelihoodsFromGQ(g, vc.getNAlleles());
+                        normalizedLikelihoods = MathUtils.normalizeFromLog10ToLinearSpace(log10Likelihoods);
+                    }
                         if (returnRounded) {
                             MathUtils.applyToArrayInPlace(normalizedLikelihoods, Math::round);
                         }

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/InbreedingCoeff.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/InbreedingCoeff.java
@@ -32,15 +32,17 @@
  *
  * <h3>Details</h3>
  * <p>The output is the inbreeding coefficient 'F' (fixation) statistic, which for large sample sizes converges to the probability
- * that an individual's two alleles are identical by descent, provided that cosanguinity is the only source of deviation from Hardy-Weinberg equilibrium.
+ * that an individual's two alleles are identical by descent, provided that consanguinity is the only source of deviation from Hardy-Weinberg equilibrium.
  * If this assumption is not true F may be negative and the excess heterozygosity often indicates an artifactual variant.
- * It is calculated as F = 1 - (# of het genotypes)/(# of het genotypes expected under Hardy-Weinberg equilibrium).  The number of het genotypes expeced under Hardy-Weinberg equilibrium
+ * It is calculated as F = 1 - (# of het genotypes)/(# of het genotypes expected under Hardy-Weinberg equilibrium).  The number of het genotypes expected under Hardy-Weinberg equilibrium
  * is 2*(# of samples)*(ref allele frequency)*(alt allele frequency), where allele frequencies are calculated from the samples' genotypes.</p>
  *
  * <h3>Caveats</h3>
  * <ul>
  * <li>The Inbreeding Coefficient annotation can only be calculated for cohorts containing at least 10 founder samples.</li>
  * <li>The Inbreeding Coefficient annotation can only be calculated for diploid samples.</li>
+ * <li>The implementation here uses likelihoods rather than counts, so computed InbreedingCoeff values can be sensitive to small
+ * changes in the low GQ range, as might happen after "reblocking" low coverage sites, as well as GQ0 versus no-call changes</li>
  * </ul>
  *
  * <h3>Related annotations</h3>
@@ -51,7 +53,7 @@ public final class InbreedingCoeff extends PedigreeAnnotation implements InfoFie
 
     private static final OneShotLogger oneShotLogger = new OneShotLogger(InbreedingCoeff.class);
     private static final int MIN_SAMPLES = 10;
-    private static final boolean ROUND_GENOTYPE_COUNTS = false;
+    private static final boolean ROUND_GENOTYPE_COUNTS = false;  //if this changes update the caveats above
 
     public InbreedingCoeff(){
         super((Set<String>) null);
@@ -95,11 +97,11 @@ static Pair<Integer, Double> calculateIC(final VariantContext vc, final Genotype
         final double hetCount = t.getHets();
         final double homCount = t.getHoms();
         // number of samples that have likelihoods
-        final int sampleCount = (int) genotypes.stream().filter(g-> GenotypeUtils.isDiploidWithLikelihoodsOrCalledWithGQ(g)).count();
+        final int sampleCount = (int) genotypes.stream().filter(g-> GenotypeUtils.isCalledAndDiploidWithLikelihoodsOrWithGQ(g) || GenotypeUtils.isDiploidWithLikelihoods(g)).count();
 
         final double p = ( 2.0 * refCount + hetCount ) / ( 2.0 * (refCount + hetCount + homCount) ); // expected reference allele frequency
         final double q = 1.0 - p; // expected alternative allele frequency
-        final double expectedHets = 2.0 * p * q * sampleCount; //numbers of hets that would be expected based on the allele frequency (asuming Hardy Weinberg Equilibrium)
+        final double expectedHets = 2.0 * p * q * sampleCount; //numbers of hets that would be expected based on the allele frequency (assuming Hardy Weinberg Equilibrium)
         final double F = 1.0 - ( hetCount / expectedHets ); // inbreeding coefficient
 
         return Pair.of(sampleCount, F);

diff --git a/...g/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java b/...g/broadinstitute/hellbender/tools/walkers/genotyper/afcalc/AlleleFrequencyCalculator.java
@@ -2,7 +2,6 @@
 
 import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.Genotype;
-import htsjdk.variant.variantcontext.GenotypeLikelihoods;
 import htsjdk.variant.variantcontext.VariantContext;
 import it.unimi.dsi.fastutil.doubles.DoubleArrayList;
 import it.unimi.dsi.fastutil.ints.Int2ObjectArrayMap;
@@ -29,11 +28,6 @@ public final class AlleleFrequencyCalculator {
     private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();
     private static final double THRESHOLD_FOR_ALLELE_COUNT_CONVERGENCE = 0.1;
     private static final int HOM_REF_GENOTYPE_INDEX = 0;
-    private static final int TYPICAL_BASE_QUALITY = 30;
-    //from the genotype likelihoods equations assuming the SNP ref conf model with no mismatches
-    //PL[2] = GQ; scaleFactor = PL[3]/GQ ~ -10 * DP * log10(P_error) / (-10 * DP * log10(1/ploidy)) where BASE_QUALITY = -10 * log10(P_error)
-    private static final int PLOIDY_2_HOM_VAR_SCALE_FACTOR = (int)Math.round(TYPICAL_BASE_QUALITY/-10.0/Math.log10(.5));
-
 
     private final double refPseudocount;
     private final double snpPseudocount;
@@ -87,22 +81,7 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
                         "but were not found for genotype " + g + " with ploidy " + g.getPloidy());
             }
             if (g.hasGQ()) {
-                //for a hom-ref, as long as we have GQ we can make a very accurate QUAL calculation
-                // since the hom-var likelihood should make a minuscule contribution
-                final int[] perSampleIndexesOfRelevantAlleles = new int[log10AlleleFrequencies.length];
-                Arrays.fill(perSampleIndexesOfRelevantAlleles, 1);
-                perSampleIndexesOfRelevantAlleles[0] = 0;  //ref still maps to ref
-                final int gq = g.getGQ();
-                final int ploidy = g.getPloidy();
-                //use these values for diploid ref/ref, ref/alt, alt/alt likelihoods
-                final int[] approxLikelihoods = {0, gq, PLOIDY_2_HOM_VAR_SCALE_FACTOR*gq};
-                //map likelihoods for any other alts to biallelic ref/alt likelihoods above
-                final int[] genotypeIndexMapByPloidy = GL_CALCS.getInstance(ploidy, log10AlleleFrequencies.length).genotypeIndexMap(perSampleIndexesOfRelevantAlleles, GL_CALCS); //probably horribly slow
-                final int[] PLs = new int[genotypeIndexMapByPloidy.length];
-                for (int i = 0; i < PLs.length; i++) {
-                        PLs[i] = approxLikelihoods[genotypeIndexMapByPloidy[i]];
-                    }
-                log10Likelihoods = GenotypeLikelihoods.fromPLs(PLs).getAsVector();
+                log10Likelihoods = GenotypeUtils.makeApproximateDiploidLog10LikelihoodsFromGQ(g, log10AlleleFrequencies.length);
             } else {
                 throw new IllegalStateException("Genotype " + g + " does not contain GQ necessary to calculate posteriors.");
             }
@@ -117,16 +96,6 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
         return MathUtils.normalizeLog10(log10Posteriors);
     }
 
-    private static double[] approximateHomRefPLsFromGQ(final int genotypeCount, final int gq, final GenotypeLikelihoodCalculator glCalc) {
-        final int[] newPLs = new int[genotypeCount];
-        for (int i = 0; i < genotypeCount; i++) {
-            final GenotypeAlleleCounts gac = glCalc.genotypeAlleleCountsAt(i);
-            final int refCount = gac.alleleCountFor(0);
-                newPLs[i] = (glCalc.ploidy() - refCount) * gq;
-        }
-        return GenotypeLikelihoods.fromPLs(newPLs).getAsVector();
-    }
-
     private static int[] genotypeIndicesWithOnlyRefAndSpanDel(final int ploidy, final List<Allele> alleles) {
         final GenotypeLikelihoodCalculator glCalc = GL_CALCS.getInstance(ploidy, alleles.size());
         final boolean spanningDeletionPresent = alleles.contains(Allele.SPAN_DEL);