Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert AF calculation bug and yet another reblocking fix #7670

Merged
merged 10 commits into from
Mar 8, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public class GenomicsDBArgumentCollection implements Serializable {
* Must be at least one greater than the maximum number of alternate alleles for genotyping.
* A typical value is 3 more than the --max-alternate-alleles value that's used by GenotypeGVCFs and larger differences
* result in more robustness to PCR-related indel errors.
* Note that GenotypeGVCFs will drop highly multi-allelic sites that are missing likelihoods.
* NOTE: GenotypeGVCFs will drop multi-allelic sites with more than this many alternate alleles since they are missing likelihoods.
*
* See also {@link org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection#MAX_ALTERNATE_ALLELES_LONG_NAME}
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,13 @@ private AlleleSubsettingUtils() {} // prevent instantiation
* @param originalAlleles the original alleles
* @param allelesToKeep the subset of alleles to use with the new Genotypes
* @param assignmentMethod assignment strategy for the (subsetted) PLs
* @param depth the original variant DP or 0 if there was no DP
* @param suppressUninformativePLs force the output of a PL array even if there is no data
* @return a new non-null GenotypesContext
*/
public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, final int defaultPloidy,
final List<Allele> originalAlleles,
final List<Allele> allelesToKeep,
final GenotypePriorCalculator gpc,
final GenotypeAssignmentMethod assignmentMethod,
final int depth,
final boolean suppressUninformativePLs) {
final GenotypeAssignmentMethod assignmentMethod) {
Utils.nonNull(originalGs, "original GenotypesContext must not be null");
Utils.nonNull(allelesToKeep, "allelesToKeep is null");
Utils.nonEmpty(allelesToKeep, "must keep at least one allele");
Expand Down Expand Up @@ -94,9 +90,6 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
newLog10GQ = -0.1*g.getGQ();
}

final boolean useNewLikelihoods = !suppressUninformativePLs ||
(newLikelihoods != null && (depth != 0 || GATKVariantContextUtils.isInformative(newLikelihoods)));

final GenotypeBuilder gb = new GenotypeBuilder(g);
final Map<String, Object> attributes = new HashMap<>(g.getExtendedAttributes());
attributes.remove(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY);
Expand All @@ -107,9 +100,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
if (newLog10GQ != Double.NEGATIVE_INFINITY && g.hasGQ()) { //only put GQ if originally present
gb.log10PError(newLog10GQ);
}
if (useNewLikelihoods) {
gb.PL(newLikelihoods);
}
gb.PL(newLikelihoods);

GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g.getAlleles(), gpc);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ public VariantContext calculateGenotypes(final VariantContext vc, final Genotype
if (maxAltAlleles < vc.getAlternateAlleles().size()) {
final List<Allele> allelesToKeep = AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, defaultPloidy, maxAltAlleles);
final GenotypesContext reducedGenotypes = allelesToKeep.size() == 1 ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) :
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), allelesToKeep, gpc, GenotypeAssignmentMethod.SET_TO_NO_CALL, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), false);
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), allelesToKeep, gpc,
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL); //with no PLs in some reblocked GVCFs, no-calls are just going to cause problems, so keep 0/0 genotypes as such without trying to recall
reducedVC = new VariantContextBuilder(vc).alleles(allelesToKeep).genotypes(reducedGenotypes).make();
}

Expand Down Expand Up @@ -181,7 +182,7 @@ && noAllelesOrFirstAlleleIsNotNonRef(outputAlternativeAlleles.alleles) && givenA
// create the genotypes
//TODO: omit subsetting if output alleles is not a proper subset of vc.getAlleles
final GenotypesContext genotypes = outputAlleles.size() == 1 ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) :
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), outputAlleles, gpc, configuration.genotypeArgs.genotypeAssignmentMethod, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), false);
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), outputAlleles, gpc, configuration.genotypeArgs.genotypeAssignmentMethod);

if (configuration.genotypeArgs.usePosteriorProbabilitiesToCalculateQual && hasPosteriors(genotypes)) {
final double log10NoVariantPosterior = phredNoVariantPosteriorProbability(outputAlleles, genotypes) * -.1;
Expand Down Expand Up @@ -391,11 +392,11 @@ boolean isVcCoveredByDeletion(final VariantContext vc) {
* {@link GenotypeLikelihoods#MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED}.
*/
protected final boolean cannotBeGenotyped(final VariantContext vc) {
// protect against too many alternate alleles that we can't even run AF on:
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED
&& vc.getGenotypes().stream().anyMatch(GenotypeUtils::genotypeIsUsableForAFCalculation)) {
return false;
}
// protect against too many alternate alleles that we can't even run AF on:
if (vc.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
" alleles. Site will be skipped at location " + vc.getContig() + ":" + vc.getStart());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
final double[] log10Likelihoods;
if (g.hasLikelihoods()) {
log10Likelihoods = g.getLikelihoods().getAsVector();
} else if (g.isHomRef()) {
} else if ( g.isHomRef()) {
if (g.getPloidy() != 2) {
throw new IllegalStateException("Likelihoods are required to calculate posteriors for hom-refs with ploidy != 2, " +
"but were not found for genotype " + g + " with ploidy " + g.getPloidy());
Expand All @@ -88,6 +88,7 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina
throw new IllegalStateException("Genotype " + g + " does not contain GQ necessary to calculate posteriors.");
}
} else {
//no-call with no PLs are too risky -- don't assume they're reblocked hom-refs
throw new IllegalStateException("Genotype " + g + " does not contain likelihoods necessary to calculate posteriors.");
}
final double[] log10Posteriors = new IndexRange(0, glCalc.genotypeCount()).mapToDouble(genotypeIndex -> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ protected List<SimpleInterval> transformTraversalIntervals(final List<SimpleInte
protected GenomicsDBOptions getGenomicsDBOptions() {
if (genomicsDBOptions == null) {
genomicsdbArgs.callGenotypes = CALL_GENOTYPES;
genotypeArgs.maxAlternateAlleles = PIPELINE_MAX_ALT_COUNT;
genomicsdbArgs.maxDiploidAltAllelesThatCanBeGenotyped = genotypeArgs.maxAlternateAlleles + 1; //plus one for internal NON_REF
genomicsDBOptions = new GenomicsDBOptions(referenceArguments.getReferencePath(), genomicsdbArgs, genotypeArgs);
}
return genomicsDBOptions;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -512,9 +512,7 @@ protected GenotypeBuilder changeCallToHomRefVersusNonRef(final VariantContext lo
// the called alleles and this is a reference genotype that will stay hom-ref
final GenotypesContext context = AlleleSubsettingUtils.subsetAlleles(lowQualVariant.getGenotypes(),
genotype.getPloidy(), lowQualVariant.getAlleles(), Arrays.asList(inputRefAllele, bestAlt),
null, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, //BEST_MATCH to avoid no-calling low qual genotypes
lowQualVariant.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0),
false); //emitEmptyPLs = true to make sure we always subset
null, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL); //BEST_MATCH to avoid no-calling low qual genotypes
final Genotype subsetG = context.get(0);
gb = new GenotypeBuilder(subsetG).noAttributes(); //remove attributes because hom ref blocks shouldn't have posteriors
//subsetting may strip GQ and PLs for low qual genotypes
Expand Down Expand Up @@ -568,8 +566,7 @@ VariantContext cleanUpHighQualityVariant(final VariantContext variant) {
if(allelesNeedSubsetting && !keepAllAlts) {
newAlleleSetUntrimmed.removeAll(allelesToDrop);
final GenotypesContext gc = AlleleSubsettingUtils.subsetAlleles(variant.getGenotypes(), genotype.getPloidy(), variant.getAlleles(),
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
variant.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), true);
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment documenting why we're using GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN here instead of the GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL we're using everywhere else.

if (gc.get(0).isHomRef() || !gc.get(0).hasGQ() || gc.get(0).getAlleles().contains(Allele.NO_CALL)) { //could be low quality or no-call after subsetting
if (dropLowQuals) {
return null;
Expand Down Expand Up @@ -780,8 +777,8 @@ private static void addQualAnnotations(final Map<String, Object> destination, fi
//TODO: this isn't going to work for DRAGEN's genotype posteriors
final GenotypesContext gc = AlleleSubsettingUtils.subsetAlleles(updatedAllelesVC.getGenotypes(),
updatedAllelesGenotype.getPloidy(), updatedAllelesVC.getAlleles(), Arrays.asList(updatedAllelesVC.getReference(), alt), null,
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, 0, true);
//assignment method doens't really matter as long as we don't zero out PLs; don't need depth to get PLs for quals
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL);
//assignment method doesn't really matter as long as we don't zero out PLs; don't need depth to get PLs for quals

final Genotype subsettedGenotype = gc.get(0);
final int[] likelihoods = getGenotypePosteriorsOtherwiseLikelihoods(subsettedGenotype, posteriorsKey);
Expand Down Expand Up @@ -862,8 +859,10 @@ private static void copyInfoAnnotations(final Map<String, Object> destinationAtt
final List<String> subsetList;
if (alleleSpecificValues.size() > 0) {
subsetList = AlleleSubsettingUtils.remapRLengthList(alleleSpecificValues, relevantIndices, "");
//zero out non-ref value, just in case
subsetList.set(subsetList.size()-1,((AlleleSpecificAnnotation)annotation).getEmptyRawValue());
if (sourceVC.getAlleles().get(relevantIndices[relevantIndices.length - 1]).equals(Allele.NON_REF_ALLELE)) {
//zero out non-ref value, just in case
subsetList.set(subsetList.size() - 1, ((AlleleSpecificAnnotation) annotation).getEmptyRawValue());
}
} else {
subsetList = Collections.nCopies(relevantIndices.length, "");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1032,8 +1032,7 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean prese
final GenotypesContext subGenotypesWithOldAlleles = sub.getGenotypes(); //we need sub for the right samples, but PLs still go with old alleles
newGC = sub.getNAlleles() == vc.getNAlleles() ? subGenotypesWithOldAlleles :
AlleleSubsettingUtils.subsetAlleles(subGenotypesWithOldAlleles, 0, vc.getAlleles(),
sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES,
vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), false);
sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES);
} else {
newGC = sub.getGenotypes();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ public static GenotypeCounts computeDiploidGenotypeCounts(final VariantContext v
/**
* Do we have (or can we infer) likelihoods necessary for allele frequency calculation?
* Some reblocked and/or DRAGEN GVCFs omit likelihoods for ref blocks, but we can estimate them
* If GenomicsDB max alt threshold is too low, non-reference genotypes may also be missing PLs -- we can't estimate, so reject those
* @param g a genotype of unknown call and ploidy
* @return true if we have enough info for AF calculation
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1687,7 +1687,7 @@ public static List<VariantContext> splitVariantContextToBiallelics(final Variant
genotypeAssignmentMethodUsed != GenotypeAssignmentMethod.SET_TO_NO_CALL)
AlleleSubsettingUtils.addInfoFieldAnnotations(vc, builder, keepOriginalChrCounts);

builder.genotypes(AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(),2,vc.getAlleles(), alleles, null, genotypeAssignmentMethodUsed,vc.getAttributeAsInt("DP",0), true));
builder.genotypes(AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(),2,vc.getAlleles(), alleles, null, genotypeAssignmentMethodUsed));
final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true);
biallelics.add(trimmed);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,12 @@ public Object[][] gvcfsToGenotype() {
{getTestFile( "combined.single.sample.pipeline.gatk3.vcf"),
getTestFile( "expected/includeLowQualSites.vcf"),
Arrays.asList( " --" + GenotypeGVCFs.ALL_SITES_LONG_NAME + " -L 20:10,012,730-10,012,740"),
b37_reference_20_21}
b37_reference_20_21},

//23 highly multi-allelic sites across 54 1000G exomes to test allele subsetting and QUAL calculation
{getTestFile("multiallelicQualRegression.vcf "),
getTestFile("multiallelicQualRegression.expected.vcf"),
NO_EXTRA_ARGS, hg38Reference}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You've confirmed that this new test case fails with 4.2.5.0?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes -- it fails like the Hindenburg.

};
}

Expand Down Expand Up @@ -281,23 +286,35 @@ public void assertMatchingGenotypesFromGenomicsDB(File input, File expected, Loc
runGenotypeGVCFSAndAssertComparison(genomicsDBUri, expected, args, VariantContextTestUtils::assertVariantContextsHaveSameGenotypes, reference);
}

@Test
public void testMaxAltsToCombineInGenomicsDB() {
final File tempGenomicsDB = GenomicsDBTestUtils.createTempGenomicsDB(CEUTRIO_20_21_GATK3_4_G_VCF, new SimpleInterval("20", 1, 11_000_000));
final String genomicsDBUri = GenomicsDBTestUtils.makeGenomicsDBUri(tempGenomicsDB);
final List<String> args = new ArrayList<String>();
@Test //here GDBMaxAlts is greater than GGVCFsMaxAlts
public void testMaxAltsToCombineInGenomicsDB() throws IOException {
//multi-input tests
//8 ALT VC will get dropped if GDB max is < 8 because GDB doesn't return PLs and GGVCFs drops variants with no PLs
final String gnarlyTestPath = toolsTestDir + "walkers/GnarlyGenotyper/";
final List<File> inputs = Arrays.asList(new File(gnarlyTestPath + "sample6.vcf"),
new File(gnarlyTestPath + "sample7.vcf"),
new File(gnarlyTestPath + "sample8.vcf"),
new File(gnarlyTestPath + "sample9.vcf"));
final SimpleInterval interval = new SimpleInterval("chr20", 257008, 257008);
final File tempGenomicsDB2 = GenomicsDBTestUtils.createTempGenomicsDB(inputs, interval);
final String genomicsDBUri2 = GenomicsDBTestUtils.makeGenomicsDBUri(tempGenomicsDB2);
final List<String> args = new ArrayList<>();
args.add("--"+GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME);
args.add("7");
args.add("--"+GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_LONG_NAME);
args.add("3");
args.add("--" + GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME);
args.add("4");
runGenotypeGVCFSAndAssertCount(genomicsDBUri, args, 3, VariantContextTestUtils::assertVariantContextMaxAltAlleleCount, b37_reference_20_21);
args.add("5");
final File output = runGenotypeGVCFS(genomicsDBUri2, null, args, hg38Reference);
final Pair<VCFHeader, List<VariantContext>> outputDataNoVariant = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
Assert.assertTrue(outputDataNoVariant.getRight().isEmpty());

args.clear();
//8 ALT VC will be output if GDB max is >= 8, but with only as many ALTs are requested in the GenotypeCalculationArguments
final List<String> args2 = new ArrayList<String>();
args.add("--"+GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME);
args.add("15");
args.add("--"+GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_LONG_NAME);
args.add("2");
args.add("--" + GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME);
args.add("20");
runGenotypeGVCFSAndAssertCount(genomicsDBUri, args, 2, VariantContextTestUtils::assertVariantContextMaxAltAlleleCount, b37_reference_20_21);
args.add("5");
runGenotypeGVCFSAndAssertComparison(genomicsDBUri2, getTestFile("fourSamplesEightAlts.expected.vcf"), args2,
VariantContextTestUtils::assertVariantContextsHaveSameGenotypes, hg38Reference);
}

@Test(expectedExceptions = UserException.BadInput.class)
Expand Down Expand Up @@ -333,7 +350,7 @@ private void runAndCheckGenomicsDBOutput(final ArgumentsBuilder args, final File
runCommandLine(args);

// Note that if this isn't working it will take *FOREVER*
// runs in 0.06 minutes with no input intervals specfied
// runs in 0.06 minutes with no input intervals specified
final List<VariantContext> expectedVC = VariantContextTestUtils.getVariantContexts(expected);
final List<VariantContext> actualVC = VariantContextTestUtils.getVariantContexts(output);
assertForEachElementInLists(actualVC, expectedVC, VariantContextTestUtils::assertVariantContextsHaveSameGenotypes);
Expand Down Expand Up @@ -422,6 +439,10 @@ private void runGenotypeGVCFSAndAssertComparison(File input, File expected, List
);
}


/**
* Note that this method does not use expected for comparison, but rather for updating exact match outputs
*/
private File runGenotypeGVCFS(String input, File expected, List<String> additionalArguments, String reference) {
final File output = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expected : createTempFile("genotypegvcf", ".vcf");
final ArgumentsBuilder args = new ArgumentsBuilder();
Expand Down
Loading