-
Notifications
You must be signed in to change notification settings - Fork 586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Revert AF calculation bug and yet another reblocking fix #7670
Changes from 8 commits
5562afe
7339c05
a245499
baab476
04cd52c
d107744
b8456e5
b51cb54
dc9ccd5
8f71a0d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -133,7 +133,7 @@ public VariantContext calculateGenotypes(final VariantContext vc, final Genotype | |
if (maxAltAlleles < vc.getAlternateAlleles().size()) { | ||
final List<Allele> allelesToKeep = AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, defaultPloidy, maxAltAlleles); | ||
final GenotypesContext reducedGenotypes = allelesToKeep.size() == 1 ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) : | ||
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), allelesToKeep, gpc, GenotypeAssignmentMethod.SET_TO_NO_CALL, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), false); | ||
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), allelesToKeep, gpc, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Confirming that this should be hardcoded to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, with no PLs no-calls are just going to cause problems, so keep any 0/0 genotypes as such. I added a comment to that effect. |
||
reducedVC = new VariantContextBuilder(vc).alleles(allelesToKeep).genotypes(reducedGenotypes).make(); | ||
} | ||
|
||
|
@@ -181,7 +181,7 @@ && noAllelesOrFirstAlleleIsNotNonRef(outputAlternativeAlleles.alleles) && givenA | |
// create the genotypes | ||
//TODO: omit subsetting if output alleles is not a proper subset of vc.getAlleles | ||
final GenotypesContext genotypes = outputAlleles.size() == 1 ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) : | ||
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), outputAlleles, gpc, configuration.genotypeArgs.genotypeAssignmentMethod, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), false); | ||
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), outputAlleles, gpc, configuration.genotypeArgs.genotypeAssignmentMethod); | ||
|
||
if (configuration.genotypeArgs.usePosteriorProbabilitiesToCalculateQual && hasPosteriors(genotypes)) { | ||
final double log10NoVariantPosterior = phredNoVariantPosteriorProbability(outputAlleles, genotypes) * -.1; | ||
|
@@ -391,11 +391,12 @@ boolean isVcCoveredByDeletion(final VariantContext vc) { | |
* {@link GenotypeLikelihoods#MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED}. | ||
*/ | ||
protected final boolean cannotBeGenotyped(final VariantContext vc) { | ||
// protect against too many alternate alleles that we can't even run AF on: | ||
// | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fill in or delete empty comment
ldgauthier marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED | ||
&& vc.getGenotypes().stream().anyMatch(GenotypeUtils::genotypeIsUsableForAFCalculation)) { | ||
return false; | ||
} | ||
// protect against too many alternate alleles that we can't even run AF on: | ||
if (vc.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) { | ||
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED + | ||
" alleles. Site will be skipped at location " + vc.getContig() + ":" + vc.getStart()); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -77,7 +77,9 @@ private static double[] log10NormalizedGenotypePosteriors(final Genotype g, fina | |
final double[] log10Likelihoods; | ||
if (g.hasLikelihoods()) { | ||
log10Likelihoods = g.getLikelihoods().getAsVector(); | ||
} else if (g.isHomRef()) { | ||
} else if ( g.isHomRef()) { | ||
//no-call with no PLs seems risky, but there are a few places in the QUAL/AF code where we subset alleles, | ||
// but then leave the genotypes as no-calls | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comment seems misplaced/confusing, since no-calls with no PLs will now get handled by the
Recommend adding a comment to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comment still seems confusing/misplaced. It talks about handling no-calls in the case that now handles hom-ref (no-calls now fall through to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed. |
||
if (g.getPloidy() != 2) { | ||
throw new IllegalStateException("Likelihoods are required to calculate posteriors for hom-refs with ploidy != 2, " + | ||
"but were not found for genotype " + g + " with ploidy " + g.getPloidy()); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -512,9 +512,7 @@ protected GenotypeBuilder changeCallToHomRefVersusNonRef(final VariantContext lo | |
// the called alleles and this is a reference genotype that will stay hom-ref | ||
final GenotypesContext context = AlleleSubsettingUtils.subsetAlleles(lowQualVariant.getGenotypes(), | ||
genotype.getPloidy(), lowQualVariant.getAlleles(), Arrays.asList(inputRefAllele, bestAlt), | ||
null, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, //BEST_MATCH to avoid no-calling low qual genotypes | ||
lowQualVariant.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), | ||
false); //emitEmptyPLs = true to make sure we always subset | ||
null, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL); //BEST_MATCH to avoid no-calling low qual genotypes | ||
final Genotype subsetG = context.get(0); | ||
gb = new GenotypeBuilder(subsetG).noAttributes(); //remove attributes because hom ref blocks shouldn't have posteriors | ||
//subsetting may strip GQ and PLs for low qual genotypes | ||
|
@@ -568,8 +566,7 @@ VariantContext cleanUpHighQualityVariant(final VariantContext variant) { | |
if(allelesNeedSubsetting && !keepAllAlts) { | ||
newAlleleSetUntrimmed.removeAll(allelesToDrop); | ||
final GenotypesContext gc = AlleleSubsettingUtils.subsetAlleles(variant.getGenotypes(), genotype.getPloidy(), variant.getAlleles(), | ||
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, | ||
variant.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), true); | ||
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a comment documenting why we're using |
||
if (gc.get(0).isHomRef() || !gc.get(0).hasGQ() || gc.get(0).getAlleles().contains(Allele.NO_CALL)) { //could be low quality or no-call after subsetting | ||
if (dropLowQuals) { | ||
return null; | ||
|
@@ -780,8 +777,8 @@ private static void addQualAnnotations(final Map<String, Object> destination, fi | |
//TODO: this isn't going to work for DRAGEN's genotype posteriors | ||
final GenotypesContext gc = AlleleSubsettingUtils.subsetAlleles(updatedAllelesVC.getGenotypes(), | ||
updatedAllelesGenotype.getPloidy(), updatedAllelesVC.getAlleles(), Arrays.asList(updatedAllelesVC.getReference(), alt), null, | ||
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, 0, true); | ||
//assignment method doens't really matter as long as we don't zero out PLs; don't need depth to get PLs for quals | ||
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL); | ||
//assignment method doesn't really matter as long as we don't zero out PLs; don't need depth to get PLs for quals | ||
|
||
final Genotype subsettedGenotype = gc.get(0); | ||
final int[] likelihoods = getGenotypePosteriorsOtherwiseLikelihoods(subsettedGenotype, posteriorsKey); | ||
|
@@ -862,8 +859,10 @@ private static void copyInfoAnnotations(final Map<String, Object> destinationAtt | |
final List<String> subsetList; | ||
if (alleleSpecificValues.size() > 0) { | ||
subsetList = AlleleSubsettingUtils.remapRLengthList(alleleSpecificValues, relevantIndices, ""); | ||
//zero out non-ref value, just in case | ||
subsetList.set(subsetList.size()-1,((AlleleSpecificAnnotation)annotation).getEmptyRawValue()); | ||
if (sourceVC.getAlleles().get(relevantIndices[relevantIndices.length - 1]).equals(Allele.NON_REF_ALLELE)) { | ||
//zero out non-ref value, just in case | ||
subsetList.set(subsetList.size() - 1, ((AlleleSpecificAnnotation) annotation).getEmptyRawValue()); | ||
} | ||
} else { | ||
subsetList = Collections.nCopies(relevantIndices.length, ""); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -139,6 +139,7 @@ public static GenotypeCounts computeDiploidGenotypeCounts(final VariantContext v | |
/** | ||
* Do we have (or can we infer) likelihoods necessary for allele frequency calculation? | ||
* Some reblocked and/or DRAGEN GVCFs omit likelihoods for ref blocks, but we can estimate them | ||
* If GenomicsDB max alt threshold is too low, variants may also be missing PLs -- reject those | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missimg -> missing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missimg -> missing Is this comment still relevant? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed typo. Yes, still relevant. I clarified that here variants means non-reference genotypes, where we can't estimate PLs. |
||
* @param g a genotype of unknown call and ploidy | ||
* @return true if we have enough info for AF calculation | ||
*/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -156,7 +156,12 @@ public Object[][] gvcfsToGenotype() { | |
{getTestFile( "combined.single.sample.pipeline.gatk3.vcf"), | ||
getTestFile( "expected/includeLowQualSites.vcf"), | ||
Arrays.asList( " --" + GenotypeGVCFs.ALL_SITES_LONG_NAME + " -L 20:10,012,730-10,012,740"), | ||
b37_reference_20_21} | ||
b37_reference_20_21}, | ||
|
||
//23 highly multi-allelic sites across 54 1000G exomes to test allele subsetting and QUAL calculation | ||
{getTestFile("multiallelicQualRegression.vcf "), | ||
getTestFile("multiallelicQualRegression.expected.vcf"), | ||
NO_EXTRA_ARGS, hg38Reference} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You've confirmed that this new test case fails with 4.2.5.0? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes -- it fails like the Hindenburg. |
||
}; | ||
} | ||
|
||
|
@@ -282,22 +287,34 @@ public void assertMatchingGenotypesFromGenomicsDB(File input, File expected, Loc | |
} | ||
|
||
@Test | ||
public void testMaxAltsToCombineInGenomicsDB() { | ||
final File tempGenomicsDB = GenomicsDBTestUtils.createTempGenomicsDB(CEUTRIO_20_21_GATK3_4_G_VCF, new SimpleInterval("20", 1, 11_000_000)); | ||
final String genomicsDBUri = GenomicsDBTestUtils.makeGenomicsDBUri(tempGenomicsDB); | ||
final List<String> args = new ArrayList<String>(); | ||
public void testMaxAltsToCombineInGenomicsDB() throws IOException { | ||
//multi-input tests | ||
//8 ALT VC will get dropped if GDB max is < 8 because GDB doesn't return PLs and GGVCFs drops variants with no PLs | ||
final String gnarlyTestPath = toolsTestDir + "walkers/GnarlyGenotyper/"; | ||
final List<File> inputs = Arrays.asList(new File(gnarlyTestPath + "sample6.vcf"), | ||
new File(gnarlyTestPath + "sample7.vcf"), | ||
new File(gnarlyTestPath + "sample8.vcf"), | ||
new File(gnarlyTestPath + "sample9.vcf")); | ||
final SimpleInterval interval = new SimpleInterval("chr20", 257008, 257008); | ||
final File tempGenomicsDB2 = GenomicsDBTestUtils.createTempGenomicsDB(inputs, interval); | ||
final String genomicsDBUri2 = GenomicsDBTestUtils.makeGenomicsDBUri(tempGenomicsDB2); | ||
final List<String> args = new ArrayList<>(); | ||
args.add("--"+GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME); | ||
args.add("7"); | ||
args.add("--"+GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_LONG_NAME); | ||
args.add("3"); | ||
args.add("--" + GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME); | ||
args.add("4"); | ||
runGenotypeGVCFSAndAssertCount(genomicsDBUri, args, 3, VariantContextTestUtils::assertVariantContextMaxAltAlleleCount, b37_reference_20_21); | ||
args.add("5"); | ||
final File output = runGenotypeGVCFS(genomicsDBUri2, null, args, hg38Reference); | ||
final Pair<VCFHeader, List<VariantContext>> outputDataNoVariant = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()); | ||
Assert.assertTrue(outputDataNoVariant.getRight().isEmpty()); | ||
|
||
args.clear(); | ||
//8 ALT VC will be output if GDB max is >= 8, but with only as many ALTs as are requested in the GenotypeCalculationArguments | ||
final List<String> args2 = new ArrayList<String>(); | ||
args.add("--"+GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME); | ||
args.add("15"); | ||
args.add("--"+GenotypeCalculationArgumentCollection.MAX_ALTERNATE_ALLELES_LONG_NAME); | ||
args.add("2"); | ||
args.add("--" + GenomicsDBArgumentCollection.MAX_ALTS_LONG_NAME); | ||
args.add("20"); | ||
runGenotypeGVCFSAndAssertCount(genomicsDBUri, args, 2, VariantContextTestUtils::assertVariantContextMaxAltAlleleCount, b37_reference_20_21); | ||
args.add("5"); | ||
runGenotypeGVCFSAndAssertComparison(genomicsDBUri2, getTestFile("fourSamplesEightAlts.expected.vcf"), args2, | ||
VariantContextTestUtils::assertVariantContextsHaveSameGenotypes, hg38Reference); | ||
} | ||
|
||
@Test(expectedExceptions = UserException.BadInput.class) | ||
|
@@ -328,12 +345,17 @@ public void testGDBMaxAltsEqualsGGVCFsMaxAlts() { | |
File output = runGenotypeGVCFS(genomicsDBUri, null, args, b37_reference_20_21); | ||
} | ||
|
||
@Test | ||
public void testGDBMaxAltsGreaterThanGGVCFsMaxAlts() throws IOException { | ||
|
||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fill in empty test method There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I realized this case was redundant with |
||
|
||
private void runAndCheckGenomicsDBOutput(final ArgumentsBuilder args, final File expected, final File output) { | ||
Utils.resetRandomGenerator(); | ||
runCommandLine(args); | ||
|
||
// Note that if this isn't working it will take *FOREVER* | ||
// runs in 0.06 minutes with no input intervals specfied | ||
// runs in 0.06 minutes with no input intervals specified | ||
final List<VariantContext> expectedVC = VariantContextTestUtils.getVariantContexts(expected); | ||
final List<VariantContext> actualVC = VariantContextTestUtils.getVariantContexts(output); | ||
assertForEachElementInLists(actualVC, expectedVC, VariantContextTestUtils::assertVariantContextsHaveSameGenotypes); | ||
|
@@ -422,6 +444,10 @@ private void runGenotypeGVCFSAndAssertComparison(File input, File expected, List | |
); | ||
} | ||
|
||
|
||
/** | ||
* Note that this method does not use expected for comparison, but rather for updating exact match outputs | ||
*/ | ||
private File runGenotypeGVCFS(String input, File expected, List<String> additionalArguments, String reference) { | ||
final File output = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expected : createTempFile("genotypegvcf", ".vcf"); | ||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add a comment documenting why we want to use
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL
here