Skip to content

Commit

Permalink
Be more clear about max alts arg; don't let no-calls get inferred
Browse files Browse the repository at this point in the history
because that can cause problems
  • Loading branch information
ldgauthier committed Feb 17, 2022
1 parent 7339c05 commit a245499
Show file tree
Hide file tree
Showing 11 changed files with 39 additions and 57 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public class GenomicsDBArgumentCollection implements Serializable {
* Must be at least one greater than the maximum number of alternate alleles for genotyping.
* A typical value is 3 more than the --max-alternate-alleles value that's used by GenotypeGVCFs and larger differences
* result in more robustness to PCR-related indel errors.
* Note that GenotypeGVCFs will drop highly multi-allelic sites that are missing likelihoods.
* NOTE: GenotypeGVCFs will drop multi-allelic sites with more than this many alternate alleles since they are missing likelihoods.
*
* See also {@link org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection#MAX_ALTERNATE_ALLELES_LONG_NAME}
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,13 @@ private AlleleSubsettingUtils() {} // prevent instantiation
* @param originalAlleles the original alleles
* @param allelesToKeep the subset of alleles to use with the new Genotypes
* @param assignmentMethod assignment strategy for the (subsetted) PLs
* @param depth the original variant DP or 0 if there was no DP
* @param forceEmitUninformativePLs force the output of a PL array even if there is no data
* @return a new non-null GenotypesContext
*/
public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, final int defaultPloidy,
final List<Allele> originalAlleles,
final List<Allele> allelesToKeep,
final GenotypePriorCalculator gpc,
final GenotypeAssignmentMethod assignmentMethod,
final int depth,
final boolean forceEmitUninformativePLs) {
final GenotypeAssignmentMethod assignmentMethod) {
Utils.nonNull(originalGs, "original GenotypesContext must not be null");
Utils.nonNull(allelesToKeep, "allelesToKeep is null");
Utils.nonEmpty(allelesToKeep, "must keep at least one allele");
Expand Down Expand Up @@ -94,9 +90,6 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
newLog10GQ = -0.1*g.getGQ();
}

final boolean useNewLikelihoods = forceEmitUninformativePLs ||
(newLikelihoods != null && (depth != 0 || GATKVariantContextUtils.isInformative(newLikelihoods)));

final GenotypeBuilder gb = new GenotypeBuilder(g);
final Map<String, Object> attributes = new HashMap<>(g.getExtendedAttributes());
attributes.remove(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY);
Expand All @@ -107,9 +100,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
if (newLog10GQ != Double.NEGATIVE_INFINITY && g.hasGQ()) { //only put GQ if originally present
gb.log10PError(newLog10GQ);
}
if (useNewLikelihoods) {
gb.PL(newLikelihoods);
}
gb.PL(newLikelihoods);

GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g.getAlleles(), gpc);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ public VariantContext calculateGenotypes(final VariantContext vc, final Genotype
if (maxAltAlleles < vc.getAlternateAlleles().size()) {
final List<Allele> allelesToKeep = AlleleSubsettingUtils.calculateMostLikelyAlleles(vc, defaultPloidy, maxAltAlleles);
final GenotypesContext reducedGenotypes = allelesToKeep.size() == 1 ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) :
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), allelesToKeep, gpc, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), true);
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), allelesToKeep, gpc, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL);
reducedVC = new VariantContextBuilder(vc).alleles(allelesToKeep).genotypes(reducedGenotypes).make();
}

Expand Down Expand Up @@ -181,7 +181,7 @@ && noAllelesOrFirstAlleleIsNotNonRef(outputAlternativeAlleles.alleles) && givenA
// create the genotypes
//TODO: omit subsetting if output alleles is not a proper subset of vc.getAlleles
final GenotypesContext genotypes = outputAlleles.size() == 1 ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) :
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), outputAlleles, gpc, configuration.genotypeArgs.genotypeAssignmentMethod, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), true);
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), outputAlleles, gpc, configuration.genotypeArgs.genotypeAssignmentMethod);

if (configuration.genotypeArgs.usePosteriorProbabilitiesToCalculateQual && hasPosteriors(genotypes)) {
final double log10NoVariantPosterior = phredNoVariantPosteriorProbability(outputAlleles, genotypes) * -.1;
Expand Down Expand Up @@ -391,11 +391,12 @@ boolean isVcCoveredByDeletion(final VariantContext vc) {
* {@link GenotypeLikelihoods#MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED}.
*/
protected final boolean cannotBeGenotyped(final VariantContext vc) {
// protect against too many alternate alleles that we can't even run AF on:
//
if (vc.getNAlleles() <= GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED
&& vc.getGenotypes().stream().anyMatch(GenotypeUtils::genotypeIsUsableForAFCalculation)) {
return false;
}
// protect against too many alternate alleles that we can't even run AF on:
if (vc.getNAlleles() > GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED) {
logger.warn("Attempting to genotype more than " + GenotypeLikelihoods.MAX_DIPLOID_ALT_ALLELES_THAT_CAN_BE_GENOTYPED +
" alleles. Site will be skipped at location " + vc.getContig() + ":" + vc.getStart());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -512,9 +512,7 @@ protected GenotypeBuilder changeCallToHomRefVersusNonRef(final VariantContext lo
// the called alleles and this is a reference genotype that will stay hom-ref
final GenotypesContext context = AlleleSubsettingUtils.subsetAlleles(lowQualVariant.getGenotypes(),
genotype.getPloidy(), lowQualVariant.getAlleles(), Arrays.asList(inputRefAllele, bestAlt),
null, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, //BEST_MATCH to avoid no-calling low qual genotypes
lowQualVariant.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0),
true); //forceEmitUniformativePLs = true to make sure we always have PLs
null, GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL); //BEST_MATCH to avoid no-calling low qual genotypes
final Genotype subsetG = context.get(0);
gb = new GenotypeBuilder(subsetG).noAttributes(); //remove attributes because hom ref blocks shouldn't have posteriors
//subsetting may strip GQ and PLs for low qual genotypes
Expand Down Expand Up @@ -568,8 +566,7 @@ VariantContext cleanUpHighQualityVariant(final VariantContext variant) {
if(allelesNeedSubsetting && !keepAllAlts) {
newAlleleSetUntrimmed.removeAll(allelesToDrop);
final GenotypesContext gc = AlleleSubsettingUtils.subsetAlleles(variant.getGenotypes(), genotype.getPloidy(), variant.getAlleles(),
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
variant.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), true); //forceEmit = true so we always have PLs
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN);
if (gc.get(0).isHomRef() || !gc.get(0).hasGQ() || gc.get(0).getAlleles().contains(Allele.NO_CALL)) { //could be low quality or no-call after subsetting
if (dropLowQuals) {
return null;
Expand Down Expand Up @@ -780,7 +777,7 @@ private static void addQualAnnotations(final Map<String, Object> destination, fi
//TODO: this isn't going to work for DRAGEN's genotype posteriors
final GenotypesContext gc = AlleleSubsettingUtils.subsetAlleles(updatedAllelesVC.getGenotypes(),
updatedAllelesGenotype.getPloidy(), updatedAllelesVC.getAlleles(), Arrays.asList(updatedAllelesVC.getReference(), alt), null,
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL, 0, true);
GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL);
//assignment method doesn't really matter as long as we don't zero out PLs; don't need depth to get PLs for quals

final Genotype subsettedGenotype = gc.get(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1032,8 +1032,7 @@ private VariantContext subsetRecord(final VariantContext vc, final boolean prese
final GenotypesContext subGenotypesWithOldAlleles = sub.getGenotypes(); //we need sub for the right samples, but PLs still go with old alleles
newGC = sub.getNAlleles() == vc.getNAlleles() ? subGenotypesWithOldAlleles :
AlleleSubsettingUtils.subsetAlleles(subGenotypesWithOldAlleles, 0, vc.getAlleles(),
sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES,
vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), true);
sub.getAlleles(), null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES);
} else {
newGC = sub.getGenotypes();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,12 @@ public static GenotypeCounts computeDiploidGenotypeCounts(final VariantContext v
/**
* Do we have (or can we infer) likelihoods necessary for allele frequency calculation?
* Some reblocked and/or DRAGEN GVCFs omit likelihoods for ref blocks, but we can estimate them
* If GenomicsDB max alt threshold is too low, variants may also be missimg PLs -- reject those
* @param g a genotype of unknown call and ploidy
* @return true if we have enough info for AF calculation
*/
public static boolean genotypeIsUsableForAFCalculation(Genotype g) {
return g.hasLikelihoods() || g.hasGQ() || g.getAlleles().stream().anyMatch(a -> a.isCalled() && a.isNonReference() && !a.isSymbolic());
return g.hasLikelihoods() || (g.isHomRef() && g.hasGQ() && 2 == g.getPloidy());
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1687,7 +1687,7 @@ public static List<VariantContext> splitVariantContextToBiallelics(final Variant
genotypeAssignmentMethodUsed != GenotypeAssignmentMethod.SET_TO_NO_CALL)
AlleleSubsettingUtils.addInfoFieldAnnotations(vc, builder, keepOriginalChrCounts);

builder.genotypes(AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(),2,vc.getAlleles(), alleles, null, genotypeAssignmentMethodUsed,vc.getAttributeAsInt("DP",0), false));
builder.genotypes(AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(),2,vc.getAlleles(), alleles, null, genotypeAssignmentMethodUsed));
final VariantContext trimmed = trimAlleles(builder.make(), trimLeft, true);
biallelics.add(trimmed);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ private static <T> void assertCountForEachElementInList(final List<T> actual, In
@DataProvider(name = "gvcfsToGenotype")
public Object[][] gvcfsToGenotype() {
return new Object[][]{
/*
//combine not supported yet, see https://github.com/broadinstitute/gatk/issues/2429 and https://github.com/broadinstitute/gatk/issues/2584
//{"combine.single.sample.pipeline.1.vcf", null, Arrays.asList("-V", getTestFile("combine.single.sample.pipeline.2.vcf").toString() , "-V", getTestFile("combine.single.sample.pipeline.3.vcf").toString()), b37_reference_20_21},
{getTestFile("leadingDeletion.g.vcf"), getTestFile("leadingDeletionRestrictToStartExpected.vcf"), Arrays.asList("-L", "20:69512-69513", "--"+GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME), b37_reference_20_21},
Expand Down Expand Up @@ -157,7 +156,7 @@ public Object[][] gvcfsToGenotype() {
{getTestFile( "combined.single.sample.pipeline.gatk3.vcf"),
getTestFile( "expected/includeLowQualSites.vcf"),
Arrays.asList( " --" + GenotypeGVCFs.ALL_SITES_LONG_NAME + " -L 20:10,012,730-10,012,740"),
b37_reference_20_21},*/
b37_reference_20_21},

//23 highly multi-allelic sites across 54 1000G exomes to test allele subsetting and QUAL calculation
{getTestFile("multiallelicQualRegression.vcf "),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,7 @@ public void testUpdatePLsAndADData(final VariantContext originalVC,
final GenotypesContext actual = selectedVCwithGTs.getNAlleles() == originalVC.getNAlleles() ? oldGs :
AlleleSubsettingUtils.subsetAlleles(oldGs, 0, originalVC.getAlleles(),
selectedVCwithGTs.getAlleles(), null,
GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES,
originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0), false);
GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES);

Assert.assertEquals(actual.size(), expectedGenotypes.size());
for ( final Genotype expected : expectedGenotypes ) {
Expand Down Expand Up @@ -327,7 +326,7 @@ public void testUninformativePLsAreKeptWhenDepthIsNotZero(){
final List<Allele> alleles = Arrays.asList(Aref);
final Genotype uniformativePL = new GenotypeBuilder("sample", alleles).PL(new int[] {0}).make();
final GenotypesContext result = AlleleSubsettingUtils.subsetAlleles(GenotypesContext.create(uniformativePL), 2,
alleles, alleles, null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES, 10, false);
alleles, alleles, null, GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES);
final Genotype genotype = result.get(0);
Assert.assertTrue(genotype.hasPL());
Assert.assertEquals(genotype.getPL(), new int[]{0});
Expand Down Expand Up @@ -412,7 +411,7 @@ public void testAlleleReorderingBehavior() {

final GenotypesContext newGs = AlleleSubsettingUtils.subsetAlleles(GenotypesContext.create(g5),
2, threeAlleles, threeAllelesSorted, null,
GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES, 10, false);
GenotypeAssignmentMethod.DO_NOT_ASSIGN_GENOTYPES);

Assert.assertEquals(newGs.get(0).getPL(), new int[] {50, 20, 0, 40, 10, 30});
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
Expand All @@ -28,35 +29,29 @@ public class ReblockGVCFIntegrationTest extends CommandLineProgramTest {

private static final String hg38_reference_20_21 = largeFileTestDir + "Homo_sapiens_assembly38.20.21.fasta";
private static final String b37_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.fasta";
public static final String WARP_PROD_REBLOCKING_ARGS = " -do-qual-approx --floor-blocks -GQB 20 -GQB 30 -GQB 40 ";

@DataProvider(name = "getCommandLineArgsForExactTest")
public Object[][] getCommandLineArgsForExactTest() {
return new Object[][]{
//covers inputs with old format "MQ" annotation
{getTestFile("gvcfForReblocking.g.vcf"), getTestFile("testJustOneSample.expected.g.vcf"), " -L chr20:69771 -rgq-threshold 19", hg38_reference_20_21},
//Broad production arguments on WGS data
{getTestFile("prodWgsInput.g.vcf "), getTestFile("prodWgsOutput.g.vcf"), WARP_PROD_REBLOCKING_ARGS, hg38Reference},
//Exome data with AS annotations and zero DP regression test
{getTestFile("prodWesInput.g.vcf "), getTestFile("prodWesOutput.g.vcf"), WARP_PROD_REBLOCKING_ARGS, hg38Reference}
};
}

@Test
public void testWithExactComparison() throws Exception {
//covers inputs with old format "MQ" annotation
@Test(dataProvider = "getCommandLineArgsForExactTest")
public void testWithExactComparison(final File input, final File expected, final String extraArgs, final String reference) throws Exception {
final IntegrationTestSpec spec = new IntegrationTestSpec(
"-L chr20:69771 -O %s -R " + hg38_reference_20_21 +
" -V " + getToolTestDataDir() + "gvcfForReblocking.g.vcf -rgq-threshold 19" +
" -O %s -R " + reference +
" -V " + input.getAbsolutePath() +
extraArgs +
" --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false",
Arrays.asList(getToolTestDataDir() + "testJustOneSample.expected.g.vcf"));
Arrays.asList(expected.getAbsolutePath()));
spec.executeTest("testWithExactComparison", this);


//Broad production arguments on WGS data
final IntegrationTestSpec spec2 = new IntegrationTestSpec(
"-O %s -R " + hg38Reference +
" -V " + getToolTestDataDir() + "prodWgsInput.g.vcf " +
" -do-qual-approx --floor-blocks -GQB 20 -GQB 30 -GQB 40 " +
" --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false",
Arrays.asList(getToolTestDataDir() + "prodWgsOutput.g.vcf"));
spec2.executeTest("testWithExactComparison", this);

//Exome data with AS annotations and zero DP regression test
final IntegrationTestSpec spec3 = new IntegrationTestSpec(
"-O %s -R " + hg38Reference +
" -V " + getToolTestDataDir() + "prodWesInput.g.vcf " +
" -do-qual-approx --floor-blocks -GQB 20 -GQB 30 -GQB 40 " +
" --" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE + " false",
Arrays.asList(getToolTestDataDir() + "prodWesOutput.g.vcf"));
spec3.executeTest("testWithExactComparison", this);
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ public static VariantContext sortAlleles(final VariantContext vc, final VCFHeade
result.alleles(sortedAlleles);

GenotypesContext newGT = AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(),2,vc.getAlleles(),sortedAlleles, null,
GenotypeAssignmentMethod.SET_TO_NO_CALL, vc.getAttributeAsInt(VCFConstants.DEPTH_KEY,0), false);
GenotypeAssignmentMethod.SET_TO_NO_CALL);

// Asserting that the new genotypes were calculated properly in case AlleleSubsettingUtils behavior changes
if (newGT.getSampleNames().size() != vc.getGenotypes().size()) throw new IllegalStateException("Sorting this variant context resulted in a different number of genotype alleles, check that AlleleSubsettingUtils still supports reordering:" + vc.toString());
Expand Down

0 comments on commit a245499

Please sign in to comment.