Skip to content

Commit

Permalink
fix no data hom refs (#8715)
Browse files Browse the repository at this point in the history
Output GQ0 genotypes from reference blocks as no-call rather than hom-ref
  • Loading branch information
ldgauthier committed Mar 7, 2024
1 parent a2ebb37 commit b0463e4
Show file tree
Hide file tree
Showing 17 changed files with 555 additions and 3,392 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ private VariantContext referenceBlockMerge(final List<VariantContext> vcs, final
final GenotypeBuilder gBuilder = new GenotypeBuilder(g);
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
gBuilder, assignmentMethod,
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, allelesToUse, g.getAlleles(), null);
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null, allelesToUse, g, null);
genotypes.add(gBuilder.make());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -658,7 +658,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
genotypeBuilder, assignmentMethod,
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null,
targetAlleles, originalGTAlleles, null);
targetAlleles, new GenotypeBuilder(g.getSampleName(), originalGTAlleles).make(), null);
mergedGenotypes.add(genotypeBuilder.make());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
}
gb.PL(newLikelihoods);

GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g.getAlleles(), gpc);
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, assignmentMethod, newLikelihoods, allelesToKeep, g, gpc);

// restrict SAC to the new allele subset
if (g.hasExtendedAttribute(GATKVCFConstants.STRAND_COUNT_BY_SAMPLE_KEY)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public enum GenotypeAssignmentMethod {

/**
* Use PLs unless they are unavailable, in which case use best match to original
* GQ0 hom-refs will be converted to no-calls
*/
PREFER_PLS
}
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis
//If GenomicsDB returns no-call genotypes like CombineGVCFs (depending on the GenomicsDBExportConfiguration),
// then we need to actually find the GT from PLs
makeGenotypeCall(g, genotypeBuilder, GenotypeLikelihoods.fromPLs(PLs).getAsVector(), targetAlleles);
} else if (g.hasGQ() && g.getGQ() == 0) {
makeGenotypeCall(g, genotypeBuilder, null, targetAlleles); //null likelihoods for reblocked hom-ref that we want to no-call
}
final Map<String, Object> attrs = new HashMap<>(g.getExtendedAttributes());
attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ public final class ReblockGVCF extends MultiVariantWalker {

/**
* Output the band lower bound for each GQ block instead of the min GQ -- for better compression
* Note that this argument also drops PLS for more efficient storage
*/
@Advanced
@Argument(fullName=HaplotypeCallerArgumentCollection.OUTPUT_BLOCK_LOWER_BOUNDS, doc = "Output the band lower bound for each GQ block regardless of the data it represents", optional = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -296,17 +296,20 @@ public static List<Allele> homozygousAlleleList(final Allele allele, final int p
/**
* Add the genotype call (GT) field to GenotypeBuilder using the requested {@link GenotypeAssignmentMethod}
*
* @param ploidy output ploidy for no-call GTs and likelihood array length
* @param gb the builder where we should put our newly called alleles, cannot be null
* @param assignmentMethod the method to use to do the assignment, cannot be null
* @param genotypeLikelihoods a vector of likelihoods to use if the method requires PLs, should be log10 likelihoods, cannot be null
* @param allelesToUse the alleles with respect to which the likelihoods are defined
* @param originalGT Genotype that includes GQ when available
* @param gpc utility class to help with likelihood calculations
*/
public static void makeGenotypeCall(final int ploidy,
final GenotypeBuilder gb,
final GenotypeAssignmentMethod assignmentMethod,
final double[] genotypeLikelihoods,
final List<Allele> allelesToUse,
final List<Allele> originalGT,
final Genotype originalGT,
final GenotypePriorCalculator gpc) {
if(originalGT == null && assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL) {
throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is BEST_MATCH_TO_ORIGINAL");
Expand All @@ -315,12 +318,14 @@ public static void makeGenotypeCall(final int ploidy,
gb.alleles(noCallAlleles(ploidy));
} else if (assignmentMethod == GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN ||
assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if ( genotypeLikelihoods == null || !isInformative(genotypeLikelihoods) ) {
if (genotypeLikelihoods == null || !isInformative(genotypeLikelihoods)) {
if (assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if (originalGT == null) {
throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is PREFER_PLS");
} else if (originalGT.hasGQ() && originalGT.getGQ() > 0){
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT.getAlleles()));
} else {
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT));
gb.alleles(noCallAlleles(ploidy));
}
} else {
gb.alleles(noCallAlleles(ploidy)).noGQ();
Expand All @@ -344,7 +349,7 @@ public static void makeGenotypeCall(final int ploidy,
} else if (assignmentMethod == GenotypeAssignmentMethod.SET_TO_NO_CALL_NO_ANNOTATIONS) {
gb.alleles(noCallAlleles(ploidy)).noGQ().noAD().noPL().noAttributes();
} else if (assignmentMethod == GenotypeAssignmentMethod.BEST_MATCH_TO_ORIGINAL) {
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT));
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT.getAlleles()));
} else if (assignmentMethod == GenotypeAssignmentMethod.USE_POSTERIOR_PROBABILITIES) {
if (gpc == null) {
throw new GATKException("cannot uses posteriors without an genotype prior calculator present");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ public void testCombineReblockedGVCFs() {
Assert.assertFalse(g0.hasPL());
Assert.assertFalse(g0.hasGQ());
Assert.assertFalse(g0.hasExtendedAttribute(VCFConstants.DEPTH_KEY));
Assert.assertTrue(g1.isHomRef());
Assert.assertTrue(g1.isNoCall());
Assert.assertFalse(g1.hasPL());
Assert.assertEquals(g1.getGQ(), 0);
Assert.assertEquals(g1.getAnyAttribute(VCFConstants.DEPTH_KEY), 34);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@
import java.util.stream.Stream;

public class GenotypeGVCFsIntegrationTest extends CommandLineProgramTest {
//GenotypeGVCFs test coverage should include:
// * HaplotypeCaller GVCFs
// * HaplotypeCaller GVCFs post-processed with ReblockGVCF
// * DRAGEN GVCFs (versions 3.4 and 3.7.8)
// * DRAGEN GVCFs (3.4 & 3.7.8) post-processed with ReblockGVCF
//Note that tests may need to create a temp GenomicsDB to take multiple input GVCFs

// If true, update the expected outputs in tests that assert an exact match vs. prior output,
// instead of actually running the tests. Can be used with "./gradlew test --tests org.broadinstitute.hellbender.tools.walkers.GenotypeGVCFsIntegrationTest"
Expand Down Expand Up @@ -91,6 +97,7 @@ public Object[][] gvcfsToGenotype() {
return new Object[][]{
//combine not supported yet, see https://github.com/broadinstitute/gatk/issues/2429 and https://github.com/broadinstitute/gatk/issues/2584
//{"combine.single.sample.pipeline.1.vcf", null, Arrays.asList("-V", getTestFile("combine.single.sample.pipeline.2.vcf").toString() , "-V", getTestFile("combine.single.sample.pipeline.3.vcf").toString()), b37_reference_20_21},
/*
{getTestFile("leadingDeletion.g.vcf"), getTestFile("leadingDeletionRestrictToStartExpected.vcf"), Arrays.asList("-L", "20:69512-69513", "--"+GenotypeGVCFs.ONLY_OUTPUT_CALLS_STARTING_IN_INTERVALS_FULL_NAME), b37_reference_20_21},
{getTestFile("leadingDeletion.g.vcf"), getTestFile("leadingDeletionExpected.vcf"), Arrays.asList("-L", "20:69512-69513"), b37_reference_20_21},
{getTestFile(BASE_PAIR_GVCF), getTestFile( BASE_PAIR_EXPECTED), NO_EXTRA_ARGS, b37_reference_20_21}, //base pair level gvcf
Expand Down Expand Up @@ -119,7 +126,7 @@ public Object[][] gvcfsToGenotype() {
{getTestFile( "withOxoGReadCounts.g.vcf"), getTestFile( "withOxoGReadCounts.vcf"), Arrays.asList("-G", "AS_StandardAnnotation", "-G", "StandardAnnotation"), b37_reference_20_21},
{getTestFile( "multiSamples.g.vcf"), getTestFile( "multiSamples.GATK3expected.g.vcf"), Arrays.asList( "-A", "ClippingRankSumTest", "-G", "AS_StandardAnnotation", "-G", "StandardAnnotation"), b37_reference_20_21},
{getTestFile( "testAlleleSpecificAnnotations.CombineGVCF.output.g.vcf"), getTestFile( "testAlleleSpecificAnnotations.CombineGVCF.expected.g.vcf"), Arrays.asList( "-A", "ClippingRankSumTest", "-G", "AS_StandardAnnotation", "-G", "StandardAnnotation"), b37_reference_20_21},

*/
// all sites/--include-non-variant-sites tests
// The results from these tests differ from GATK3 in the following ways:
// - sites where the only alternate allele is a spanning deletion are emitted by GATK3, but not emitted by GATK4
Expand Down Expand Up @@ -158,6 +165,7 @@ public Object[][] gvcfsToGenotype() {

//23 highly multi-allelic sites across 54 1000G exomes to test allele subsetting and QUAL calculation
//plus one 10-allele WGS variant that's all hom-ref with one GT that has unnormalized PLs from some sort of GenomicsDB corner case
//this VCF still has the haploid-looking GDB no-calls as in sample NA21137 at position chr1:3836468 -- allegedly GATK 4.2.5.0 from February 7, 2022, possibly due to --call-genotypes
{getTestFile("multiallelicQualRegression.vcf "),
getTestFile("multiallelicQualRegression.expected.vcf"),
NO_EXTRA_ARGS, hg38Reference}
Expand Down Expand Up @@ -804,6 +812,9 @@ public void testWithReblockedGVCF() {
final List<VariantContext> actualVC = VariantContextTestUtils.getVariantContexts(output);
Assert.assertFalse(actualVC.stream().anyMatch(vc -> vc.getGenotype(1).isHomRef() && vc.getGenotype(1).hasPL())); //second sample has a bunch of 0/0s -- shouldn't have PLs

//this comes from a callset of NYGC 1000G samples plus syndip
//it seems likely that there's a variant that wasn't discovered in the graph because a bunch of samples are hom-ref with PL=[0,0,X]
//this is a pretty variant dense region with 4 in 20 bases for NA12872
final File bigCombinedReblockedGVCF = new File("src/test/resources/org/broadinstitute/hellbender/tools/walkers/GenotypeGVCFs/combineReblocked.g.vcf");
final File cohortOutput = createTempFile("biggerCohort.rb", ".vcf");

Expand All @@ -822,10 +833,11 @@ public void testWithReblockedGVCF() {
Assert.assertEquals(vc0.getAlternateAllele(0).getBaseString(), "G");
Assert.assertTrue(vc0.getGenotypes().stream().allMatch(g -> g.isCalled() && g.hasGQ() && g.hasDP()));

//reblocked GVCFs with no PLs have genotypes that will be assigned as no-calls because of GQ0, so AN and ExcessHet differ here
final VariantContext vc1 = outputVCs.get(1);
Assert.assertTrue(vc1.getAttributeAsDouble(GATKVCFConstants.EXCESS_HET_KEY, 1000.0) < 10.0); //will be ~72 if homRefs aren't counted
Assert.assertTrue(vc1.getAttributeAsDouble(GATKVCFConstants.EXCESS_HET_KEY, 0.0) > 50.0); //will be ~72 if homRefs aren't counted
Assert.assertTrue(vc1.hasAttribute(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY));
Assert.assertEquals(vc1.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0), 362);
Assert.assertEquals(vc1.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0), 300);
Assert.assertEquals(vc1.getAlternateAlleles().size(), 3);
Assert.assertTrue(vc1.isIndel());
Assert.assertTrue(vc0.getGenotypes().stream().allMatch(g -> g.isCalled() && g.hasGQ() && g.hasDP()));
Expand All @@ -838,7 +850,7 @@ public void testWithReblockedGVCF() {
.addOutput(compareICvalues);
runCommandLine(args3);

//requires InbreedingCoeff to use isCalledAndDiploidWithLikelihoodsOrWithGQ for sampleNum
//highlight differences with and without PLs
final List<VariantContext> compareICvariants = VariantContextTestUtils.getVariantContexts(compareICvalues);
final VariantContext vcWithPLs = compareICvariants.get(0);
final VariantContext vcWithoutPLs = compareICvariants.get(1);
Expand All @@ -848,8 +860,9 @@ public void testWithReblockedGVCF() {
final double ic2 = vcWithoutPLs.getAttributeAsDouble(GATKVCFConstants.INBREEDING_COEFFICIENT_KEY, 0);
Assert.assertTrue(ic1 > 0); //make sure lookup worked, otherwise 0 == 0
Assert.assertTrue(ic2 > 0); //if GQ0s with no data are output as hom-ref, then ic2 is ~0.7
Assert.assertEquals(ic1, ic2, 0.1); //there will be some difference because the old version zeros out low depth hom-refs and makes them no-calls
Assert.assertEquals(vcWithoutPLs.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0), 114); //don't count no-calls that are PL=[0,0,0] in classic VCF
Assert.assertTrue(ic1 - ic2 > .25); //there will be a significant difference because with noPLs, GQ0s become no-calls
Assert.assertEquals(vcWithPLs.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0) -
vcWithoutPLs.getAttributeAsInt(VCFConstants.ALLELE_NUMBER_KEY, 0), 16); //don't count no-calls that are PL=[0,0,0] in classic VCF
}

@Test
Expand Down Expand Up @@ -976,4 +989,75 @@ public void dbSNPError() {
//make sure it doesn't throw an error
runCommandLine(args);
}

@Test
public void testNoReadsOutputAsNoCall() {
//The input data for this test comes from a GVCF produced from an empty region of one of the NA12878 test bams
// and a GVCF that was edited to have a variant so we can force that position to be output.
//note these are b37 data
File no_reads = new File(toolsTestDir, "/walkers/GenotypeGVCFs/combine.single.sample.pipeline.1.vcf");
File fake_variant = getTestFile("fake_sample2.vcf");
final SimpleInterval interval = new SimpleInterval("20", 10000000, 10000000);
File tempGdb = GenomicsDBTestUtils.createTempGenomicsDB(Arrays.asList(no_reads, fake_variant), interval);
final String genomicsDBUri = GenomicsDBTestUtils.makeGenomicsDBUri(tempGdb);

final File output = createTempFile("checkNoCall", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(b37_reference_20_21)
.addFlag("allow-old-rms-mapping-quality-annotation-data") //old GVCFs have old annotations
.addVCF(genomicsDBUri)
.addInterval(interval)
.addOutput(output);
runCommandLine(args);

final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
Assert.assertEquals(outputData.getRight().size(), 1);
final VariantContext vc = outputData.getRight().get(0);
Assert.assertEquals(vc.getGenotypes().size(), 2);
final Genotype g = vc.getGenotype("GTEX-RVPV-0003");
Assert.assertTrue(g.isNoCall()); //most importantly!
Assert.assertTrue(g.hasGQ());
Assert.assertEquals(g.getGQ(), 0);
Assert.assertTrue(g.hasDP());
Assert.assertEquals(g.getDP(), 0);
Assert.assertTrue(g.hasAD());
Assert.assertEquals(g.getAD(), new int[2]);
Assert.assertTrue(g.hasPL());
Assert.assertEquals(g.getPL(), new int[3]);
}

@Test
public void testNoReadsReblockedOutputAsNoCall() {
//There's a very similar test for Gnarly, but we expect the outputs to be a bit different here
//note these are b37 data
File no_reads = new File(toolsTestDir, "/walkers/GnarlyGenotyper/testNoReads.rb.g.vcf");
//this is an artisanal, hand-crafted VCF with a QUAL approx that's been artificially enhanced
File fake_variant = new File(toolsTestDir, "/walkers/GnarlyGenotyper/fake_sample2.rb.g.vcf");
final SimpleInterval interval = new SimpleInterval("20", 10000000, 10000000);
File tempGdb = GenomicsDBTestUtils.createTempGenomicsDB(Arrays.asList(no_reads, fake_variant), interval);
final String genomicsDBUri = GenomicsDBTestUtils.makeGenomicsDBUri(tempGdb);

final File output = createTempFile("checkNoCall", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(b37_reference_20_21)
.addVCF(genomicsDBUri)
.addInterval(interval)
.addOutput(output);
runCommandLine(args);

final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
Assert.assertEquals(outputData.getRight().size(), 1);
final VariantContext vc = outputData.getRight().get(0);
Assert.assertEquals(vc.getGenotypes().size(), 2);
final Genotype g = vc.getGenotype("GTEX-RVPV-0003");
Assert.assertTrue(g.isNoCall()); //most importantly!
Assert.assertTrue(g.hasGQ());
Assert.assertEquals(g.getGQ(), 0);
Assert.assertTrue(g.hasDP());
Assert.assertEquals(g.getDP(), 0);
Assert.assertTrue(g.hasAD()); //this is different from Gnarly behavior
Assert.assertFalse(g.hasPL());
}
}
Loading

0 comments on commit b0463e4

Please sign in to comment.