Skip to content

Commit

Permalink
Add haploid genotype support to Gnarly, as for DRAGEN chrX
Browse files Browse the repository at this point in the history
  • Loading branch information
ldgauthier committed Jul 12, 2023
1 parent e0090de commit 060d139
Show file tree
Hide file tree
Showing 10 changed files with 70,659 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,6 @@
@BetaFeature
public final class GnarlyGenotyper extends VariantWalker {

public static final int PIPELINE_MAX_ALT_COUNT = GenotypeCalculationArgumentCollection.DEFAULT_MAX_ALTERNATE_ALLELES;

private static final OneShotLogger warning = new OneShotLogger(GnarlyGenotyper.class);

private static final boolean CALL_GENOTYPES = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public final class GnarlyGenotyperEngine {

// cache the ploidy 2 PL array sizes for increasing numbers of alts up to the maximum of maxAltAllelesToOutput
private int[] likelihoodSizeCache;
private final ArrayList<GenotypeLikelihoodCalculator> glcCache = new ArrayList<>();
private Set<Class<? extends InfoFieldAnnotation>> allASAnnotations;

private final int maxAltAllelesToOutput;
Expand All @@ -58,10 +59,15 @@ public GnarlyGenotyperEngine(final boolean keepAllSites, final int maxAltAlleles
this.keepAllSites = keepAllSites;
this.stripASAnnotations = stripASAnnotations;

final GenotypeLikelihoodCalculators GLCprovider = new GenotypeLikelihoodCalculators();

//initialize PL size cache -- HTSJDK cache only goes up to 4 alts, but I need 6
likelihoodSizeCache = new int[maxAltAllelesToOutput + 1 + 1]; //+1 for ref and +1 so index == numAlleles
glcCache.add(null); //add a null at index zero because zero alleles (incl. ref) makes no sense
for (final int numAlleles : IntStream.rangeClosed(1, maxAltAllelesToOutput + 1).boxed().collect(Collectors.toList())) {
likelihoodSizeCache[numAlleles] = GenotypeLikelihoods.numLikelihoods(numAlleles, ASSUMED_PLOIDY);
//GL calculator cache is indexed by the total number of alleles, including ref
glcCache.add(numAlleles, GLCprovider.getInstance(ASSUMED_PLOIDY, numAlleles));
}

//TODO: fix weird reflection logging?
Expand Down Expand Up @@ -320,11 +326,6 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis
int newPLsize = -1;
final int maximumAlleleCount = inputAllelesWithNonRef.size();
final int numConcreteAlts = maximumAlleleCount - 2; //-1 for NON_REF and -1 for ref
if (maximumAlleleCount <= maxAllelesToOutput) {
newPLsize = likelihoodSizeCache[numConcreteAlts + 1]; //cache is indexed by #alleles with ref; don't count NON_REF
} else {
newPLsize = GenotypeLikelihoods.numLikelihoods(numConcreteAlts + 1, ASSUMED_PLOIDY);
}

for ( final Genotype g : vc.getGenotypes() ) {
final String name = g.getSampleName();
Expand All @@ -335,19 +336,21 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis
final Genotype calledGT;
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
genotypeBuilder.name(name);
if (isGDBnoCall(g) || g.getAllele(0).equals(Allele.NON_REF_ALLELE) || g.getAllele(1).equals(Allele.NON_REF_ALLELE)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY));
if (isGDBnoCall(g) || g.getAlleles().contains(Allele.NON_REF_ALLELE)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())).noGQ();
}
else if (nonRefReturned) {
if (g.hasAD()) {
final int[] AD = trimADs(g, targetAlleles.size());
genotypeBuilder.AD(AD);
}
else if (g.countAllele(Allele.NON_REF_ALLELE) > 0) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
}
if (nonRefReturned && g.hasAD()) {
final int[] AD = trimADs(g, targetAlleles.size());
genotypeBuilder.AD(AD);
}
if (g.hasPL()) {
//lookup PL size from cache if ploidy matches and cache has our number of alts
if (maximumAlleleCount <= maxAllelesToOutput && g.getPloidy() == ASSUMED_PLOIDY) {
newPLsize = likelihoodSizeCache[numConcreteAlts + 1]; //cache is indexed by #alleles with ref; don't count NON_REF
//otherwise calculate size on the fly
} else {
newPLsize = GenotypeLikelihoods.numLikelihoods(numConcreteAlts + 1, g.getPloidy());
}
final int[] PLs = trimPLs(g, newPLsize);
if (emitPLs) {
genotypeBuilder.PL(PLs);
Expand All @@ -370,7 +373,7 @@ else if (g.countAllele(Allele.NON_REF_ALLELE) > 0) {
}

//running total for AC values
for (int i = 0; i < ASSUMED_PLOIDY; i++) {
for (int i = 0; i < calledGT.getPloidy(); i++) {
final Allele a = calledGT.getAllele(i);
final int count = targetAlleleCounts.getOrDefault(a, 0);
if (!a.equals(Allele.NO_CALL)) {
Expand All @@ -391,21 +394,31 @@ else if (g.countAllele(Allele.NON_REF_ALLELE) > 0) {
* For a genotype with likelihoods that has a no-call GT, determine the most likely genotype from PLs and set it
* We use a GenotypeLikelihoodCalculator to convert from the best PL index to the indexes of the alleles for that
* genotype so we can set the GenotypeBuilder with the alleles
* @param g Genotype used to make the gb GenotypeBuilder
* @param gb GenotypeBuilder to modify and pass back
* @param genotypeLikelihoods PLs to use to call genotype; count should agree with number of alleles in allelesToUse
* @param allelesToUse alleles in the parent VariantContext (with ref), because GenotypeBuilder needs the allele String rather than index
*/
@VisibleForTesting
protected void makeGenotypeCall(final GenotypeBuilder gb,
protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb,
final double[] genotypeLikelihoods,
final List<Allele> allelesToUse) {
final int maxAllelesToOutput = maxAltAllelesToOutput + 1; //+1 for ref

if ( genotypeLikelihoods == null || !GATKVariantContextUtils.isInformative(genotypeLikelihoods) ) {
gb.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
gb.alleles(GATKVariantContextUtils.noCallAlleles(g.getAlleles().size())).noGQ();
} else {
final int maxLikelihoodIndex = MathUtils.maxElementIndex(genotypeLikelihoods);
final GenotypeAlleleCounts alleleCounts = GenotypesCache.get(ASSUMED_PLOIDY, maxLikelihoodIndex);

GenotypeLikelihoodCalculator glCalc;
if ( allelesToUse.size() <= maxAllelesToOutput && g.getPloidy() == ASSUMED_PLOIDY) {
glCalc = glcCache.get(allelesToUse.size());
} else {
final GenotypeLikelihoodCalculators GLCprovider = new GenotypeLikelihoodCalculators();
glCalc = GLCprovider.getInstance(g.getPloidy(), allelesToUse.size());
}

final GenotypeAlleleCounts alleleCounts = glCalc.genotypeAlleleCountsAt(maxLikelihoodIndex);

gb.alleles(alleleCounts.asAlleleList(allelesToUse));
final int numAltAlleles = allelesToUse.size() - 1;
Expand All @@ -421,7 +434,7 @@ protected void makeGenotypeCall(final GenotypeBuilder gb,
* @return true if this is a genotype that should be represented as a ploidy-aware, spec compliant no-call
*/
private static boolean isGDBnoCall(final Genotype g) {
return g.getPloidy() == 1 && (g.getAllele(0).isReference() || g.getAllele(0).isNoCall());
return !g.hasPL() && !g.hasAD() && g.isNoCall();
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
package org.broadinstitute.hellbender.tools.walkers;

import com.google.errorprone.annotations.Var;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
Expand Down Expand Up @@ -53,6 +56,10 @@ public void assertThatExpectedOutputUpdateToggleIsDisabled() {
@DataProvider(name="VCFdata")
public Object[][] getVCFdata() {
return new Object[][]{
//chrX haploid sample plus diploid sample -- expected results validated with vcf-validator (samtools?)
{new File[]{getTestFile("NA12891.chrX.haploid.rb.g.vcf"), getTestFile("NA12892.chrX.diploid.rb.g.vcf")},
getTestFile("haploidPlusDiploid.expected.vcf"), null, Arrays.asList(new SimpleInterval("chrX", 1000000, 5000000)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},

//8 ALT alleles -- no PLs
{new File[]{getTestFile("sample6.vcf"), getTestFile("sample7.vcf"), getTestFile("sample8.vcf"), getTestFile("sample9.vcf")},
getTestFile("lotsOfAltsNoPLs.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 257008, 257008)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
Expand Down Expand Up @@ -217,4 +224,26 @@ public void testOnEmptyAnnotations() {
Assert.assertEquals(sors.get(1), VCFConstants.MISSING_VALUE_v4);
}

@Test
public void testHaploidInput() {
final File haploidGVCF = new File(getToolTestDataDir(), "haploid.mini.g.vcf");
final File output = createTempFile("GnarlyGenotyper", ".vcf");
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
.add("V", haploidGVCF)
.addOutput(output)
.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");
runCommandLine(args);

final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
Assert.assertEquals(outputData.getRight().size(), 1);
final VariantContext vc = outputData.getRight().get(0);
Assert.assertEquals(vc.getGenotypes().size(), 1);
final Genotype g = vc.getGenotype(0);
Assert.assertEquals(g.getPloidy(), 1);
Assert.assertEquals(g.getAlleles().get(0), Allele.ALT_A);
Assert.assertTrue(g.hasPL());
Assert.assertEquals(g.getPL().length, 2);
Assert.assertEquals(g.getPL(), new int[]{1169, 0});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,19 @@ public void testLotsOfAlts() {
//use more alts than the maxAltAllelesToOutput for the engine, forcing on-the-fly generation of GLCalculator not in the cache
@Test
public void testGenotypeCallForLotsOfAlts() {
final GnarlyGenotyperEngine engine = new GnarlyGenotyperEngine(false, 4, false, true);
final GnarlyGenotyperEngine engine = new GnarlyGenotyperEngine(false, 4, true);

final Genotype g1 = VariantContextTestUtils.makeG("g1", oneInserted, twoInserted, sample1pls);
final Genotype g2 = VariantContextTestUtils.makeG("g1", Aref, oneInserted, sample2pls);
final VariantContext vc = VariantContextTestUtils.makeVC("test", Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats), g1, g2);

final GenotypeBuilder builder1 = new GenotypeBuilder(g1);
engine.makeGenotypeCall(builder1, GenotypeLikelihoods.fromPLs(sample1pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
engine.makeGenotypeCall(g1, builder1, GenotypeLikelihoods.fromPLs(sample1pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
final List<Allele> calledAlleles1 = builder1.make().getAlleles();
Assert.assertTrue(calledAlleles1.size() == 2 && calledAlleles1.contains(oneInserted) && calledAlleles1.contains(twoInserted));

final GenotypeBuilder builder2 = new GenotypeBuilder(g2);
engine.makeGenotypeCall(builder2, GenotypeLikelihoods.fromPLs(sample2pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
engine.makeGenotypeCall(g2, builder2, GenotypeLikelihoods.fromPLs(sample2pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
final List<Allele> calledAlleles2 = builder2.make().getAlleles();
Assert.assertTrue(calledAlleles2.size() == 2 && calledAlleles2.contains(Aref) && calledAlleles2.contains(oneInserted));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ public void testMultipleInputs() {
}

@Test(expectedExceptions = UserException.class)
//ReblockGVCF can take multiple inputs, but only if they're non-overlapping shards from the same sample
public void testMixedSamples() {
final File output = createTempFile("reblockedgvcf", ".vcf");
final ArgumentsBuilder args = new ArgumentsBuilder();
Expand Down
Loading

0 comments on commit 060d139

Please sign in to comment.