Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add haploid support to GnarlyGenotyper #7750

Merged
merged 3 commits into from
Jul 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,6 @@
@BetaFeature
public final class GnarlyGenotyper extends VariantWalker {

public static final int PIPELINE_MAX_ALT_COUNT = GenotypeCalculationArgumentCollection.DEFAULT_MAX_ALTERNATE_ALLELES;

private static final OneShotLogger warning = new OneShotLogger(GnarlyGenotyper.class);

private static final boolean CALL_GENOTYPES = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.*;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAlleleCounts;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypesCache;
import org.broadinstitute.hellbender.tools.walkers.genotyper.*;
import org.broadinstitute.hellbender.utils.GenotypeCounts;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
Expand Down Expand Up @@ -41,6 +39,7 @@

// cache the ploidy 2 PL array sizes for increasing numbers of alts up to the maximum of maxAltAllelesToOutput
private int[] likelihoodSizeCache;
private final ArrayList<GenotypeLikelihoodCalculator> glcCache = new ArrayList<>();
private Set<Class<? extends InfoFieldAnnotation>> allASAnnotations;

private final int maxAltAllelesToOutput;
Expand All @@ -58,12 +57,6 @@
this.keepAllSites = keepAllSites;
this.stripASAnnotations = stripASAnnotations;

//initialize PL size cache -- HTSJDK cache only goes up to 4 alts, but I need 6
likelihoodSizeCache = new int[maxAltAllelesToOutput + 1 + 1]; //+1 for ref and +1 so index == numAlleles
for (final int numAlleles : IntStream.rangeClosed(1, maxAltAllelesToOutput + 1).boxed().collect(Collectors.toList())) {
likelihoodSizeCache[numAlleles] = GenotypeLikelihoods.numLikelihoods(numAlleles, ASSUMED_PLOIDY);
}

//TODO: fix weird reflection logging?
final Reflections reflections = new Reflections("org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific");
allASAnnotations = reflections.getSubTypesOf(InfoFieldAnnotation.class);
Expand Down Expand Up @@ -320,34 +313,32 @@
int newPLsize = -1;
final int maximumAlleleCount = inputAllelesWithNonRef.size();
final int numConcreteAlts = maximumAlleleCount - 2; //-1 for NON_REF and -1 for ref
if (maximumAlleleCount <= maxAllelesToOutput) {
newPLsize = likelihoodSizeCache[numConcreteAlts + 1]; //cache is indexed by #alleles with ref; don't count NON_REF
} else {
newPLsize = GenotypeLikelihoods.numLikelihoods(numConcreteAlts + 1, ASSUMED_PLOIDY);
}

for ( final Genotype g : vc.getGenotypes() ) {
final String name = g.getSampleName();
if(g.getPloidy() != ASSUMED_PLOIDY && !isGDBnoCall(g)) {
throw new UserException.BadInput("This tool assumes diploid genotypes, but sample " + name + " has ploidy "
+ g.getPloidy() + " at position " + vc.getContig() + ":" + vc.getStart() + ".");
}
final Genotype calledGT;
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
genotypeBuilder.name(name);
if (isGDBnoCall(g) || g.getAllele(0).equals(Allele.NON_REF_ALLELE) || g.getAllele(1).equals(Allele.NON_REF_ALLELE)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY));
if (g.getAlleles().contains(Allele.NON_REF_ALLELE)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())).noGQ();

Check warning on line 323 in src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java#L323

Added line #L323 was not covered by tests
//there will be cases when we're running over Y or haploid X and we haven't seen any variants yet
// and we assume diploid when we shouldn't, but there's not a lot to be done about it
} else if (isGDBnoCall(g)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
}
else if (nonRefReturned) {
if (g.hasAD()) {
final int[] AD = trimADs(g, targetAlleles.size());
genotypeBuilder.AD(AD);
}
else if (g.countAllele(Allele.NON_REF_ALLELE) > 0) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
}
if (nonRefReturned && g.hasAD()) {
final int[] AD = trimADs(g, targetAlleles.size());
genotypeBuilder.AD(AD);
}
if (g.hasPL()) {
//lookup PL size from cache if ploidy matches and cache has our number of alts
if (maximumAlleleCount <= maxAllelesToOutput && g.getPloidy() == ASSUMED_PLOIDY) {
newPLsize = GenotypeIndexCalculator.genotypeCount(g.getPloidy(), numConcreteAlts+1);
//newPLsize = likelihoodSizeCache[numConcreteAlts + 1]; //cache is indexed by #alleles with ref; don't count NON_REF
//otherwise calculate size on the fly
} else {
newPLsize = GenotypeLikelihoods.numLikelihoods(numConcreteAlts + 1, g.getPloidy());
}
final int[] PLs = trimPLs(g, newPLsize);
if (emitPLs) {
genotypeBuilder.PL(PLs);
Expand All @@ -357,7 +348,7 @@
genotypeBuilder.GQ(MathUtils.secondSmallestMinusSmallest(PLs, 0));
//If GenomicsDB returns no-call genotypes like CombineGVCFs (depending on the GenomicsDBExportConfiguration),
// then we need to actually find the GT from PLs
makeGenotypeCall(genotypeBuilder, GenotypeLikelihoods.fromPLs(PLs).getAsVector(), targetAlleles);
makeGenotypeCall(g, genotypeBuilder, GenotypeLikelihoods.fromPLs(PLs).getAsVector(), targetAlleles);
}
final Map<String, Object> attrs = new HashMap<>(g.getExtendedAttributes());
attrs.remove(GATKVCFConstants.MIN_DP_FORMAT_KEY);
Expand All @@ -370,7 +361,7 @@
}

//running total for AC values
for (int i = 0; i < ASSUMED_PLOIDY; i++) {
for (int i = 0; i < calledGT.getPloidy(); i++) {
final Allele a = calledGT.getAllele(i);
final int count = targetAlleleCounts.getOrDefault(a, 0);
if (!a.equals(Allele.NO_CALL)) {
Expand All @@ -391,37 +382,32 @@
* For a genotype with likelihoods that has a no-call GT, determine the most likely genotype from PLs and set it
* We use a GenotypeLikelihoodCalculator to convert from the best PL index to the indexes of the alleles for that
* genotype so we can set the GenotypeBuilder with the alleles
* @param g Genotype used to make the gb GenotypeBuilder
* @param gb GenotypeBuilder to modify and pass back
* @param genotypeLikelihoods PLs to use to call genotype; count should agree with number of alleles in allelesToUse
* @param allelesToUse alleles in the parent VariantContext (with ref), because GenotypeBuilder needs the allele String rather than index
*/
@VisibleForTesting
protected void makeGenotypeCall(final GenotypeBuilder gb,
protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb,
final double[] genotypeLikelihoods,
final List<Allele> allelesToUse) {
final int maxAllelesToOutput = maxAltAllelesToOutput + 1; //+1 for ref

if ( genotypeLikelihoods == null || !GATKVariantContextUtils.isInformative(genotypeLikelihoods) ) {
gb.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
//gb.alleles(GATKVariantContextUtils.noCallAlleles(g.getAlleles().size())).noGQ();
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.SET_TO_NO_CALL,

Check warning on line 396 in src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java#L396

Added line #L396 was not covered by tests
genotypeLikelihoods, allelesToUse, null);
} else {
final int maxLikelihoodIndex = MathUtils.maxElementIndex(genotypeLikelihoods);
final GenotypeAlleleCounts alleleCounts = GenotypesCache.get(ASSUMED_PLOIDY, maxLikelihoodIndex);

gb.alleles(alleleCounts.asAlleleList(allelesToUse));
final int numAltAlleles = allelesToUse.size() - 1;
if ( numAltAlleles > 0 ) {
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(maxLikelihoodIndex, genotypeLikelihoods));
}
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
genotypeLikelihoods, allelesToUse, null);
}
}

/**
* Some legacy versions of GenomicsDB report no-calls as `0` or `.`
* @param g genotype to check
* @return true if this is a genotype that should be represented as a ploidy-aware, spec compliant no-call
* @return true if this is a genotype that should be represented spec compliant no-call (may need to fix ploidy)
*/
private static boolean isGDBnoCall(final Genotype g) {
return g.getPloidy() == 1 && (g.getAllele(0).isReference() || g.getAllele(0).isNoCall());
return !g.hasPL() && !g.hasAD() && g.isNoCall();
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.broadinstitute.hellbender.tools.walkers;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
Expand All @@ -8,7 +10,6 @@
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RMSMappingQuality;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
Expand All @@ -22,7 +23,6 @@

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -53,13 +53,19 @@ public void assertThatExpectedOutputUpdateToggleIsDisabled() {
@DataProvider(name="VCFdata")
public Object[][] getVCFdata() {
return new Object[][]{
/*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you want this to be commented out?

//chrX haploid sample plus diploid sample -- expected results validated with vcf-validator (samtools?)
{new File[]{getTestFile("NA12891.chrX.haploid.rb.g.vcf"), getTestFile("NA12892.chrX.diploid.rb.g.vcf")},
getTestFile("haploidPlusDiploid.expected.vcf"), null, Arrays.asList(new SimpleInterval("chrX", 1000000, 5000000)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},

//8 ALT alleles -- no PLs
{new File[]{getTestFile("sample6.vcf"), getTestFile("sample7.vcf"), getTestFile("sample8.vcf"), getTestFile("sample9.vcf")},
getTestFile("lotsOfAltsNoPLs.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 257008, 257008)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
//6 ALT alleles -- yes PLs
{new File[]{getTestFile("sample6.vcf"), getTestFile("sample7.vcf"), getTestFile("sample8.vcf")},
getTestFile("lotsOfAltsYesPLs.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 257008, 257008)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
// Simple Test, spanning deletions; standard calling confidence
*/
// Simple Test, spanning deletions; standard calling confidence
//No variants outside requested intervals; no SNPs with QUAL < 60, no INDELs with QUAL < 69?; has star alleles after deletion at chr20:263497; has AC, AF, AN, DP, ExcessHet, FS, MQ, (MQRankSum), (ReadPosRankSum), SOR, QD; has called genotypes
{new File[]{getTestFile("sample1.vcf"), getTestFile("sample2.vcf"), getTestFile("sample3.vcf"), getTestFile("sample4.vcf"), getTestFile("sample5.vcf")},
getTestFile("fiveSampleTest.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 251370, 252000), new SimpleInterval("chr20", 263000, 265600)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
Expand Down Expand Up @@ -217,4 +223,26 @@ public void testOnEmptyAnnotations() {
Assert.assertEquals(sors.get(1), VCFConstants.MISSING_VALUE_v4);
}

@Test
public void testHaploidInput() {
final File haploidGVCF = new File(getToolTestDataDir(), "chrY_haploid_dragen.g.vcf");
final File output = createTempFile("GnarlyGenotyper", ".vcf");
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(hg38Reference))
.add("V", haploidGVCF)
.addOutput(output)
.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");
runCommandLine(args);

final Pair<VCFHeader, List<VariantContext>> outputData = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
Assert.assertEquals(outputData.getRight().size(), 1);
final VariantContext vc = outputData.getRight().get(0);
Assert.assertEquals(vc.getGenotypes().size(), 1);
final Genotype g = vc.getGenotype(0);
Assert.assertEquals(g.getPloidy(), 1);
Assert.assertEquals(g.getAlleles().get(0), Allele.ALT_A);
Assert.assertTrue(g.hasPL());
Assert.assertEquals(g.getPL().length, 2);
Assert.assertEquals(g.getPL(), new int[]{83,0});
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,19 +61,19 @@ public void testLotsOfAlts() {
//use more alts than the maxAltAllelesToOutput for the engine, forcing on-the-fly generation of GLCalculator not in the cache
@Test
public void testGenotypeCallForLotsOfAlts() {
final GnarlyGenotyperEngine engine = new GnarlyGenotyperEngine(false, 4, false, true);
final GnarlyGenotyperEngine engine = new GnarlyGenotyperEngine(false, 4, true);

final Genotype g1 = VariantContextTestUtils.makeG("g1", oneInserted, twoInserted, sample1pls);
final Genotype g2 = VariantContextTestUtils.makeG("g1", Aref, oneInserted, sample2pls);
final VariantContext vc = VariantContextTestUtils.makeVC("test", Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats), g1, g2);

final GenotypeBuilder builder1 = new GenotypeBuilder(g1);
engine.makeGenotypeCall(builder1, GenotypeLikelihoods.fromPLs(sample1pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
engine.makeGenotypeCall(g1, builder1, GenotypeLikelihoods.fromPLs(sample1pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
final List<Allele> calledAlleles1 = builder1.make().getAlleles();
Assert.assertTrue(calledAlleles1.size() == 2 && calledAlleles1.contains(oneInserted) && calledAlleles1.contains(twoInserted));

final GenotypeBuilder builder2 = new GenotypeBuilder(g2);
engine.makeGenotypeCall(builder2, GenotypeLikelihoods.fromPLs(sample2pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
engine.makeGenotypeCall(g2, builder2, GenotypeLikelihoods.fromPLs(sample2pls).getAsVector(), Arrays.asList(Aref, oneInserted, twoInserted, threeInserted, fourRepeats, fiveRepeats));
final List<Allele> calledAlleles2 = builder2.make().getAlleles();
Assert.assertTrue(calledAlleles2.size() == 2 && calledAlleles2.contains(Aref) && calledAlleles2.contains(oneInserted));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,7 @@ public void testMultipleInputs() {
}

@Test(expectedExceptions = UserException.class)
//ReblockGVCF can take multiple inputs, but only if they're non-overlapping shards from the same sample
public void testMixedSamples() {
final File output = createTempFile("reblockedgvcf", ".vcf");
final ArgumentsBuilder args = new ArgumentsBuilder();
Expand Down
Loading