Skip to content

Commit

Permalink
Fix no-call issue
Browse files Browse the repository at this point in the history
  • Loading branch information
ldgauthier committed Jul 14, 2023
1 parent 56f8dd4 commit 487b91c
Show file tree
Hide file tree
Showing 7 changed files with 215 additions and 155 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,6 @@ public GnarlyGenotyperEngine(final boolean keepAllSites, final int maxAltAlleles
this.keepAllSites = keepAllSites;
this.stripASAnnotations = stripASAnnotations;

/*final GenotypeLikelihoodCalculators GLCprovider = new GenotypeLikelihoodCalculators();
//initialize PL size cache -- HTSJDK cache only goes up to 4 alts, but I need 6
likelihoodSizeCache = new int[maxAltAllelesToOutput + 1 + 1]; //+1 for ref and +1 so index == numAlleles
glcCache.add(null); //add a null at index zero because zero alleles (incl. ref) makes no sense
for (final int numAlleles : IntStream.rangeClosed(1, maxAltAllelesToOutput + 1).boxed().collect(Collectors.toList())) {
likelihoodSizeCache[numAlleles] = GenotypeLikelihoods.numLikelihoods(numAlleles, ASSUMED_PLOIDY);
//GL calculator cache is indexed by the total number of alleles, including ref
glcCache.add(numAlleles, GenotypeIndexCalculator.genotypeCount(2, numAlleles)GLCprovider.getInstance(ASSUMED_PLOIDY, numAlleles));
}*/

//TODO: fix weird reflection logging?
final Reflections reflections = new Reflections("org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific");
allASAnnotations = reflections.getSubTypesOf(InfoFieldAnnotation.class);
Expand Down Expand Up @@ -327,15 +316,15 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis

for ( final Genotype g : vc.getGenotypes() ) {
final String name = g.getSampleName();
if(g.getPloidy() != ASSUMED_PLOIDY && !isGDBnoCall(g)) {
throw new UserException.BadInput("This tool assumes diploid genotypes, but sample " + name + " has ploidy "
+ g.getPloidy() + " at position " + vc.getContig() + ":" + vc.getStart() + ".");
}
final Genotype calledGT;
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
genotypeBuilder.name(name);
if (isGDBnoCall(g) || g.getAlleles().contains(Allele.NON_REF_ALLELE)) {
if (g.getAlleles().contains(Allele.NON_REF_ALLELE)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(g.getPloidy())).noGQ();

Check warning on line 323 in src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java#L323

Added line #L323 was not covered by tests
//there will be cases when we're running over Y or haploid X and we haven't seen any variants yet
// and we assume diploid when we shouldn't, but there's not a lot to be done about it
} else if (isGDBnoCall(g)) {
genotypeBuilder.alleles(GATKVariantContextUtils.noCallAlleles(ASSUMED_PLOIDY)).noGQ();
}
if (nonRefReturned && g.hasAD()) {
final int[] AD = trimADs(g, targetAlleles.size());
Expand All @@ -344,7 +333,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis
if (g.hasPL()) {
//lookup PL size from cache if ploidy matches and cache has our number of alts
if (maximumAlleleCount <= maxAllelesToOutput && g.getPloidy() == ASSUMED_PLOIDY) {
newPLsize = likelihoodSizeCache[numConcreteAlts + 1]; //cache is indexed by #alleles with ref; don't count NON_REF
newPLsize = GenotypeIndexCalculator.genotypeCount(g.getPloidy(), numConcreteAlts+1);
//newPLsize = likelihoodSizeCache[numConcreteAlts + 1]; //cache is indexed by #alleles with ref; don't count NON_REF
//otherwise calculate size on the fly
} else {
newPLsize = GenotypeLikelihoods.numLikelihoods(numConcreteAlts + 1, g.getPloidy());
Expand Down Expand Up @@ -406,24 +396,6 @@ protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb,
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.SET_TO_NO_CALL,

Check warning on line 396 in src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/walkers/gnarlyGenotyper/GnarlyGenotyperEngine.java#L396

Added line #L396 was not covered by tests
genotypeLikelihoods, allelesToUse, null);
} else {
/*final int maxAllelesToOutput = maxAltAllelesToOutput + 1; //+1 for ref
final int maxLikelihoodIndex = MathUtils.maxElementIndex(genotypeLikelihoods);
GenotypeLikelihoodCalculator glCalc;
if ( allelesToUse.size() <= maxAllelesToOutput && g.getPloidy() == ASSUMED_PLOIDY) {
glCalc = glcCache.get(allelesToUse.size());
} else {
final GenotypeLikelihoodCalculators GLCprovider = new GenotypeLikelihoodCalculators();
glCalc = GLCprovider.getInstance(g.getPloidy(), allelesToUse.size());
}
final GenotypeAlleleCounts alleleCounts = glCalc.genotypeAlleleCountsAt(maxLikelihoodIndex);
gb.alleles(alleleCounts.asAlleleList(allelesToUse));
final int numAltAlleles = allelesToUse.size() - 1;
if ( numAltAlleles > 0 ) {
gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(maxLikelihoodIndex, genotypeLikelihoods));
}*/
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
genotypeLikelihoods, allelesToUse, null);
}
Expand All @@ -432,7 +404,7 @@ protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb,
/**
* Some legacy versions of GenomicsDB report no-calls as `0` or `.`
* @param g genotype to check
* @return true if this is a genotype that should be represented as a ploidy-aware, spec compliant no-call
* @return true if this is a genotype that should be represented spec compliant no-call (may need to fix ploidy)
*/
private static boolean isGDBnoCall(final Genotype g) {
return !g.hasPL() && !g.hasAD() && g.isNoCall();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.broadinstitute.hellbender.tools.walkers;

import com.google.errorprone.annotations.Var;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
Expand All @@ -11,7 +10,6 @@
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AS_RMSMappingQuality;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
Expand All @@ -25,7 +23,6 @@

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
Expand Down Expand Up @@ -56,6 +53,7 @@ public void assertThatExpectedOutputUpdateToggleIsDisabled() {
@DataProvider(name="VCFdata")
public Object[][] getVCFdata() {
return new Object[][]{
/*
//chrX haploid sample plus diploid sample -- expected results validated with vcf-validator (samtools?)
{new File[]{getTestFile("NA12891.chrX.haploid.rb.g.vcf"), getTestFile("NA12892.chrX.diploid.rb.g.vcf")},
getTestFile("haploidPlusDiploid.expected.vcf"), null, Arrays.asList(new SimpleInterval("chrX", 1000000, 5000000)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
Expand All @@ -66,7 +64,8 @@ public Object[][] getVCFdata() {
//6 ALT alleles -- yes PLs
{new File[]{getTestFile("sample6.vcf"), getTestFile("sample7.vcf"), getTestFile("sample8.vcf")},
getTestFile("lotsOfAltsYesPLs.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 257008, 257008)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
// Simple Test, spanning deletions; standard calling confidence
*/
// Simple Test, spanning deletions; standard calling confidence
//No variants outside requested intervals; no SNPs with QUAL < 60, no INDELs with QUAL < 69?; has star alleles after deletion at chr20:263497; has AC, AF, AN, DP, ExcessHet, FS, MQ, (MQRankSum), (ReadPosRankSum), SOR, QD; has called genotypes
{new File[]{getTestFile("sample1.vcf"), getTestFile("sample2.vcf"), getTestFile("sample3.vcf"), getTestFile("sample4.vcf"), getTestFile("sample5.vcf")},
getTestFile("fiveSampleTest.vcf"), null, Arrays.asList(new SimpleInterval("chr20", 251370, 252000), new SimpleInterval("chr20", 263000, 265600)), Arrays.asList("--merge-input-intervals", "--only-output-calls-starting-in-intervals"), b38_reference_20_21},
Expand Down Expand Up @@ -226,10 +225,10 @@ public void testOnEmptyAnnotations() {

@Test
public void testHaploidInput() {
final File haploidGVCF = new File(getToolTestDataDir(), "haploid.mini.g.vcf");
final File haploidGVCF = new File(getToolTestDataDir(), "chrY_haploid_dragen.g.vcf");
final File output = createTempFile("GnarlyGenotyper", ".vcf");
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(b37_reference_20_21))
args.addReference(new File(hg38Reference))
.add("V", haploidGVCF)
.addOutput(output)
.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false");
Expand All @@ -244,6 +243,6 @@ public void testHaploidInput() {
Assert.assertEquals(g.getAlleles().get(0), Allele.ALT_A);
Assert.assertTrue(g.hasPL());
Assert.assertEquals(g.getPL().length, 2);
Assert.assertEquals(g.getPL(), new int[]{1169, 0});
Assert.assertEquals(g.getPL(), new int[]{83,0});
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths (counting only informative reads out of the total reads) for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions for alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=F1R2,Number=R,Type=Integer,Description="Count of reads in F1R2 pair orientation supporting each allele">
##FORMAT=<ID=F2R1,Number=R,Type=Integer,Description="Count of reads in F2R1 pair orientation supporting each allele">
##FORMAT=<ID=GP,Number=G,Type=Float,Description="Phred-scaled posterior probabilities for genotypes as defined in the VCF specification">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=ICNT,Number=2,Type=Integer,Description="Counts of INDEL informative reads based on the reference confidence model">
##FORMAT=<ID=MB,Number=4,Type=Integer,Description="Per-sample component statistics to detect mate bias">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PRI,Number=G,Type=Float,Description="Phred-scaled prior probabilities for genotypes">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias">
##FORMAT=<ID=SPL,Number=.,Type=Integer,Description="Normalized, Phred-scaled likelihoods for SNPs based on the reference confidence model">
##FORMAT=<ID=SQ,Number=1,Type=Float,Description="Somatic quality">
##GVCFBlock0-20=minGQ=0(inclusive),maxGQ=20(exclusive)
##GVCFBlock20-30=minGQ=20(inclusive),maxGQ=30(exclusive)
##GVCFBlock30-40=minGQ=30(inclusive),maxGQ=40(exclusive)
##GVCFBlock40-100=minGQ=40(inclusive),maxGQ=100(exclusive)
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AS_BaseQRankSum,Number=A,Type=Float,Description="allele specific Z-score from Wilcoxon rank sum test of each Alt Vs. Ref base qualities">
##INFO=<ID=AS_FS,Number=A,Type=Float,Description="allele specific phred-scaled p-value using Fisher's exact test to detect strand bias of each alt allele">
##INFO=<ID=AS_InbreedingCoeff,Number=A,Type=Float,Description="Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=AS_MQ,Number=A,Type=Float,Description="Allele-specific RMS Mapping Quality">
##INFO=<ID=AS_MQRankSum,Number=A,Type=Float,Description="Allele-specific Mapping Quality Rank Sum">
##INFO=<ID=AS_QD,Number=A,Type=Float,Description="Allele-specific Variant Confidence/Quality by Depth">
##INFO=<ID=AS_QUALapprox,Number=1,Type=String,Description="Allele-specific QUAL approximations">
##INFO=<ID=AS_ReadPosRankSum,Number=A,Type=Float,Description="allele specific Z-score from Wilcoxon rank sum test of each Alt vs. Ref read position bias">
##INFO=<ID=AS_SOR,Number=A,Type=Float,Description="Allele specific strand Odds Ratio of 2x|Alts| contingency table to detect allele specific strand bias">
##INFO=<ID=AS_VarDP,Number=1,Type=String,Description="Allele-specific (informative) depth over variant genotypes -- including ref, RAW format">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=FractionInformativeReads,Number=1,Type=Float,Description="The fraction of informative reads out of the total reads">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=LOD,Number=1,Type=Float,Description="Variant LOD score">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=MQ_DP,Number=1,Type=Integer,Description="Depth over variant samples for better MQ calculation (deprecated -- use RAW_MQandDP instead.)">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=QUALapprox,Number=1,Type=Integer,Description="Sum of PL[0] values; used to approximate the QUAL score">
##INFO=<ID=R2_5P_bias,Number=1,Type=Float,Description="Score based on mate bias and distance from 5 prime end">
##INFO=<ID=RAW_GT_COUNT,Number=3,Type=Integer,Description="Counts of genotypes w.r.t. the reference allele in the following order: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##INFO=<ID=VarDP,Number=1,Type=Integer,Description="(informative) depth over variant genotypes">
##contig=<ID=chr1,length=248956422>
##contig=<ID=chr2,length=242193529>
##contig=<ID=chr3,length=198295559>
##contig=<ID=chr4,length=190214555>
##contig=<ID=chr5,length=181538259>
##contig=<ID=chr6,length=170805979>
##contig=<ID=chr7,length=159345973>
##contig=<ID=chr8,length=145138636>
##contig=<ID=chr9,length=138394717>
##contig=<ID=chr10,length=133797422>
##contig=<ID=chr11,length=135086622>
##contig=<ID=chr12,length=133275309>
##contig=<ID=chr13,length=114364328>
##contig=<ID=chr14,length=107043718>
##contig=<ID=chr15,length=101991189>
##contig=<ID=chr16,length=90338345>
##contig=<ID=chr17,length=83257441>
##contig=<ID=chr18,length=80373285>
##contig=<ID=chr19,length=58617616>
##contig=<ID=chr20,length=64444167>
##contig=<ID=chr21,length=46709983>
##contig=<ID=chr22,length=50818468>
##contig=<ID=chrX,length=156040895>
##contig=<ID=chrY,length=57227415>
##source=ReblockGVCF
##source=SelectVariants
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1
chrY 2781480 . G <NON_REF> . . END=2790744 GT:DP:GQ 0:11:40
chrY 2790745 . A <NON_REF> . . END=2790747 GT:DP:GQ 0:8:0
chrY 2790748 . T A,<NON_REF> 73.84 . AS_QUALapprox=|83|0;AS_VarDP=0|8|0;DP=8;MQ=250.00;QUALapprox=83;RAW_GT_COUNT=0,0,1;RAW_MQandDP=500000,8;VarDP=8 GT:AD:AF:DP:F1R2:F2R1:GP:GQ:ICNT:MB:PL:PRI:SB:SPL 1:0,8,0:1.000,0.000:8:0,3,0:0,5,0:7.3841e+01,2.5886e-07,1.9831e+02:74:0,9:0,0,6,2:83,0,173:0.00,9.00,34.77:0,0,3,5:255,0
chrY 2790749 . A <NON_REF> . . END=2790749 GT:DP:GQ 0:8:0
Binary file not shown.
Loading

0 comments on commit 487b91c

Please sign in to comment.