Skip to content

Commit

Permalink
Fixed a bug in ReblockGVCF that could cause the first position on a c…
Browse files Browse the repository at this point in the history
…ontig to be dropped. (#8028)
  • Loading branch information
samuelklee committed Sep 23, 2022
1 parent 663dadc commit dd3826f
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ private void regenotypeVC(final VariantContext originalVC) {
//variants with PL[0] less than threshold get turned to homRef with PL=[0,0,0], shouldn't get INFO attributes
//make sure we can call het variants with GQ >= rgqThreshold in joint calling downstream
if(shouldBeReblocked(result)) {
if (vcfWriter.getVcfOutputEnd() != null && result.getEnd() <= vcfWriter.getVcfOutputEnd().getEnd()) {
if (vcfWriter.getVcfOutputEnd() != null && result.contigsMatch(vcfWriter.getVcfOutputEnd()) && result.getEnd() <= vcfWriter.getVcfOutputEnd().getEnd()) {
//variant is entirely overlapped by variants already output to the VCF, so drop it
return;
}
Expand Down Expand Up @@ -488,7 +488,7 @@ public VariantContextBuilder lowQualVariantToGQ0HomRef(final VariantContext lowQ
return null;
}

final Map<String, Object> attrMap = new HashMap<>();
final Map<String, Object> attrMap = new LinkedHashMap<>();

//this method does a lot of things, including fixing alleles and adding the END key
final GenotypeBuilder gb = changeCallToHomRefVersusNonRef(lowQualityVariant, attrMap); //note that gb has all zero PLs
Expand All @@ -497,7 +497,7 @@ public VariantContextBuilder lowQualVariantToGQ0HomRef(final VariantContext lowQ

final Genotype newG = gb.make();
builder.alleles(Arrays.asList(newG.getAlleles().get(0), Allele.NON_REF_ALLELE)).genotypes(newG);
if (vcfWriter.getVcfOutputEnd() != null && lowQualityVariant.getStart() <= vcfWriter.getVcfOutputEnd().getStart()) {
if (vcfWriter.getVcfOutputEnd() != null && lowQualityVariant.contigsMatch(vcfWriter.getVcfOutputEnd()) && lowQualityVariant.getStart() <= vcfWriter.getVcfOutputEnd().getStart()) {
final int newStart = vcfWriter.getVcfOutputEnd().getEnd() + 1;
if (newStart > lowQualityVariant.getEnd()) {
return null;
Expand Down Expand Up @@ -578,7 +578,7 @@ protected GenotypeBuilder changeCallToHomRefVersusNonRef(final VariantContext lo
*/
@VisibleForTesting
VariantContext cleanUpHighQualityVariant(final VariantContext variant) {
final Map<String, Object> attrMap = new HashMap<>();
final Map<String, Object> attrMap = new LinkedHashMap<>();

final Genotype genotype = getCalledGenotype(variant);
VariantContextBuilder builder = new VariantContextBuilder(variant); //QUAL from result is carried through
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -479,4 +479,28 @@ public void testTreeScoreThreshold() {
}
}
}

/**
* Regression test for https://github.com/broadinstitute/gatk/issues/7884 using 2-record snippet of gVCF discussed there.
*/
@Test
public void testFirstPositionOnContigNotDropped() {
final File input = getTestFile("testFirstPositionOnContigNotDropped.g.vcf");
final File output = createTempFile("reblockedgvcf", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(hg38Reference))
.add("V", input)
.add("L", "chr12:18282464") // in the original gVCF, 18282464 is the first variant position on chr12 that is greater than the dropped position 18173860 on chr13
.add("L", "chr13:18173860")
.addOutput(output);
runCommandLine(args);

// we only check that records at both positions are retained
final List<VariantContext> outVCs = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()).getRight();
Assert.assertEquals(outVCs.get(0).getContig(), "chr12");
Assert.assertEquals(outVCs.get(0).getStart(), 18282464);
Assert.assertEquals(outVCs.get(1).getContig(), "chr13");
Assert.assertEquals(outVCs.get(1).getStart(), 18173860);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phasing set (typically the position of the first variant in the set)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock0-10=minGQ=0(inclusive),maxGQ=10(exclusive)
##GVCFBlock10-20=minGQ=10(inclusive),maxGQ=20(exclusive)
##GVCFBlock20-30=minGQ=20(inclusive),maxGQ=30(exclusive)
##GVCFBlock30-40=minGQ=30(inclusive),maxGQ=40(exclusive)
##GVCFBlock40-50=minGQ=40(inclusive),maxGQ=50(exclusive)
##GVCFBlock50-60=minGQ=50(inclusive),maxGQ=60(exclusive)
##GVCFBlock60-70=minGQ=60(inclusive),maxGQ=70(exclusive)
##GVCFBlock70-80=minGQ=70(inclusive),maxGQ=80(exclusive)
##GVCFBlock80-90=minGQ=80(inclusive),maxGQ=90(exclusive)
##GVCFBlock90-100=minGQ=90(inclusive),maxGQ=100(exclusive)
##INFO=<ID=AS_InbreedingCoeff,Number=A,Type=Float,Description="Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=AS_QD,Number=A,Type=Float,Description="Allele-specific Variant Confidence/Quality by Depth">
##INFO=<ID=AS_RAW_BaseQRankSum,Number=1,Type=String,Description="raw data for allele specific rank sum test of base qualities">
##INFO=<ID=AS_RAW_MQ,Number=1,Type=String,Description="Allele-specfic raw data for RMS Mapping Quality">
##INFO=<ID=AS_RAW_MQRankSum,Number=1,Type=String,Description="Allele-specfic raw data for Mapping Quality Rank Sum">
##INFO=<ID=AS_RAW_ReadPosRankSum,Number=1,Type=String,Description="allele specific raw data for rank sum test of read position bias">
##INFO=<ID=AS_SB_TABLE,Number=1,Type=String,Description="Allele-specific forward/reverse read counts for strand bias tests. Includes the reference and alleles separated by |.">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr12,length=133275309,assembly=38>
##contig=<ID=chr13,length=114364328,assembly=38>
##source=HaplotypeCaller
##bcftools_viewVersion=1.10.2-105-g7cd83b7+htslib-1.10.2-110-gda59588
##bcftools_viewCommand=view -r chr12:18282464,chr13:18173860 test.g.vcf.gz; Date=Wed Sep 21 14:52:31 2022
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT test
chr12 18282464 . GCCC G,<NON_REF> 1827.6 . AS_RAW_BaseQRankSum=|-0.6,1|NaN;AS_RAW_MQ=165600.00|169200.00|0.00;AS_RAW_MQRankSum=|0.0,1|NaN;AS_RAW_ReadPosRankSum=|0.4,1|NaN;AS_SB_TABLE=18,28|18,29|0,0;BaseQRankSum=-0.55;DP=95;ExcessHet=0;MLEAC=1,0;MLEAF=0.5,0;MQRankSum=0;RAW_MQandDP=342000,95;ReadPosRankSum=0.449 GT:AD:DP:GQ:PL:SB 0/1:46,47,0:93:99:1835,0,1790,1974,1932,3905:18,28,18,29
chr13 18173860 . A C,<NON_REF> 0 . AS_RAW_BaseQRankSum=|-4.9,1|NaN;AS_RAW_MQ=51256.00|4709.00|0.00;AS_RAW_MQRankSum=|0.5,1|NaN;AS_RAW_ReadPosRankSum=|1.2,1|NaN;AS_SB_TABLE=23,2|1,1|0,0;BaseQRankSum=-4.896;DP=28;ExcessHet=0;MLEAC=0,0;MLEAF=0,0;MQRankSum=0.564;RAW_MQandDP=59565,28;ReadPosRankSum=1.252 GT:AD:DP:GQ:PL:SB 0/0:25,2,0:27:61:0,61,946,75,951,965:23,2,1,1
Binary file not shown.

0 comments on commit dd3826f

Please sign in to comment.