Skip to content

Commit

Permalink
removing A,R, and G length attributes when ReblockGVCF subsets an all…
Browse files Browse the repository at this point in the history
…ele (#8209)
  • Loading branch information
meganshand committed Mar 2, 2023
1 parent 227bbca commit 4c1fb92
Show file tree
Hide file tree
Showing 5 changed files with 3,512 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
import htsjdk.variant.vcf.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.annotator.AnnotationUtils;
import org.broadinstitute.hellbender.utils.genotyper.GenotypePriorCalculator;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.collections.Permutation;
import org.broadinstitute.hellbender.utils.genotyper.IndexedAlleleList;
import org.broadinstitute.hellbender.utils.logging.OneShotLogger;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
Expand All @@ -33,9 +35,18 @@ private AlleleSubsettingUtils() {} // prevent instantiation
private static final int PL_INDEX_OF_HOM_REF = 0;
public static final int NUM_OF_STRANDS = 2; // forward and reverse strands

private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();
private static final OneShotLogger attributesRemovedOneShotLogger = new OneShotLogger(AlleleSubsettingUtils.class);

private static final GenotypeLikelihoodCalculators GL_CALCS = new GenotypeLikelihoodCalculators();

public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, final int defaultPloidy,
final List<Allele> originalAlleles,
final List<Allele> allelesToKeep,
final GenotypePriorCalculator gpc,
final GenotypeAssignmentMethod assignmentMethod) {
//TODO: if other usages of this method should update or remove A,R, or G length annotations then header parsing is necessary and the method below should be used
return subsetAlleles(originalGs, defaultPloidy, originalAlleles, allelesToKeep, gpc, assignmentMethod, Collections.emptyList());
}
/**
* Create the new GenotypesContext with the subsetted PLs and ADs
*
Expand All @@ -45,13 +56,15 @@ private AlleleSubsettingUtils() {} // prevent instantiation
* @param originalAlleles the original alleles
* @param allelesToKeep the subset of alleles to use with the new Genotypes
* @param assignmentMethod assignment strategy for the (subsetted) PLs
* @param alleleBasedLengthAnnots list of annotations that have lengths based on the number of alleles (A, R, and G length types)
* @return a new non-null GenotypesContext
*/
public static GenotypesContext subsetAlleles(final GenotypesContext originalGs, final int defaultPloidy,
final List<Allele> originalAlleles,
final List<Allele> allelesToKeep,
final GenotypePriorCalculator gpc,
final GenotypeAssignmentMethod assignmentMethod) {
final GenotypeAssignmentMethod assignmentMethod,
final List<String> alleleBasedLengthAnnots) {
Utils.nonNull(originalGs, "original GenotypesContext must not be null");
Utils.nonNull(allelesToKeep, "allelesToKeep is null");
Utils.nonEmpty(allelesToKeep, "must keep at least one allele");
Expand Down Expand Up @@ -93,9 +106,23 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
final GenotypeBuilder gb = new GenotypeBuilder(g);
final Map<String, Object> attributes = new HashMap<>(g.getExtendedAttributes());
attributes.remove(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY);
//TODO: remove other G-length attributes, although that may require header parsing
attributes.remove(VCFConstants.GENOTYPE_POSTERIORS_KEY);
attributes.remove(GATKVCFConstants.GENOTYPE_PRIOR_KEY);
final List<String> attributesToRemove = new ArrayList<>();
for(final String attribute : attributes.keySet()) {
if (alleleBasedLengthAnnots.contains(attribute)) {
if (attribute.equals(GATKVCFConstants.F1R2_KEY) || attribute.equals(GATKVCFConstants.F2R1_KEY)) {
final String oldAttributeString = (String) attributes.get(attribute);
final String[] oldAttributeArray = oldAttributeString.split(AnnotationUtils.LIST_DELIMITER);
final int[] oldAttribute = Arrays.stream(oldAttributeArray).mapToInt(Integer::parseInt).toArray();
final int[] newAttribute = getNewAlleleBasedReadCountAnnotation(allelesToKeep, allelePermutation, oldAttribute);
attributes.put(attribute, newAttribute);
} else {
attributesToRemove.add(attribute);
}
}
}
attributesToRemove.forEach(attributes::remove);
gb.noPL().noGQ().noAttributes().attributes(attributes); //if alleles are subset, old PLs and GQ are invalid
if (newLog10GQ != Double.NEGATIVE_INFINITY && g.hasGQ()) { //only put GQ if originally present
gb.log10PError(newLog10GQ);
Expand All @@ -112,19 +139,39 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,

// restrict AD to the new allele subset
if(g.hasAD()) {
final int[] oldAD = g.getAD();
final int[] newAD = IntStream.range(0, allelesToKeep.size()).map(n -> oldAD[allelePermutation.fromIndex(n)]).toArray();
final int nonRefIndex = allelesToKeep.indexOf(Allele.NON_REF_ALLELE);
if (nonRefIndex != -1 && nonRefIndex < newAD.length) {
newAD[nonRefIndex] = 0; //we will "lose" coverage here, but otherwise merging NON_REF AD counts with other alleles "creates" reads
}
final int[] newAD = getNewAlleleBasedReadCountAnnotation(allelesToKeep, allelePermutation, g.getAD());
gb.AD(newAD);
// if we have recalculated AD and the original genotype had AF but was then removed, then recalculate AF based on AD counts
if (alleleBasedLengthAnnots.contains(GATKVCFConstants.ALLELE_FRACTION_KEY) && g.hasExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY)) {
attributesToRemove.remove(GATKVCFConstants.ALLELE_FRACTION_KEY);
final double[] newAFs = MathUtils.normalizeSumToOne(Arrays.stream(newAD).mapToDouble(x -> x).toArray());
gb.attribute(GATKVCFConstants.ALLELE_FRACTION_KEY, Arrays.copyOfRange(newAFs, 1, newAFs.length)); //omit the first entry of the array corresponding to the reference
}
}
if (attributesToRemove.size() > 0) {
attributesRemovedOneShotLogger.warn(() -> "The following attributes have been removed at sites where alleles were subset: " + String.join(",", attributesToRemove));
}
newGTs.add(gb.make());
}
return newGTs;
}

/**
* Subsets allele length annotation to match the alleles that have been kept
* @param allelesToKeep list of alleles that are not being filtered out
* @param allelePermutation permutation that matches the index from the old annotation to the new annotation given which alleles have been removed
* @param oldAnnot the original annotation
* @return the new subset annotation
*/
private static int[] getNewAlleleBasedReadCountAnnotation(List<Allele> allelesToKeep, Permutation<Allele> allelePermutation, int[] oldAnnot) {
final int[] newAnnot = IntStream.range(0, allelesToKeep.size()).map(n -> oldAnnot[allelePermutation.fromIndex(n)]).toArray();
final int nonRefIndex = allelesToKeep.indexOf(Allele.NON_REF_ALLELE);
if (nonRefIndex != -1 && nonRefIndex < newAnnot.length) {
newAnnot[nonRefIndex] = 0; //we will "lose" coverage here, but otherwise merging NON_REF AD counts with other alleles "creates" reads
}
return newAnnot;
}


/**
* Remove alternate alleles from a set of genotypes turning removed alleles to no-call and dropping other per-allele attributes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ public final class ReblockGVCF extends MultiVariantWalker {

private CachingIndexedFastaSequenceFile referenceReader;

private static final List<String> alleleBasedLengthAnnots = new ArrayList<>();

public static class AlleleLengthComparator implements Comparator<Allele> {
public int compare(Allele a1, Allele a2) {
return a1.getBaseString().length() - a2.getBaseString().length();
Expand Down Expand Up @@ -256,6 +258,15 @@ public void onTraversalStart() {
}
}

//Allele length and Genotype length annotations need to be subset or removed if alleles are dropped so we need to parse the header for annotation count types
for(final VCFFormatHeaderLine formatHeaderLine : inputHeader.getFormatHeaderLines()) {
if(formatHeaderLine.getCountType().equals(VCFHeaderLineCount.A) ||
formatHeaderLine.getCountType().equals(VCFHeaderLineCount.R) ||
formatHeaderLine.getCountType().equals(VCFHeaderLineCount.G)) {
alleleBasedLengthAnnots.add(formatHeaderLine.getID());
}
}

if ( dbsnp.dbsnp != null ) {
VCFStandardHeaderLines.addStandardInfoLines(headerLines, true, VCFConstants.DBSNP_KEY);
}
Expand Down Expand Up @@ -599,7 +610,7 @@ VariantContext cleanUpHighQualityVariant(final VariantContext variant) {
if(allelesNeedSubsetting && !keepAllAlts) {
newAlleleSetUntrimmed.removeAll(allelesToDrop);
final GenotypesContext gc = AlleleSubsettingUtils.subsetAlleles(variant.getGenotypes(), genotype.getPloidy(), variant.getAlleles(),
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN);
newAlleleSetUntrimmed, null, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, alleleBasedLengthAnnots);
if (gc.get(0).isHomRef() || !gc.get(0).hasGQ() || gc.get(0).getAlleles().contains(Allele.NO_CALL)) { //could be low quality or no-call after subsetting
if (dropLowQuals) {
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -517,4 +517,38 @@ public void testTreeScoreWithNoAnnotation() {

Assert.assertThrows(UserException.class, () -> runCommandLine(args));
}

@Test
public void testDragenGvcfs() {
final File input = getTestFile("dragen.g.vcf");
final File output = createTempFile("reblockedgvcf", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addReference(new File(hg38Reference))
.add("V", input)
.add("L", "chr1:939436")
.addOutput(output);
runCommandLine(args);

final VCFHeader outHeader = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()).getLeft();
final VariantContext outVC = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath()).getRight().get(0);
// The extra allele was dropped so this site is now one alt allele and the <NON_REF> allele
Assert.assertEquals(outVC.getAlternateAlleles().size(), 2);
final Genotype g = outVC.getGenotype(0);
Assert.assertEquals(g.getExtendedAttribute(GATKVCFConstants.ALLELE_FRACTION_KEY),"1.00,0.00");
Assert.assertEquals(g.getExtendedAttribute(GATKVCFConstants.F1R2_KEY), "0,3,0");
Assert.assertEquals(g.getExtendedAttribute(GATKVCFConstants.F2R1_KEY), "0,3,0");
for (String attribute : g.getExtendedAttributes().keySet()) {
final VCFHeaderLineCount countType = outHeader.getFormatHeaderLine(attribute).getCountType();
if (countType.equals(VCFHeaderLineCount.A)) {
Assert.assertEquals(((String) g.getExtendedAttribute(attribute)).split(",").length, 2);
}
if (countType.equals(VCFHeaderLineCount.R)) {
Assert.assertEquals(((String) g.getExtendedAttribute(attribute)).split(",").length, 3);
}
if (countType.equals(VCFHeaderLineCount.G)) {
Assert.assertEquals(((String) g.getExtendedAttribute(attribute)).split(",").length, 6);
}
}
}
}
Loading

0 comments on commit 4c1fb92

Please sign in to comment.