Skip to content

Commit

Permalink
Enable ReblockGVCF to subset AS annotations that aren't "raw" (pipe-d…
Browse files Browse the repository at this point in the history
…elimited) (#8771)

* Enable ReblockGVCF to subset AS annotations that aren't "raw" (i.e. pipe-delimited)

* Fix tests by removing AssemblyComplexity from default annotations
  • Loading branch information
ldgauthier committed Apr 12, 2024
1 parent 47c4858 commit 986cb15
Show file tree
Hide file tree
Showing 11 changed files with 118 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.lang3.StringUtils;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.*;

public final class AnnotationUtils {

public static final String ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX = "AS_";
public static final String ALLELE_SPECIFIC_RAW_DELIM = "|";
public static final String ALLELE_SPECIFIC_REDUCED_DELIM = ",";
public static final String ALLELE_SPECIFIC_SPLIT_REGEX = "\\|"; //String.split takes a regex, so we need to escape the pipe
Expand Down Expand Up @@ -74,16 +79,23 @@ public static List<String> decodeAnyASList( final String somethingList) {
* @param annotation the annotation to be tested
* @return true if the annotation is expected to have values per-allele
*/
public static boolean isAlleleSpecific(final InfoFieldAnnotation annotation) {
public static boolean isAlleleSpecific(final VariantAnnotation annotation) {
return annotation instanceof AlleleSpecificAnnotation;
}

public static boolean isAlleleSpecificGatkKey(final String annotationKey) {
final VCFInfoHeaderLine header = GATKVCFHeaderLines.getInfoLine(annotationKey);
return header.getCountType().equals(VCFHeaderLineCount.A) ||
header.getCountType().equals(VCFHeaderLineCount.R) ||
annotationKey.startsWith(ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX);
}

/**
* Handles all the Java and htsjdk parsing shenanigans
* @param rawDataString should not have surrounding brackets
* Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString
* @param rawDataString may have surrounding brackets, with raw delimiter
* @return
*/
public static List<String> getAlleleLengthListOfString(String rawDataString) {
public static List<String> getAlleleLengthListOfStringFromRawData(String rawDataString) {
if (rawDataString == null) {
return Collections.emptyList();
}
Expand All @@ -93,6 +105,21 @@ public static List<String> getAlleleLengthListOfString(String rawDataString) {
return Arrays.asList(rawDataString.split(ALLELE_SPECIFIC_SPLIT_REGEX, -1)); //-1 to keep empty data
}

/**
* Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString
* @param dataString may have surrounding brackets, with reduced delimieter
* @return
*/
public static List<String> getAlleleLengthListOfString(String dataString) {
if (dataString == null) {
return Collections.emptyList();
}
if (dataString.startsWith("[")) {
dataString = dataString.substring(1, dataString.length() - 1).replaceAll("\\s", "");
}
return Arrays.asList(dataString.split(ALLELE_SPECIFIC_REDUCED_DELIM, -1)); //-1 to keep empty data
}

static public String generateMissingDataWarning(final VariantContext vc, final Genotype g, final AlleleLikelihoods<GATKRead, Allele> likelihoods) {
final StringBuilder outString = new StringBuilder("Annotation will not be calculated at position " + vc.getContig() + ":" + vc.getStart() +
" and possibly subsequent");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotation;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Event;
Expand All @@ -27,7 +28,7 @@
*/
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY,
summary="Describe the complexity of an assembly region")
public class AssemblyComplexity implements JumboInfoAnnotation {
public class AssemblyComplexity implements JumboInfoAnnotation, AlleleSpecificAnnotation {

@Argument(fullName = "assembly-complexity-reference-mode",
doc="If enabled will treat the reference as the basis for assembly complexity as opposed to estimated germline haplotypes",
Expand Down Expand Up @@ -189,5 +190,4 @@ private static int uniqueVariants(final Haplotype hap1, final Haplotype hap2, fi
private static int editDistance(final Haplotype hap1, final Haplotype hap2, final int excludedPosition) {
return uniqueVariants(hap1, hap2, excludedPosition) + uniqueVariants(hap2, hap1, excludedPosition);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ public List<InfoFieldAnnotation> getInfoAnnotations() {
return Collections.unmodifiableList(infoAnnotations);
}

public List<JumboInfoAnnotation> getJumboInfoAnnotations() {
return Collections.unmodifiableList(jumboInfoAnnotations);
}

/**
*
* @param infoAnnotationClassName the name of the Java class, NOT the annotation VCF key
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ public Map<String, Object> finalizeRawData(final VariantContext vc, final Varia
}

protected void parseRawDataString(ReducibleAnnotationData<List<Integer>> myData) {
List<String> values = AnnotationUtils.getAlleleLengthListOfString(myData.getRawData());
List<String> values = AnnotationUtils.getAlleleLengthListOfStringFromRawData(myData.getRawData());
if (values.size() != myData.getAlleles().size()) {
throw new IllegalStateException("Number of alleles and number of allele-specific entries do not match. " +
"Allele-specific annotations should have an entry for each allele including the reference.");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,10 @@ private static void copyInfoAnnotations(final Map<String, Object> destinationAtt
final boolean allelesNeedSubsetting, final int[] relevantIndices) {
//copy over info annotations
final Map<String, Object> origMap = sourceVC.getAttributes();
for(final InfoFieldAnnotation annotation : annotationEngine.getInfoAnnotations()) {
final List<VariantAnnotation> engineAnnotations = new ArrayList<>();
engineAnnotations.addAll(annotationEngine.getInfoAnnotations());
engineAnnotations.addAll(annotationEngine.getJumboInfoAnnotations());
for(final VariantAnnotation annotation : engineAnnotations) {
for (final String key : annotation.getKeyNames()) {
if (infoFieldAnnotationKeyNamesToRemove.contains(key)) {
continue;
Expand All @@ -945,17 +948,34 @@ private static void copyInfoAnnotations(final Map<String, Object> destinationAtt
destinationAttrMap.put(key, origMap.get(key));
}
}
if (annotation instanceof ReducibleAnnotation) {
for (final String rawKey : ((ReducibleAnnotation)annotation).getRawKeyNames()) {
if (annotation instanceof AlleleSpecificAnnotation) {
final boolean isReducible = annotation instanceof ReducibleAnnotation;
final List<String> keyNames = isReducible ? ((ReducibleAnnotation)annotation).getRawKeyNames() :
annotation.getKeyNames();
for (final String rawKey : keyNames) {
if (infoFieldAnnotationKeyNamesToRemove.contains(rawKey)) {
continue;
}
if (origMap.containsKey(rawKey)) {
if (allelesNeedSubsetting && AnnotationUtils.isAlleleSpecific(annotation)) {
final List<String> alleleSpecificValues = AnnotationUtils.getAlleleLengthListOfString(sourceVC.getAttributeAsString(rawKey, null));
if (allelesNeedSubsetting && AnnotationUtils.isAlleleSpecific(annotation) && AnnotationUtils.isAlleleSpecificGatkKey(rawKey)) {
final List<String> alleleSpecificValues;
if (isReducible) {
alleleSpecificValues = AnnotationUtils.getAlleleLengthListOfStringFromRawData(sourceVC.getAttributeAsString(rawKey, null));
} else {
String value = sourceVC.getAttributeAsString(rawKey, null);
if (value == null) {
alleleSpecificValues = Collections.emptyList();
} else {
alleleSpecificValues = AnnotationUtils.getAlleleLengthListOfString(value);
}
}
final List<String> subsetList;
if (alleleSpecificValues.size() > 0) {
subsetList = AlleleSubsettingUtils.remapRLengthList(alleleSpecificValues, relevantIndices, "");
if (isReducible) {
subsetList = AlleleSubsettingUtils.remapRLengthList(alleleSpecificValues, relevantIndices, "");
} else {
subsetList = AlleleSubsettingUtils.remapALengthList(alleleSpecificValues, relevantIndices, "");
}
if (sourceVC.getAlleles().get(relevantIndices[relevantIndices.length - 1]).equals(Allele.NON_REF_ALLELE)) {
//zero out non-ref value, just in case
subsetList.set(subsetList.size() - 1, ((AlleleSpecificAnnotation) annotation).getEmptyRawValue());
Expand All @@ -964,7 +984,11 @@ private static void copyInfoAnnotations(final Map<String, Object> destinationAtt
subsetList = Collections.nCopies(relevantIndices.length, "");
}

destinationAttrMap.put(rawKey, AnnotationUtils.encodeAnyASListWithRawDelim(subsetList));
if (isReducible) {
destinationAttrMap.put(rawKey, AnnotationUtils.encodeAnyASListWithRawDelim(subsetList));
} else {
destinationAttrMap.put(rawKey, AnnotationUtils.encodeStringList(subsetList));
}
} else {
destinationAttrMap.put(rawKey, origMap.get(rawKey));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.broadinstitute.hellbender.tools.walkers.variantutils;

import htsjdk.samtools.util.IOUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
Expand All @@ -16,6 +17,7 @@
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerArgumentCollection;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
Expand All @@ -32,6 +34,8 @@ public class ReblockGVCFIntegrationTest extends CommandLineProgramTest {
private static final String b37_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.fasta";
public static final String WARP_PROD_REBLOCKING_ARGS = " -do-qual-approx --floor-blocks -GQB 20 -GQB 30 -GQB 40 ";

private static final String pf_reference = largeFileTestDir + "PlasmoDB-61_Pfalciparum3D7_Genome.fasta";

@DataProvider(name = "getCommandLineArgsForExactTest")
public Object[][] getCommandLineArgsForExactTest() {
return new Object[][]{
Expand Down Expand Up @@ -634,4 +638,34 @@ public void testNonGVCFInput() {

Assert.assertThrows(GATKException.class, () -> runCommandLine(args));
}

@Test
public void testAddedHcAsAnnotations() throws IOException {

final File input = new File(largeFileTestDir + "reblockGVCFs_AS_test.p_falciparum.g.vcf");
final File output = createTempFile("reblockedgvcf", ".vcf");
final File expected = new File(largeFileTestDir + "expected.reblockGVCFs_AS_test.p_falciparum.rb.g.vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();

args.addReference(new File(pf_reference))
.add("V", input)
.addFlag("do-qual-approx")
.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false")
.add("A", "AssemblyComplexity")
.addFlag("floor-blocks")
.add("GQB", 20)
.add("GQB", 30)
.add("GQB", 40)
.addOutput(output);

runCommandLine(args);

IntegrationTestSpec.assertEqualTextFiles(
output.toPath(),
expected.toPath(),
"#",
true
);
}
}
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown

0 comments on commit 986cb15

Please sign in to comment.