From 1cfa52cf676c58cea27e563f86cfa5bfe005c3e2 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Fri, 1 Dec 2023 08:34:37 -0500 Subject: [PATCH 1/2] Enable ReblockGVCF to subset AS annotations that aren't "raw" (i.e. pipe-delimited) --- .../walkers/annotator/AnnotationUtils.java | 31 +++++++++++-- .../walkers/annotator/AssemblyComplexity.java | 4 +- .../annotator/VariantAnnotatorEngine.java | 4 ++ .../allelespecific/AS_StrandBiasTest.java | 2 +- .../walkers/variantutils/ReblockGVCF.java | 43 ++++++++++++++++--- .../ReblockGVCFIntegrationTest.java | 33 ++++++++++++++ .../PlasmoDB-61_Pfalciparum3D7_Genome.dict | 3 ++ .../PlasmoDB-61_Pfalciparum3D7_Genome.fasta | 3 ++ ...lasmoDB-61_Pfalciparum3D7_Genome.fasta.fai | 3 ++ ...reblockGVCFs_AS_test.p_falciparum.rb.g.vcf | 3 ++ .../reblockGVCFs_AS_test.p_falciparum.g.vcf | 3 ++ 11 files changed, 118 insertions(+), 14 deletions(-) create mode 100644 src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.dict create mode 100644 src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.fasta create mode 100644 src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.fasta.fai create mode 100644 src/test/resources/large/expected.reblockGVCFs_AS_test.p_falciparum.rb.g.vcf create mode 100644 src/test/resources/large/reblockGVCFs_AS_test.p_falciparum.g.vcf diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java index 1d9949b219e..f13fcc581a5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java @@ -4,9 +4,12 @@ import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.lang3.StringUtils; +import htsjdk.variant.vcf.VCFHeaderLineCount; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.*; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; import java.util.*; @@ -74,16 +77,21 @@ public static List decodeAnyASList( final String somethingList) { * @param annotation the annotation to be tested * @return true if the annotation is expected to have values per-allele */ - public static boolean isAlleleSpecific(final InfoFieldAnnotation annotation) { + public static boolean isAlleleSpecific(final VariantAnnotation annotation) { return annotation instanceof AlleleSpecificAnnotation; } + public static boolean isAlleleSpecificGatkKey(final String annotationKey) { + final VCFInfoHeaderLine header = GATKVCFHeaderLines.getInfoLine(annotationKey); + return header.getCountType().equals(VCFHeaderLineCount.A) || header.getCountType().equals(VCFHeaderLineCount.R); + } + /** - * Handles all the Java and htsjdk parsing shenanigans - * @param rawDataString should not have surrounding brackets + * Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString + * @param rawDataString may have surrounding brackets, with raw delimiter * @return */ - public static List getAlleleLengthListOfString(String rawDataString) { + public static List getAlleleLengthListOfStringFromRawData(String rawDataString) { if (rawDataString == null) { return Collections.emptyList(); } @@ -93,6 +101,21 @@ public static List getAlleleLengthListOfString(String rawDataString) { return Arrays.asList(rawDataString.split(ALLELE_SPECIFIC_SPLIT_REGEX, -1)); //-1 to keep empty data } + /** + * Handles all the Java and htsjdk parsing shenanigans from getAttributeAsString + * @param dataString may have surrounding brackets, with reduced delimieter + * @return + */ + public static List getAlleleLengthListOfString(String dataString) { + if (dataString == null) { + return Collections.emptyList(); + } + if (dataString.startsWith("[")) { + dataString = dataString.substring(1, dataString.length() - 1).replaceAll("\\s", ""); + } + return Arrays.asList(dataString.split(ALLELE_SPECIFIC_REDUCED_DELIM, -1)); //-1 to keep empty data + } + static public String generateMissingDataWarning(final VariantContext vc, final Genotype g, final AlleleLikelihoods likelihoods) { final StringBuilder outString = new StringBuilder("Annotation will not be calculated at position " + vc.getContig() + ":" + vc.getStart() + " and possibly subsequent"); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java index 7045f4678ec..8ea5771b78e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AssemblyComplexity.java @@ -10,6 +10,7 @@ import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.engine.FeatureContext; import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.AlleleSpecificAnnotation; import org.broadinstitute.hellbender.utils.MathUtils; import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods; import org.broadinstitute.hellbender.utils.haplotype.Event; @@ -27,7 +28,7 @@ */ @DocumentedFeature(groupName= HelpConstants.DOC_CAT_ANNOTATORS, groupSummary=HelpConstants.DOC_CAT_ANNOTATORS_SUMMARY, summary="Describe the complexity of an assembly region") -public class AssemblyComplexity implements JumboInfoAnnotation { +public class AssemblyComplexity implements JumboInfoAnnotation, AlleleSpecificAnnotation { @Argument(fullName = "assembly-complexity-reference-mode", doc="If enabled will treat the reference as the basis for assembly complexity as opposed to estimated germline haplotypes", @@ -189,5 +190,4 @@ private static int uniqueVariants(final Haplotype hap1, final Haplotype hap2, fi private static int editDistance(final Haplotype hap1, final Haplotype hap2, final int excludedPosition) { return uniqueVariants(hap1, hap2, excludedPosition) + uniqueVariants(hap2, hap1, excludedPosition); } - } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java index e83be35f50f..930a6a51033 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotatorEngine.java @@ -145,6 +145,10 @@ public List getInfoAnnotations() { return Collections.unmodifiableList(infoAnnotations); } + public List getJumboInfoAnnotations() { + return Collections.unmodifiableList(jumboInfoAnnotations); + } + /** * * @param infoAnnotationClassName the name of the Java class, NOT the annotation VCF key diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java index d8466d0d997..b521f157d28 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/allelespecific/AS_StrandBiasTest.java @@ -153,7 +153,7 @@ public Map finalizeRawData(final VariantContext vc, final Varia } protected void parseRawDataString(ReducibleAnnotationData> myData) { - List values = AnnotationUtils.getAlleleLengthListOfString(myData.getRawData()); + List values = AnnotationUtils.getAlleleLengthListOfStringFromRawData(myData.getRawData()); if (values.size() != myData.getAlleles().size()) { throw new IllegalStateException("Number of alleles and number of allele-specific entries do not match. " + "Allele-specific annotations should have an entry for each allele including the reference."); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java index bb9e63587c8..5816a58219d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java @@ -221,6 +221,11 @@ public List> getDefaultVariantAnnotationGroups() { return Arrays.asList(StandardAnnotation.class, AS_StandardAnnotation.class); } + @Override + public List getDefaultVariantAnnotations() { + return Collections.singletonList(new AssemblyComplexity()); + } + @Override public boolean requiresReference() {return true;} @@ -936,7 +941,10 @@ private static void copyInfoAnnotations(final Map destinationAtt final boolean allelesNeedSubsetting, final int[] relevantIndices) { //copy over info annotations final Map origMap = sourceVC.getAttributes(); - for(final InfoFieldAnnotation annotation : annotationEngine.getInfoAnnotations()) { + final List engineAnnotations = new ArrayList<>(); + engineAnnotations.addAll(annotationEngine.getInfoAnnotations()); + engineAnnotations.addAll(annotationEngine.getJumboInfoAnnotations()); + for(final VariantAnnotation annotation : engineAnnotations) { for (final String key : annotation.getKeyNames()) { if (infoFieldAnnotationKeyNamesToRemove.contains(key)) { continue; @@ -945,17 +953,34 @@ private static void copyInfoAnnotations(final Map destinationAtt destinationAttrMap.put(key, origMap.get(key)); } } - if (annotation instanceof ReducibleAnnotation) { - for (final String rawKey : ((ReducibleAnnotation)annotation).getRawKeyNames()) { + if (annotation instanceof AlleleSpecificAnnotation) { + final boolean isReducible = annotation instanceof ReducibleAnnotation; + final List keyNames = isReducible ? ((ReducibleAnnotation)annotation).getRawKeyNames() : + annotation.getKeyNames(); + for (final String rawKey : keyNames) { if (infoFieldAnnotationKeyNamesToRemove.contains(rawKey)) { continue; } if (origMap.containsKey(rawKey)) { - if (allelesNeedSubsetting && AnnotationUtils.isAlleleSpecific(annotation)) { - final List alleleSpecificValues = AnnotationUtils.getAlleleLengthListOfString(sourceVC.getAttributeAsString(rawKey, null)); + if (allelesNeedSubsetting && AnnotationUtils.isAlleleSpecific(annotation) && AnnotationUtils.isAlleleSpecificGatkKey(rawKey)) { + final List alleleSpecificValues; + if (isReducible) { + alleleSpecificValues = AnnotationUtils.getAlleleLengthListOfStringFromRawData(sourceVC.getAttributeAsString(rawKey, null)); + } else { + String value = sourceVC.getAttributeAsString(rawKey, null); + if (value == null) { + alleleSpecificValues = Collections.emptyList(); + } else { + alleleSpecificValues = AnnotationUtils.getAlleleLengthListOfString(value); + } + } final List subsetList; if (alleleSpecificValues.size() > 0) { - subsetList = AlleleSubsettingUtils.remapRLengthList(alleleSpecificValues, relevantIndices, ""); + if (isReducible) { + subsetList = AlleleSubsettingUtils.remapRLengthList(alleleSpecificValues, relevantIndices, ""); + } else { + subsetList = AlleleSubsettingUtils.remapALengthList(alleleSpecificValues, relevantIndices, ""); + } if (sourceVC.getAlleles().get(relevantIndices[relevantIndices.length - 1]).equals(Allele.NON_REF_ALLELE)) { //zero out non-ref value, just in case subsetList.set(subsetList.size() - 1, ((AlleleSpecificAnnotation) annotation).getEmptyRawValue()); @@ -964,7 +989,11 @@ private static void copyInfoAnnotations(final Map destinationAtt subsetList = Collections.nCopies(relevantIndices.length, ""); } - destinationAttrMap.put(rawKey, AnnotationUtils.encodeAnyASListWithRawDelim(subsetList)); + if (isReducible) { + destinationAttrMap.put(rawKey, AnnotationUtils.encodeAnyASListWithRawDelim(subsetList)); + } else { + destinationAttrMap.put(rawKey, AnnotationUtils.encodeStringList(subsetList)); + } } else { destinationAttrMap.put(rawKey, origMap.get(rawKey)); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java index b0f523b6f7e..dd684a247fc 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.tools.walkers.variantutils; +import htsjdk.samtools.util.IOUtil; import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; @@ -16,6 +17,7 @@ import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeCalculationArgumentCollection; +import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.HaplotypeCallerArgumentCollection; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -32,6 +34,8 @@ public class ReblockGVCFIntegrationTest extends CommandLineProgramTest { private static final String b37_reference_20_21 = largeFileTestDir + "human_g1k_v37.20.21.fasta"; public static final String WARP_PROD_REBLOCKING_ARGS = " -do-qual-approx --floor-blocks -GQB 20 -GQB 30 -GQB 40 "; + private static final String pf_reference = largeFileTestDir + "PlasmoDB-61_Pfalciparum3D7_Genome.fasta"; + @DataProvider(name = "getCommandLineArgsForExactTest") public Object[][] getCommandLineArgsForExactTest() { return new Object[][]{ @@ -634,4 +638,33 @@ public void testNonGVCFInput() { Assert.assertThrows(GATKException.class, () -> runCommandLine(args)); } + + @Test + public void testAddedHcAsAnnotations() throws IOException { + + final File input = new File(largeFileTestDir + "reblockGVCFs_AS_test.p_falciparum.g.vcf"); + final File output = createTempFile("reblockedgvcf", ".vcf"); + final File expected = new File(largeFileTestDir + "expected.reblockGVCFs_AS_test.p_falciparum.rb.g.vcf"); + + final ArgumentsBuilder args = new ArgumentsBuilder(); + + args.addReference(new File(pf_reference)) + .add("V", input) + .addFlag("do-qual-approx") + .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false") + .addFlag("floor-blocks") + .add("GQB", 20) + .add("GQB", 30) + .add("GQB", 40) + .addOutput(output); + + runCommandLine(args); + + IntegrationTestSpec.assertEqualTextFiles( + output.toPath(), + expected.toPath(), + "#", + true + ); + } } diff --git a/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.dict b/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.dict new file mode 100644 index 00000000000..08e284b93d4 --- /dev/null +++ b/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.dict @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ccf40862b4090c3cc1c65eee8a71e1cd50b6606a53ca6d7a58ff49a2ee08df0 +size 3090 diff --git a/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.fasta b/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.fasta new file mode 100644 index 00000000000..47e64861e78 --- /dev/null +++ b/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.fasta @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb5746feb5db1b597002f1c09b066bdcff93adcfa3fe8a4ef9fe43154b175350 +size 23723411 diff --git a/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.fasta.fai b/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.fasta.fai new file mode 100644 index 00000000000..fbf0068bf6b --- /dev/null +++ b/src/test/resources/large/PlasmoDB-61_Pfalciparum3D7_Genome.fasta.fai @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c55df0a06e6178fa542ba181518e1c97c6f6d6227c41a79efa6428f9be7a6142 +size 541 diff --git a/src/test/resources/large/expected.reblockGVCFs_AS_test.p_falciparum.rb.g.vcf b/src/test/resources/large/expected.reblockGVCFs_AS_test.p_falciparum.rb.g.vcf new file mode 100644 index 00000000000..00c1dd39c19 --- /dev/null +++ b/src/test/resources/large/expected.reblockGVCFs_AS_test.p_falciparum.rb.g.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:488245b1af6f3431eb310a5998f63df0dc78cb7517afca9ba0b74c66db0deaa6 +size 121588 diff --git a/src/test/resources/large/reblockGVCFs_AS_test.p_falciparum.g.vcf b/src/test/resources/large/reblockGVCFs_AS_test.p_falciparum.g.vcf new file mode 100644 index 00000000000..5fb46f912fe --- /dev/null +++ b/src/test/resources/large/reblockGVCFs_AS_test.p_falciparum.g.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a178b3a4cec3e884e61589ce6eb5d4bceec8634addd7f92eff3eba87f6ddc34 +size 178475 From 6add18581c1146935504dd220bb1cd71142a3e30 Mon Sep 17 00:00:00 2001 From: Laura Gauthier Date: Thu, 11 Apr 2024 12:51:38 -0400 Subject: [PATCH 2/2] Fix tests by removing AssemblyComplexity from default annotations --- .../hellbender/tools/walkers/annotator/AnnotationUtils.java | 6 +++++- .../hellbender/tools/walkers/variantutils/ReblockGVCF.java | 5 ----- .../walkers/variantutils/ReblockGVCFIntegrationTest.java | 1 + 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java index f13fcc581a5..7ba0a0baa24 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/AnnotationUtils.java @@ -14,6 +14,8 @@ import java.util.*; public final class AnnotationUtils { + + public static final String ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX = "AS_"; public static final String ALLELE_SPECIFIC_RAW_DELIM = "|"; public static final String ALLELE_SPECIFIC_REDUCED_DELIM = ","; public static final String ALLELE_SPECIFIC_SPLIT_REGEX = "\\|"; //String.split takes a regex, so we need to escape the pipe @@ -83,7 +85,9 @@ public static boolean isAlleleSpecific(final VariantAnnotation annotation) { public static boolean isAlleleSpecificGatkKey(final String annotationKey) { final VCFInfoHeaderLine header = GATKVCFHeaderLines.getInfoLine(annotationKey); - return header.getCountType().equals(VCFHeaderLineCount.A) || header.getCountType().equals(VCFHeaderLineCount.R); + return header.getCountType().equals(VCFHeaderLineCount.A) || + header.getCountType().equals(VCFHeaderLineCount.R) || + annotationKey.startsWith(ALLELE_SPECIFIC_ANNOTATION_KEY_PREFIX); } /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java index 5816a58219d..9b23601c32e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCF.java @@ -221,11 +221,6 @@ public List> getDefaultVariantAnnotationGroups() { return Arrays.asList(StandardAnnotation.class, AS_StandardAnnotation.class); } - @Override - public List getDefaultVariantAnnotations() { - return Collections.singletonList(new AssemblyComplexity()); - } - @Override public boolean requiresReference() {return true;} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java index dd684a247fc..af455202386 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/variantutils/ReblockGVCFIntegrationTest.java @@ -652,6 +652,7 @@ public void testAddedHcAsAnnotations() throws IOException { .add("V", input) .addFlag("do-qual-approx") .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false") + .add("A", "AssemblyComplexity") .addFlag("floor-blocks") .add("GQB", 20) .add("GQB", 30)