Skip to content

Commit

Permalink
Adding argument to GenotypeGVCFs to keep specified raw annotations (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
meganshand committed Oct 24, 2022
1 parent 277bf00 commit 90bf668
Show file tree
Hide file tree
Showing 25 changed files with 128 additions and 27 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.broadinstitute.hellbender.cmdline.GATKPlugin;

import com.google.common.collect.Lists;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import org.broadinstitute.hellbender.utils.config.ConfigFactory;
import org.broadinstitute.hellbender.utils.config.GATKConfig;

import java.io.File;
import java.lang.reflect.Modifier;
import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -485,6 +484,16 @@ public List<Annotation> getResolvedInstances() {
return resolvedInstances;
}

/**
* Returns a map of the String to Annotations only in the resolved instances.
*
* @return a Map of Strings to Annotations of resolved instances
*/
public Map<String, Annotation> getResolvedInstancesMap() {
return allDiscoveredAnnotations.entrySet().stream()
.filter(e -> getResolvedInstances().contains(e.getValue()))
.collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue()));
}

/**
* Return the class representing the instance of the plugin specified by {@code pluginName}
Expand All @@ -496,5 +505,4 @@ public List<Annotation> getResolvedInstances() {
public Class<?> getClassForPluginHelp(final String pluginName) {
return allDiscoveredAnnotations.containsKey(pluginName) ? allDiscoveredAnnotations.get(pluginName).getClass() : null;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import htsjdk.variant.vcf.VCFHeaderLine;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKAnnotationPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.GATKReadFilterPluginDescriptor;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
Expand All @@ -17,6 +19,7 @@
import org.broadinstitute.hellbender.engine.ReadsContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.VariantLocusWalker;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBImport;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
Expand All @@ -28,6 +31,7 @@
import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection;
import org.broadinstitute.hellbender.utils.*;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;

import java.util.*;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -132,17 +136,17 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
doc = "LOD threshold to emit variant to VCF.")
protected double tlodThreshold = 3.5; //allow for some lower quality variants


/**
* Margin of error in allele fraction to consider a somatic variant homoplasmic, i.e. if there is less than a 0.1% reference allele fraction, those reads are likely errors
*/
@Argument(fullName=CombineGVCFs.ALLELE_FRACTION_DELTA_LONG_NAME, doc = "Margin of error in allele fraction to consider a somatic variant homoplasmic")
protected double afTolerance = 1e-3; //based on Q30 as a "good" base quality score

/**
* If specified, keep the combined raw annotations (e.g. AS_SB_TABLE) after genotyping. This is applicable to Allele-Specific annotations
* If specified, keep all the combined raw annotations (e.g. AS_SB_TABLE) after genotyping. This is applicable to Allele-Specific annotations. See {@link ReducibleAnnotation}
*/
@Argument(fullName=KEEP_COMBINED_LONG_NAME, shortName = KEEP_COMBINED_SHORT_NAME, doc = "If specified, keep the combined raw annotations")
@Argument(fullName=KEEP_COMBINED_LONG_NAME, shortName = KEEP_COMBINED_SHORT_NAME, doc = "If specified, keep the combined raw annotations",
mutex = {GenotypeGVCFsAnnotationArgumentCollection.KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME})
protected boolean keepCombined = false;

@ArgumentCollection
Expand Down Expand Up @@ -172,6 +176,9 @@ public final class GenotypeGVCFs extends VariantLocusWalker {
@ArgumentCollection
private final DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

// @ArgumentCollection deliberately omitted since this is passed to the annotation plugin
final GenotypeGVCFsAnnotationArgumentCollection genotypeGVCFsAnnotationArgs = new GenotypeGVCFsAnnotationArgumentCollection();

// the annotation engine
private VariantAnnotatorEngine annotationEngine;

Expand Down Expand Up @@ -221,6 +228,16 @@ protected GenomicsDBOptions getGenomicsDBOptions() {
@Override
public boolean useVariantAnnotations() { return true;}

@Override
public List<? extends CommandLinePluginDescriptor<?>> getPluginDescriptors() {
GATKReadFilterPluginDescriptor readFilterDescriptor = new GATKReadFilterPluginDescriptor(getDefaultReadFilters());
return useVariantAnnotations()?
Arrays.asList(readFilterDescriptor, new GATKAnnotationPluginDescriptor(
genotypeGVCFsAnnotationArgs,
getDefaultVariantAnnotations(), getDefaultVariantAnnotationGroups())):
Collections.singletonList(readFilterDescriptor);
}

@Override
public List<Class<? extends Annotation>> getDefaultVariantAnnotationGroups() {
return Arrays.asList(StandardAnnotation.class);
Expand Down Expand Up @@ -261,8 +278,9 @@ public void onTraversalStart() {
intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(getBestAvailableSequenceDictionary()) :
Collections.emptyList();

Collection<Annotation> variantAnnotations = makeVariantAnnotations();
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined);
final Collection<Annotation> variantAnnotations = makeVariantAnnotations();
final Set<Annotation> annotationsToKeep = getAnnotationsToKeep();
annotationEngine = new VariantAnnotatorEngine(variantAnnotations, dbsnp.dbsnp, Collections.emptyList(), false, keepCombined, annotationsToKeep);

merger = new ReferenceConfidenceVariantContextMerger(annotationEngine, getHeaderForVariants(), somaticInput, false, true);

Expand All @@ -279,6 +297,17 @@ public void onTraversalStart() {

}

private Set<Annotation> getAnnotationsToKeep() {
final GATKAnnotationPluginDescriptor pluginDescriptor = getCommandLineParser().getPluginDescriptor(GATKAnnotationPluginDescriptor.class);
final List<String> annotationStringsToKeep = genotypeGVCFsAnnotationArgs.getKeepSpecifiedCombinedAnnotationNames();
final Map<String, Annotation> resolvedInstancesMap = pluginDescriptor.getResolvedInstancesMap();
return annotationStringsToKeep.stream()
.peek(s -> {Annotation a = resolvedInstancesMap.get(s); if (a == null)
throw new UserException("Requested --" + GenotypeGVCFsAnnotationArgumentCollection.KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME + ": " + s + " was not found in annotation list. Was it excluded with --" + StandardArgumentDefinitions.ANNOTATIONS_TO_EXCLUDE_LONG_NAME + " or not provided with --" + StandardArgumentDefinitions.ANNOTATION_LONG_NAME + "?"); })
.map(resolvedInstancesMap::get)
.collect(Collectors.toSet());
}

@Override
public void apply(final Locatable loc, List<VariantContext> variants, ReadsContext reads, ReferenceContext ref, FeatureContext features) {

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.broadinstitute.hellbender.tools.walkers;

import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.hellbender.cmdline.GATKPlugin.DefaultGATKVariantAnnotationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class GenotypeGVCFsAnnotationArgumentCollection extends DefaultGATKVariantAnnotationArgumentCollection {
private static final long serialVersionUID = 1L;

public static final String KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME = "keep-specific-combined-raw-annotation";
public static final String KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_SHORT_NAME = "keep-specific-combined";

/**
* Keep only the specific combined raw annotations specified. Cannot be used with --keep-combined-raw-annotations which saves all raw annotations.
* Duplicate values will be ignored. See {@link ReducibleAnnotation} for more information on raw annotations.
*/
@Argument(fullName= KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_LONG_NAME, shortName = KEEP_SPECIFIED_RAW_COMBINED_ANNOTATION_SHORT_NAME, optional = true,
mutex = {GenotypeGVCFs.KEEP_COMBINED_LONG_NAME},
doc="Keep only the specific combined raw annotations specified (removing the other raw annotations). Duplicate values will be ignored.")
protected List<String> keepSpecifiedCombined = new ArrayList<>();

public List<String> getKeepSpecifiedCombinedAnnotationNames() {return Collections.unmodifiableList(keepSpecifiedCombined);}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@
import htsjdk.variant.vcf.*;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.GenotypeGVCFsAnnotationArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotation;
import org.broadinstitute.hellbender.tools.walkers.annotator.allelespecific.ReducibleAnnotationData;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -43,6 +45,7 @@ public final class VariantAnnotatorEngine {
private boolean expressionAlleleConcordance;
private final boolean useRawAnnotations;
private final boolean keepRawCombinedAnnotations;
private final List<String> rawAnnotationsToKeep;

private final static Logger logger = LogManager.getLogger(VariantAnnotatorEngine.class);
private final static OneShotLogger jumboAnnotationsLogger = new OneShotLogger(VariantAnnotatorEngine.class);
Expand All @@ -59,17 +62,20 @@ public final class VariantAnnotatorEngine {
* @param useRaw When this is set to true, the annotation engine will call {@link ReducibleAnnotation#annotateRawData(ReferenceContext, VariantContext, AlleleLikelihoods)}
* on annotations that extend {@link ReducibleAnnotation}, instead of {@link InfoFieldAnnotation#annotate(ReferenceContext, VariantContext, AlleleLikelihoods)},
* @param keepCombined If true, retain the combined raw annotation values instead of removing them after finalizing
* @param rawAnnotationsToKeep List of raw annotations to keep even when others are removed
*/
public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
final FeatureInput<VariantContext> dbSNPInput,
final List<FeatureInput<VariantContext>> featureInputs,
final boolean useRaw,
boolean keepCombined){
final boolean keepCombined,
final Collection<Annotation> rawAnnotationsToKeep){
Utils.nonNull(featureInputs, "comparisonFeatureInputs is null");
infoAnnotations = new ArrayList<>();
genotypeAnnotations = new ArrayList<>();
jumboInfoAnnotations = new ArrayList<>();
jumboGenotypeAnnotations = new ArrayList<>();
this.rawAnnotationsToKeep = new ArrayList<>();
for (Annotation annot : annotationList) {
if (annot instanceof InfoFieldAnnotation) {
infoAnnotations.add((InfoFieldAnnotation) annot);
Expand All @@ -87,6 +93,9 @@ public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
reducibleKeys = new LinkedHashSet<>();
useRawAnnotations = useRaw;
keepRawCombinedAnnotations = keepCombined;
for (final Annotation rawAnnot : rawAnnotationsToKeep) {
this.rawAnnotationsToKeep.addAll(((VariantAnnotation) rawAnnot).getKeyNames());
}
for (InfoFieldAnnotation annot : infoAnnotations) {
if (annot instanceof ReducibleAnnotation) {
for (final String rawKey : ((ReducibleAnnotation) annot).getRawKeyNames()) {
Expand All @@ -96,6 +105,14 @@ public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
}
}

public VariantAnnotatorEngine(final Collection<Annotation> annotationList,
final FeatureInput<VariantContext> dbSNPInput,
final List<FeatureInput<VariantContext>> featureInputs,
final boolean useRaw,
boolean keepCombined){
this(annotationList, dbSNPInput, featureInputs, useRaw, keepCombined, Collections.emptyList());
}

private VariantOverlapAnnotator initializeOverlapAnnotator(final FeatureInput<VariantContext> dbSNPInput, final List<FeatureInput<VariantContext>> featureInputs) {
final Map<FeatureInput<VariantContext>, String> overlaps = new LinkedHashMap<>();
for ( final FeatureInput<VariantContext> fi : featureInputs) {
Expand Down Expand Up @@ -253,6 +270,14 @@ public Map<String, Object> combineAnnotations(final List<Allele> allelesList, Ma
public VariantContext finalizeAnnotations(VariantContext vc, VariantContext originalVC) {
final Map<String, Object> variantAnnotations = new LinkedHashMap<>(vc.getAttributes());

//save annotations that have been requested to be kept
final Map<String, Object> savedRawAnnotations = new LinkedHashMap<>();
for(final String rawAnnot : rawAnnotationsToKeep) {
if (variantAnnotations.containsKey(rawAnnot)) {
savedRawAnnotations.put(rawAnnot, variantAnnotations.get(rawAnnot));
}
}

// go through all the requested info annotationTypes
for (final InfoFieldAnnotation annotationType : infoAnnotations) {
if (annotationType instanceof ReducibleAnnotation) {
Expand Down Expand Up @@ -280,6 +305,8 @@ public VariantContext finalizeAnnotations(VariantContext vc, VariantContext orig
variantAnnotations.remove(GATKVCFConstants.VARIANT_DEPTH_KEY);
variantAnnotations.remove(GATKVCFConstants.RAW_GENOTYPE_COUNT_KEY);
}
//add back raw annotations that have specifically been requested to keep
variantAnnotations.putAll(savedRawAnnotations);

// generate a new annotated VC
final VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(variantAnnotations);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFCompoundHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.annotator.Annotation;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
* An interface for annotations that are calculated using raw data across samples, rather than the median (or median of median) of samples values
* The Raw annotation keeps some summary (one example might be a histogram of the raw values for each sample) of the individual sample (or allele)
* level annotation. As the annotations are combined across multiple samples the raw annotation continues to contain individual values while
* the final reduced annotation will typically be a summary statistic from these raw values.
*
*/
public interface ReducibleAnnotation extends Annotation {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ public static VCFFormatHeaderLine getEquivalentFormatHeaderLine(final String inf
addInfoLine(new VCFInfoHeaderLine(INBREEDING_COEFFICIENT_KEY, 1, VCFHeaderLineType.Float, "Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"));
addInfoLine(new VCFInfoHeaderLine(AS_INBREEDING_COEFFICIENT_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "Allele-specific inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"));
addInfoLine(new VCFInfoHeaderLine(EXCESS_HET_KEY, 1, VCFHeaderLineType.Float, "Phred-scaled p-value for exact test of excess heterozygosity"));
addInfoLine(new VCFInfoHeaderLine(RAW_GENOTYPE_COUNT_KEY, 3, VCFHeaderLineType.Integer, "Counts of genotypes w.r.t. the reference allele: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity"));
addInfoLine(new VCFInfoHeaderLine(RAW_GENOTYPE_COUNT_KEY, 3, VCFHeaderLineType.Integer, "Counts of genotypes w.r.t. the reference allele in the following order: 0/0, 0/*, */*, i.e. all alts lumped together; for use in calculating excess heterozygosity"));
addInfoLine(new VCFInfoHeaderLine(LIKELIHOOD_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref haplotype likelihoods"));
addInfoLine(new VCFInfoHeaderLine(MAP_QUAL_RANK_SUM_KEY, 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"));
addInfoLine(new VCFInfoHeaderLine(AS_MAP_QUAL_RANK_SUM_KEY, VCFHeaderLineCount.A, VCFHeaderLineType.Float, "allele specific Z-score From Wilcoxon rank sum test of each Alt vs. Ref read mapping qualities"));
Expand Down
Loading

0 comments on commit 90bf668

Please sign in to comment.