Skip to content

Commit

Permalink
Performed a round of ablation on new annotation-based filtering tools.
Browse files Browse the repository at this point in the history
  • Loading branch information
samuelklee committed Aug 14, 2023
1 parent 4212fdf commit bda84e0
Show file tree
Hide file tree
Showing 108 changed files with 194 additions and 653 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ fi
echo "Docker build done =========="

sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json

echo "Running Filtering WDL through cromwell"

Expand All @@ -41,6 +40,3 @@ done
FIN
cat $WORKING_DIR/vcf_site_level_filtering_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json

cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json

This file was deleted.

17 changes: 12 additions & 5 deletions scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ workflow JointVcfFiltering {
String resource_args

String? model_backend
File? python_script
File? training_python_script
File? hyperparameters_json
File? scoring_python_script

String? extract_extra_args
String? train_extra_args
Expand All @@ -55,9 +56,9 @@ workflow JointVcfFiltering {
model_backend: "(Optional) Model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
training_python_script: "(Optional) Python script specifying custom model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
scoring_python_script: "(Optional) Python script specifying custom model backend to be used by ScoreVariantAnnotations. See GATK documentation for this tool."
hyperparameters_json: "(Optional) JSON file specifying model hyperparameters to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-negative training, etc. See GATK documentation for this tool."
score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-unlabeled learning, etc. See GATK documentation for this tool."
score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
}

call ExtractVariantAnnotations {
Expand All @@ -79,7 +80,7 @@ workflow JointVcfFiltering {
annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5,
unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5,
model_backend = model_backend,
python_script = python_script,
python_script = training_python_script,
hyperparameters_json = hyperparameters_json,
output_prefix = output_prefix,
extra_args = train_extra_args,
Expand All @@ -101,6 +102,8 @@ workflow JointVcfFiltering {
extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx,
model_prefix = output_prefix,
model_files = TrainVariantAnnotationsModel.model_files,
model_backend = model_backend,
python_script = scoring_python_script,
extra_args = score_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
Expand Down Expand Up @@ -251,6 +254,8 @@ task ScoreVariantAnnotations {
File extracted_vcf_idx
String model_prefix
Array[File] model_files
String? model_backend
File? python_script
String? extra_args
File? monitoring_script

Expand Down Expand Up @@ -287,6 +292,8 @@ task ScoreVariantAnnotations {
~{resource_args} \
--resource:extracted,extracted=true ~{extracted_vcf} \
--model-prefix model-files/~{model_prefix}.train \
~{"--model-backend " + model_backend} \
~{"--python-script " + python_script} \
~{extra_args}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
* <ul>
* <li>
* Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles,
* if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified).
* if at least one allele-specific annotation with {@code Number=A} is specified).
* </li>
* <li>
* Annotations to extract.
Expand Down Expand Up @@ -128,7 +128,7 @@
* <p>
* Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations).
* See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details.
* If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is specified, each record corresponds to an individual allele;
* In allele-specific mode (i.e., when allele-specific annotations are requested), each record corresponds to an individual allele;
* otherwise, each record corresponds to a variant site, which may contain multiple alleles.
* Storage of alleles can be omitted using the {@value OMIT_ALLELES_IN_HDF5_LONG_NAME} argument, which will reduce
* the size of the file. This file will only be produced if resources are provided and the number of extracted
Expand Down Expand Up @@ -184,9 +184,8 @@
* 1000000 unlabeled (i.e., non-training/calibration) sites, producing the outputs
* 1) {@code extract.annot.hdf5}, 2) {@code extract.unlabeled.annot.hdf5}, 3) {@code extract.vcf.gz},
* and 4) {@code extract.vcf.gz.tbi}. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel}
* to train a model using a positive-negative approach (similar to that used in {@link VariantRecalibrator}).
* Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are
* selected by default.
* to train a model using a positive-unlabeled approach. Note that the {@value MODE_LONG_NAME} arguments
* are made explicit here, although both SNP and INDEL modes are selected by default.
*
* <pre>
* gatk ExtractVariantAnnotations \
Expand Down Expand Up @@ -249,11 +248,10 @@ public final class ExtractVariantAnnotations extends LabeledVariantAnnotationsWa
doc = "Maximum number of unlabeled variants to extract. " +
"If greater than zero, reservoir sampling will be used to randomly sample this number " +
"of sites from input sites that are not present in the specified resources. " +
"Choice of this number should be guided by considerations for training the negative model in " +
"Choice of this number should be guided by considerations for training the model in " +
"TrainVariantAnnotationsModel; users may wish to choose a number that is comparable to the " +
"expected size of the labeled training set or that is compatible with available memory resources. " +
"Note that in allele-specific mode (--" + LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME +
" true), this argument limits the number of variant records, rather than the number of alleles.",
"Note that in allele-specific mode, this argument limits the number of variant records, rather than the number of alleles.",
minValue = 0)
private int maximumNumberOfUnlabeledVariants = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderLine;
import htsjdk.variant.vcf.VCFHeaderLineCount;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.apache.commons.collections4.ListUtils;
Expand All @@ -34,6 +35,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
Expand Down Expand Up @@ -87,7 +89,6 @@
public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVariantWalker {

public static final String MODE_LONG_NAME = "mode";
public static final String USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME = "use-allele-specific-annotations";
public static final String IGNORE_FILTER_LONG_NAME = "ignore-filter";
public static final String IGNORE_ALL_FILTERS_LONG_NAME = "ignore-all-filters";
public static final String DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME = "do-not-trust-all-polymorphic";
Expand Down Expand Up @@ -129,12 +130,6 @@ enum ResourceMatchingStrategy {
minElements = 1)
private List<VariantType> variantTypesToExtractList = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL));

@Argument(
fullName = USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME,
doc = "If true, use the allele-specific versions of the specified annotations.",
optional = true)
boolean useASAnnotations = false;

@Argument(
fullName = IGNORE_FILTER_LONG_NAME,
doc = "Ignore the specified filter(s) in the input VCF.",
Expand All @@ -159,13 +154,13 @@ enum ResourceMatchingStrategy {
@Argument(
fullName = RESOURCE_MATCHING_STRATEGY_LONG_NAME,
doc = "The strategy to use for determining whether an input variant is present in a resource " +
"in non-allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " false). " +
"in non-allele-specific mode. " +
"START_POSITION: Start positions of input and resource variants must match. " +
"START_POSITION_AND_GIVEN_REPRESENTATION: The intersection of the sets of input and resource alleles " +
"(in their given representations) must also be non-empty. " +
"START_POSITION_AND_MINIMAL_REPRESENTATION: The intersection of the sets of input and resource alleles " +
"(after converting alleles to their minimal representations) must also be non-empty. " +
"This argument has no effect in allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " true), " +
"This argument has no effect in allele-specific mode, " +
"in which the minimal representations of the input and resource alleles must match.",
optional = true)
private ResourceMatchingStrategy resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION;
Expand All @@ -186,6 +181,7 @@ enum ResourceMatchingStrategy {
private final Set<String> ignoreInputFilterSet = new TreeSet<>();
Set<VariantType> variantTypesToExtract;
TreeSet<String> resourceLabels = new TreeSet<>();
boolean useASAnnotations;

File outputAnnotationsFile;
VariantContextWriter vcfWriter;
Expand Down Expand Up @@ -222,9 +218,11 @@ public void onTraversalStart() {
LabeledVariantAnnotationsData.SNP_LABEL));
}

useASAnnotations = isAlleleSpecificAnnotationRequested();

if (useASAnnotations && resourceMatchingStrategy != ResourceMatchingStrategy.START_POSITION_AND_MINIMAL_REPRESENTATION) {
logger.warn(String.format("The %s argument is ignored when %s is set to true. The START_POSITION_AND_MINIMAL_REPRESENTATION strategy will be used.",
RESOURCE_MATCHING_STRATEGY_LONG_NAME, USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME));
logger.warn(String.format("The %s argument is ignored when allele-specific annotations are requested. The START_POSITION_AND_MINIMAL_REPRESENTATION strategy will be used.",
RESOURCE_MATCHING_STRATEGY_LONG_NAME));
resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION_AND_MINIMAL_REPRESENTATION;
}

Expand All @@ -251,6 +249,12 @@ public Object onTraversalSuccess() {
return null;
}

/**
 * Determines whether any of the requested annotations is allele-specific, i.e., declared in the
 * input VCF header with {@code Number=A} (one value per alternate allele).
 *
 * @return {@code true} if at least one requested annotation has count type {@link VCFHeaderLineCount#A}
 * @throws IllegalArgumentException if a requested annotation has no INFO header line in the input VCF;
 *         previously this surfaced as an uninformative {@link NullPointerException}
 */
private boolean isAlleleSpecificAnnotationRequested() {
    // Deduplicate while preserving the order in which annotations were specified.
    final Set<String> distinctAnnotationNames = new LinkedHashSet<>(annotationNames);
    final VCFHeader inputHeader = getHeaderForVariants();
    return distinctAnnotationNames.stream()
            .anyMatch(a -> {
                final VCFInfoHeaderLine infoHeaderLine = inputHeader.getInfoHeaderLine(a);
                if (infoHeaderLine == null) {
                    // Fail fast with a clear message rather than an NPE from getCountType() below.
                    throw new IllegalArgumentException(String.format(
                            "The annotation %s is not present in the header of the input VCF.", a));
                }
                return infoHeaderLine.getCountType() == VCFHeaderLineCount.A;
            });
}

static void addExtractedVariantToData(final LabeledVariantAnnotationsData data,
final VariantContext variant,
final List<Triple<List<Allele>, VariantType, TreeSet<String>>> metadata) {
Expand Down
Loading

0 comments on commit bda84e0

Please sign in to comment.