Skip to content

Commit

Permalink
Adding use_allele_specific_annotation arg and fixing task with empty …
Browse files Browse the repository at this point in the history
…input in JointVcfFiltering WDL (#8027)

* Small changes to JointVCFFiltering WDL

* making default for use_allele_specific_annotations

* addressing comments
  • Loading branch information
meganshand committed Sep 22, 2022
1 parent ef50c08 commit 663dadc
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,6 @@
"JointVcfFiltering.basename": "test_10_samples",
"JointVcfFiltering.snp_annotations": "-A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS",
"JointVcfFiltering.indel_annotations": "-A MQRankSum -A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE",
"JointVcfFiltering.model_backend": "PYTHON_IFOREST"
"JointVcfFiltering.model_backend": "PYTHON_IFOREST",
"JointVcfFiltering.use_allele_specific_annotations": false
}
55 changes: 37 additions & 18 deletions scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ workflow JointVcfFiltering {
String indel_annotations
File? gatk_override

Boolean use_allele_specific_annotations

String snp_resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
String indel_resource_args = "--resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
}
Expand All @@ -46,6 +48,7 @@ workflow JointVcfFiltering {
resource_args = snp_resource_args,
basename = basename,
interval_list = extract_interval_list,
use_allele_specific_annotations = use_allele_specific_annotations,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}
Expand All @@ -59,6 +62,7 @@ workflow JointVcfFiltering {
resource_args = indel_resource_args,
basename = basename,
interval_list = extract_interval_list,
use_allele_specific_annotations = use_allele_specific_annotations,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}
Expand Down Expand Up @@ -102,6 +106,7 @@ workflow JointVcfFiltering {
interval_list = score_interval_list,
model_files = TrainVariantAnnotationModelSNPs.outputs,
resource_args = snp_resource_args,
use_allele_specific_annotations = use_allele_specific_annotations,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}
Expand All @@ -120,14 +125,16 @@ workflow JointVcfFiltering {
interval_list = score_interval_list,
model_files = TrainVariantAnnotationModelINDELs.outputs,
resource_args = indel_resource_args,
use_allele_specific_annotations = use_allele_specific_annotations,
gatk_override = gatk_override,
gatk_docker = gatk_docker
}

}

output {
Array[File] variant_filtered_vcf = ScoreVariantAnnotationsINDELs.output_vcf
Array[File] variant_filtered_vcf_index = ScoreVariantAnnotationsINDELs.output_vcf_index
Array[File] variant_scored_vcf = ScoreVariantAnnotationsINDELs.output_vcf
Array[File] variant_scored_vcf_index = ScoreVariantAnnotationsINDELs.output_vcf_index
}

}
Expand All @@ -143,6 +150,7 @@ task ExtractVariantAnnotations {
String annotations
String resource_args
File? interval_list
Boolean use_allele_specific_annotations

Int memory_mb = 14000
Int command_mem = memory_mb - 1000
Expand All @@ -157,6 +165,7 @@ task ExtractVariantAnnotations {
-V ~{input_vcf} \
-O ~{basename}.~{mode} \
~{annotations} \
~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \
~{"-L " + interval_list} \
--mode ~{mode} \
~{resource_args}
Expand Down Expand Up @@ -232,35 +241,45 @@ task ScoreVariantAnnotations {
File extracted_training_vcf_index
File? interval_list
Array[File] model_files
Boolean use_allele_specific_annotations

Int memory_mb = 16000
Int command_mem = memory_mb - 1000
}
Int disk_size = ceil(size(vcf, "GB") *2 + 50)

command {
zgrep -v '#' ~{vcf} > empty.txt
set -e

ln -s ~{sep=" . && ln -s " model_files} .
if [ -s empty.txt ]; then
ln -s ~{sep=" . && ln -s " model_files} .

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

gatk --java-options "-Xmx~{command_mem}m" \
ScoreVariantAnnotations \
~{"-L " + interval_list} \
-V ~{vcf} \
-O ~{basename}.~{mode} \
--model-backend ~{model_backend} \
~{"--python-script " + python_script} \
--model-prefix ~{basename} \
~{annotations} \
--mode ~{mode} \
--resource:extracted,extracted=true ~{extracted_training_vcf} \
~{resource_args}
gatk --java-options "-Xmx~{command_mem}m" \
ScoreVariantAnnotations \
~{"-L " + interval_list} \
-V ~{vcf} \
-O ~{basename}.~{mode} \
--model-backend ~{model_backend} \
~{"--python-script " + python_script} \
--model-prefix ~{basename} \
~{annotations} \
~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \
-mode ~{mode} \
--resource:extracted,extracted=true ~{extracted_training_vcf} \
~{resource_args}
else
echo "Input VCF was empty so we'll return the same VCF that was input."
echo "Scores and annot hdf5 files will not be produced since the input was empty."
ln -s ~{vcf} ~{basename}.~{mode}.vcf.gz
ln -s ~{vcf_index} ~{basename}.~{mode}.vcf.gz.tbi
fi
}
output {
File scores = "~{basename}.~{mode}.scores.hdf5"
File annots = "~{basename}.~{mode}.annot.hdf5"
File? scores = "~{basename}.~{mode}.scores.hdf5"
File? annots = "~{basename}.~{mode}.annot.hdf5"
File output_vcf = "~{basename}.~{mode}.vcf.gz"
File output_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi"
}
Expand Down

0 comments on commit 663dadc

Please sign in to comment.