Add VAT validation rule #2 [VS-19] (#7374)

* discussed w AH and LL--20 is a better cutoff than 19 * add validation rule 2 * localize template properly * typo on proj id * but where should this live? * use bespoke bcftools docker
broadinstitute · Aug 2, 2021 · b24f091 · b24f091
1 parent b7e06b9
commit b24f091
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 164 deletions.
diff --git a/custom_annotations_template.tsv b/custom_annotations_template.tsv
@@ -0,0 +1,7 @@
+#title=gvsAnnotations
+#assembly=GRCh38
+#matchVariantsBy=allele
+#CHROM	POS	REF	ALT	AN	AC	AF
+#categories	.	.	.	AlleleNumber	AlleleCount	AlleleFrequency
+#descriptions	.	.	.	.	.	.
+#type	.	.	.	number	number	number
diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl
@@ -25,15 +25,14 @@ workflow GvsSitesOnlyVCF {
             input_vcf = gvs_extract_cohort_filtered_vcfs[i],
             input_vcf_index = gvs_extract_cohort_filtered_vcf_indices[i],
             service_account_json_path = service_account_json_path,
-            output_filename = "${output_sites_only_file_name}_${i}.sites_only.vcf.gz",
-            custom_annotations_template = AnAcAf_annotations_template
+            output_filename = "${output_sites_only_file_name}_${i}.sites_only.vcf.gz"
         }
 
         call ExtractAnAcAfFromVCF {
           input:
             input_vcf = SitesOnlyVcf.output_vcf,
             input_vcf_index = SitesOnlyVcf.output_vcf_idx,
-            custom_annotations_template = SitesOnlyVcf.annotations_template
+            custom_annotations_template = AnAcAf_annotations_template
         }
 
         call AnnotateVCF {
@@ -71,9 +70,9 @@ workflow GvsSitesOnlyVCF {
          input:
              project_id = project_id,
              dataset_name = dataset_name,
+             counts_variants = ExtractAnAcAfFromVCF.count_variants,
              table_suffix = table_suffix,
              service_account_json_path = service_account_json_path,
-             annotation_jsons = AnnotateVCF.annotation_json,
              load_jsons_done = BigQueryLoadJson.done
          }
 }
@@ -85,7 +84,6 @@ task SitesOnlyVcf {
         File input_vcf_index
         String? service_account_json_path
         String output_filename
-        File custom_annotations_template
     }
     String output_vcf_idx = basename(output_filename) + ".tbi"
 
@@ -124,9 +122,6 @@ task SitesOnlyVcf {
                 --sites-only-vcf-output \
                 -O ~{output_filename}
 
-
-        gsutil cp ~{custom_annotations_template} .
-
      >>>
     # ------------------------------------------------
     # Runtime settings:
@@ -142,7 +137,6 @@ task SitesOnlyVcf {
     output {
         File output_vcf="~{output_filename}"
         File output_vcf_idx="~{output_vcf_idx}"
-        File annotations_template="~{custom_annotations_template}"
     }
 }
 
@@ -152,17 +146,26 @@ task ExtractAnAcAfFromVCF {
         File input_vcf_index
         File custom_annotations_template
     }
+
     String custom_annotations_file_name = "an_ac_af.tsv"
+
     # separate multi-allelic sites into their own lines, remove deletions and extract the an/ac/af
     command <<<
         set -e
+
         cp ~{custom_annotations_template} ~{custom_annotations_file_name}
+
         bcftools norm -m- ~{input_vcf} | bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%AN\t%AC\t%AF\n' | grep -v "*" >> ~{custom_annotations_file_name}
+
+        ### for validation of the pipeline
+        bcftools norm -m- ~{input_vcf} | grep -v "AC=0;" | grep "AC=" | grep "AN=" | grep "AF=" | grep -v "*" | wc -l > count.txt
+        # I find this ^ clearer, but could also do a regex like:  grep "AC=[1-9][0-9]*;A[N|F]=[.0-9]*;A[N|F]=[.0-9]*"
+        # Should this be where we do the filtering of the AC/AN/AF values rather than in the python? still have the <20 issue...?
     >>>
     # ------------------------------------------------
     # Runtime settings:
     runtime {
-        docker: "biocontainers/bcftools:v1.9-1-deb_cv1"
+        docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210726"
         memory: "1 GB"
         preemptible: 3
         cpu: "1"
@@ -171,7 +174,8 @@ task ExtractAnAcAfFromVCF {
     # ------------------------------------------------
     # Outputs:
     output {
-        File annotations_file="~{custom_annotations_file_name}"
+        File annotations_file = "~{custom_annotations_file_name}"
+        Int count_variants = read_int("count.txt")
     }
 }
 
@@ -283,7 +287,7 @@ task PrepAnnotationJson {
     # ------------------------------------------------
     # Runtime settings:
     runtime {
-        docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20200718"
+        docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210726"
         memory: "3 GB"
         preemptible: 5
         cpu: "1"
@@ -459,179 +463,58 @@ task BigQueryLoadJson {
     }
 }
 
-task BigQuerySmokeTest { # TO BE BROKEN UP AND POTENTIALLY RUN SEPARATELY
+task BigQuerySmokeTest {
     input {
         String project_id
         String dataset_name
-        Array[File] annotation_jsons
-        Boolean load_jsons_done
-        String? service_account_json_path
+        Array[Int] counts_variants
         String table_suffix
+        String? service_account_json_path
+        Boolean load_jsons_done
     }
-
     # Now query the final table for expected results
+    # Compare the number of variants we expect from the input with the size of the output / VAT
+    # The number of passing variants in GVS matches the number of variants in the VAT.
+    # Please note that we are counting the number of variants in GVS, not the number of sites, which may add a difficulty to this task.
 
     String vat_table = "vat_" + table_suffix
 
     String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
 
     command <<<
-        set +e
-
+        set -e
         if [ ~{has_service_account_file} = 'true' ]; then
             gsutil cp ~{service_account_json_path} local.service_account.json
-            export GOOGLE_APPLICATION_CREDENTIALS=local.service_account.json
             gcloud auth activate-service-account --key-file=local.service_account.json
             gcloud config set project ~{project_id}
         fi
-
-        # For the validation rules that use count in SQL--the results are parsed to grab all digits in the response, which will also include error responses
-
-        # ------------------------------------------------
-        # VALIDATION #1
-        echo  "VALIDATION #1"
-        # The pipeline completed without an error message
-        # The VAT has data in it
-
-        bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT (DISTINCT vid) FROM `~{dataset_name}.~{vat_table}`'
-        BQ_VAT_VARIANT_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT (DISTINCT vid) AS count FROM `~{dataset_name}.~{vat_table}`'| tr -dc '0-9')
-        echo $BQ_VAT_VARIANT_COUNT
-
-        if [[ $BQ_VAT_VARIANT_COUNT -le 0 ]]
-        then
-          echo "The VAT has no data in it"
-          echo  "Validation has failed"
-        else
-          echo "The VAT has been updated"
-          echo "Validation #1 has passed"
-        fi
-
-        # ------------------------------------------------
-        # VALIDATION #2
-        # The number of passing variants in GVS matches the number of variants in the VAT.
-        echo  "VALIDATION #2"
-        # Please note that we are counting the number of variants in GVS, not the number of sites, which may add a difficulty to this task.
-        ANNOTATE_JSON_VARIANT_COUNT=$(gunzip -dc ~{sep=" " annotation_jsons} | grep -o -i '"vid":' | wc -l)
-
-        echo $ANNOTATE_JSON_VARIANT_COUNT
-
-        if [[ $ANNOTATE_JSON_VARIANT_COUNT -ne $BQ_VAT_VARIANT_COUNT ]]
-        then
-          echo "The number of variants is incorrect"
-          echo  "Validation has failed"
-        else
-          echo "The number of passing variants in GVS matches the number of variants in the VAT"
-          echo "Validation #2 has passed"
-        fi
+        echo "project_id = ~{project_id}" > ~/.bigqueryrc
 
         # ------------------------------------------------
-        # VALIDATION #3
-        # All variants in the TESK2 gene region (chr1:45,343,883-45,491,163) list multiple genes and those genes are always TESK2 and AL451136.1.
-        echo  "VALIDATION #3"
-
-        # Note: Lee has swapped this valdation to chr19
-        # SELECT gene_symbol, consequence  FROM `~{dataset_name}.~{vat_table}` WHERE contig = "chr19" and position >= 35740407 and position <= 35740469 and gene_symbol!="IGFLR1" and gene_symbol!="AD000671.2"
-
-        POSITIONAL_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT (DISTINCT vid) AS distinct_vid_count FROM `~{dataset_name}.~{vat_table}` WHERE contig = "chr1" and position >= 45343883 and position <= 45491163'| tr -dc '0-9')
-        TESK2_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT (DISTINCT vid) AS distinct_vid_count FROM `~{dataset_name}.~{vat_table}` WHERE contig = "chr1" and position >= 45343883 and position <= 45491163 and gene_symbol="TESK2"'| tr -dc '0-9')
-        AL451136_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT (DISTINCT vid) AS distinct_vid_count FROM `~{dataset_name}.~{vat_table}` WHERE contig = "chr1" and position >= 45343883 and position <= 45491163 and gene_symbol="AL451136.1"'| tr -dc '0-9')
-        GENE_COUNT_SUM=$(( $TESK2_COUNT + $AL451136_COUNT ))
-
-        if [[ $POSITIONAL_COUNT -ne $GENE_COUNT_SUM ]]
-          then echo "There are unexpected genes in the TESK2 region"
-          echo  "Validation has failed"
-        else
-          echo "All variants in the TESK2 gene region list multiple genes and those genes are always TESK2 and AL451136.1"
-          echo "Validation #4 has passed"
-        fi
-
-        # ------------------------------------------------
-        # VALIDATION #4
-        # If a vid has a null transcript, then the vid is only in one row of the VAT.
-        echo  "VALIDATION #4"
-        # Get a count of all the rows in the vat with no transcript
-        # Get a count of all distinct VID in vat with no transcript
-        # Make sure they are the same
-        BQ_VAT_ROWS_NO_TRANSCRIPT_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT (DISTINCT vid) AS distinct_vid_count FROM `~{dataset_name}.~{vat_table}` WHERE transcript IS NULL'| tr -dc '0-9')
-        BQ_VAT_VARIANT_NO_TRANSCRIPT_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT(*) AS count FROM `~{dataset_name}.~{vat_table}` WHERE transcript IS NULL'| tr -dc '0-9')
-
-        if [[ $BQ_VAT_ROWS_NO_TRANSCRIPT_COUNT -ne $BQ_VAT_VARIANT_NO_TRANSCRIPT_COUNT ]]
-        then
-          echo "The number of rows for variants with no transcripts is incorrect"
-          echo  "Validation has failed"
+        # VALIDATION CALCULATION
+        # sum all the initial input variants across the shards
+
+        INITIAL_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variants}]))")
+
+        # Count number of variants in the VAT
+        bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{dataset_name}.~{vat_table}`' > bq_variant_count.csv
+        VAT_COUNT=$(python3 -c "csvObj=open('bq_variant_count.csv','r');csvContents=csvObj.read();print(csvContents.split('\n')[1]);")
+        # if the result of the bq call and the csv parsing is a series of digits, then check that it matches the input
+        if [[ $VAT_COUNT =~ ^[0-9]+$ ]]; then
+            if [[ $INITIAL_VARIANT_COUNT -ne $VAT_COUNT ]]; then
+                echo "FAIL: The VAT table ~{vat_table} and the input files had mismatched variant counts."
+            else
+                echo "PASS: The VAT table ~{vat_table} has $VAT_COUNT variants in it, which is the expected number."
+            fi
+        # otherwise, something is off, so return the output from the bq query call
         else
-          echo "If a vid has a null transcript, then the vid is only in one row of the VAT"
-          echo "Validation #4 has passed"
+            echo "Something went wrong. The attempt to count the variants returned: " $(cat bq_variant_count.csv)
         fi
-
-        # ------------------------------------------------
-        # VALIDATION #5
-        # There is a non-zero number of transcript fields with null values in the VAT.
-        echo  "VALIDATION #5"
-
-        if [[ $BQ_VAT_VARIANT_NO_TRANSCRIPT_COUNT -gt 0 ]]
-        then
-          echo  "Validation has failed"
-        else
-          echo "Validation #5 has passed"
-        fi
-
-        # ------------------------------------------------
-        # VALIDATION #6
-        # No non-nullable fields contain null values.
-        # non-nullable fields: vid, contig, position, ref_allele, alt_allele, gvs_all_ac, gvs_all_an, gvs_all_af, variant_type, genomic_location
-        echo  "VALIDATION #6"
-
-        BQ_VAT_NULL=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT (DISTINCT vid) as count FROM `~{dataset_name}.~{vat_table}` WHERE vid IS NULL OR contig IS NULL OR position IS NULL OR ref_allele IS NULL OR alt_allele IS NULL OR variant_type IS NULL OR genomic_location IS NULL')
-        echo $BQ_VAT_NULL
-
-        if [[ $BQ_VAT_NULL -gt 0 ]]
-        then
-          echo "A required value is null"
-          echo  "Validation has failed"
-        else
-          echo "No required values are null"
-          echo "Validation #6 has passed"
-        fi
-
-        # ------------------------------------------------
-        # VALIDATION #7
-        # Each key combination (vid+transcript) is unique.
-        echo  "VALIDATION #7"
-        # get the sum of all the distinct vids where transcript is null and all the distinct transcript where transcript is not null
-        BQ_VAT_UNIQUE_IDS_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT(*) AS count FROM (SELECT vid, transcript FROM `~{dataset_name}.~{vat_table}` group by vid, transcript)')
-        BQ_VAT_ROW_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT(*) AS count FROM `~{dataset_name}.~{vat_table}` ')
-
-        if [ $BQ_VAT_UNIQUE_IDS_COUNT -ne $BQ_VAT_ROW_COUNT ];
-        then echo "There are duplicate variant - transcript rows"
-          echo  "Validation has failed"
-        else:
-          echo "Each key combination is unique"
-          echo "Validation #7 has passed"
-        fi
-
-        # ------------------------------------------------
-
-        # VALIDATION #8
-        # If a vid has any non-null transcripts then one transcript must be Ensembl (transcript_source). Every transcript_source is Ensembl or null.
-        echo  "VALIDATION #8"
-        BQ_VAT_ENSEMBL_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT(*) AS count FROM `~{dataset_name}.~{vat_table}` where transcript is not null and transcript_source="Ensembl"')
-        BQ_VAT_TRANSCRIPT_COUNT=$(bq query --nouse_legacy_sql --project_id=~{project_id} 'SELECT COUNT(*) AS count FROM `~{dataset_name}.~{vat_table}` where transcript is not null')
-
-        if [[ $BQ_VAT_ENSEMBL_COUNT -ne $BQ_VAT_TRANSCRIPT_COUNT ]]
-        then
-          echo "All transcripts should be from Ensembl"
-          echo  "Validation has failed"
-        else
-          echo "If a vid has any non-null transcripts then one transcript must be Ensembl"
-          echo "Validation #8 has passed"
-        fi
-
     >>>
     # ------------------------------------------------
     # Runtime settings:
     runtime {
-        docker: "openbridge/ob_google-bigquery:latest"
+        docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
         memory: "1 GB"
         preemptible: 3
         cpu: "1"

diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py
@@ -175,11 +175,10 @@ def make_annotated_json_row(row_position, variant_line, transcript_line):
       row["revel"] = variant_line.get("revel").get("score")
 
     gvs_annotations = variant_line["gvsAnnotations"]
-    if gvs_annotations.get("AC") < 19: # if AC is between 1-18, make the value 19 and then recalculate AF as 19 / AN (we already checked for AC=0)
-      print("I am 19!", variant_line.get("vid"))
-      row["gvs_all_ac"] = 19
+    if gvs_annotations.get("AC") < 20: # if AC is between 1-19, make the value 20 and then recalculate AF as 20 / AN (we already checked for AC=0)
+      row["gvs_all_ac"] = 20
       row["gvs_all_an"] = gvs_annotations.get("AN")
-      row["gvs_all_af"] = 19 / gvs_annotations.get("AN")
+      row["gvs_all_af"] = 20 / gvs_annotations.get("AN")
     else:
       for vat_gvs_alleles_fieldname in vat_nirvana_gvs_alleles_dictionary.keys():  # like "gvs_all_ac"
         nirvana_gvs_alleles_fieldname = vat_nirvana_gvs_alleles_dictionary.get(vat_gvs_alleles_fieldname)