diff --git a/samples/bq_utils/README.md b/samples/bq_utils/README.md
new file mode 100644
index 000000000..fe0ca8c41
--- /dev/null
+++ b/samples/bq_utils/README.md
@@ -0,0 +1,40 @@
+# Helper scripts for BigQuery validations
+
+## Dataset-level
+
+We do not natively support validating an entire BQ dataset. This script is a workaround for that task.
+
+The script runs validations on all the BigQuery tables within a provided dataset, **as long as the table names match between the source and target datasets.**
+
+**IMPORTANT:** The script only runs column and schema validations, and only for BigQuery source and target databases.
+
+1. Enter the directory:
+
+```shell script
+cd samples/bq_utils/
+```
+
+2. Grant execution permission to the file:
+
+```shell script
+chmod u+x bq-dataset-level-validation.sh
+```
+
+3. Run the script, passing the following parameters:
+
+```shell script
+./bq-dataset-level-validation.sh [SOURCE_BQ_PROJECT] [SOURCE_BQ_DATASET] [TARGET_BQ_PROJECT] [TARGET_BQ_DATASET] [FULLNAME_BQ_RESULT_HANDLER]
+```
+
+For example:
+
+```shell script
+./bq-dataset-level-validation.sh your-project dataset1 your-project dataset2 your-project.pso_data_validator.results
+```
+
+(Optional) Add a filter. For example, if all your tables have a partition timestamp column and you want to validate a specific timeframe, pass the filter as an additional argument:
+
+```shell script
+./bq-dataset-level-validation.sh your-project dataset1 your-project dataset2 your-project.pso_data_validator.results "--filters 'partitionTs BETWEEN TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL -3 DAY) AND CURRENT_TIMESTAMP()'"
+```
+
diff --git a/samples/bq_utils/bq-dataset-level-validation.sh b/samples/bq_utils/bq-dataset-level-validation.sh
new file mode 100755
index 000000000..c09dabc03
--- /dev/null
+++ b/samples/bq_utils/bq-dataset-level-validation.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Get all table names from the source dataset and save them in a temporary CSV file.
+# Note: --max_results caps the listing at 100 tables; raise it for larger datasets.
+bq ls --max_results 100 "$1:$2" | tail -n +3 | tr -s ' ' | cut -d' ' -f2 > source_tables.csv
+
+# Create the BQ connection for the source
+data-validation connections add \
+    --connection-name my_bq_conn_source BigQuery \
+    --project-id "$1"
+
+# Create the BQ connection for the target
+data-validation connections add \
+    --connection-name my_bq_conn_target BigQuery \
+    --project-id "$3"
+
+input="./source_tables.csv"
+# Perform both a column and a schema validation for every table in the given dataset.
+# Any extra arguments (e.g. a --filters clause) are appended to the column validation.
+while IFS= read -r table
+do
+    command="data-validation validate column -sc my_bq_conn_source -tc my_bq_conn_target -bqrh $5 -tbls $1.$2.$table=$3.$4.$table ${@:6}"
+    eval "$command"
+
+    command="data-validation validate schema -sc my_bq_conn_source -tc my_bq_conn_target -bqrh $5 -tbls $1.$2.$table=$3.$4.$table"
+    eval "$command"
+done < "$input"
+
+# Delete the temporary file.
+rm "$input"
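
To make the argument mapping concrete: with the README's example invocation, each loop iteration of the script runs a pair of commands of the following shape. This is a sketch in which the table name `table_a` is hypothetical; everything else comes straight from the example arguments.

```shell script
# Column validation for one table; the schema validation is identical
# except that "column" is replaced by "schema" and no filters are appended.
data-validation validate column \
    -sc my_bq_conn_source -tc my_bq_conn_target \
    -bqrh your-project.pso_data_validator.results \
    -tbls your-project.dataset1.table_a=your-project.dataset2.table_a
```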
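
The `bq ls` line in the script parses the pretty-printed table listing and caps results at 100 tables. For larger datasets, one possible alternative sketch for that listing step, assuming your dataset exposes the `INFORMATION_SCHEMA.TABLES` view, is:

```shell script
# Hypothetical replacement for the bq ls line: list base tables via
# INFORMATION_SCHEMA and strip the CSV header row.
bq --format=csv query --use_legacy_sql=false --max_rows=10000 \
    "SELECT table_name FROM \`your-project.dataset1.INFORMATION_SCHEMA.TABLES\` WHERE table_type = 'BASE TABLE'" \
    | tail -n +2 > source_tables.csv
```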
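
To spot-check the outcome, you can query the results table passed as the fifth argument. This sketch assumes the default `pso_data_validator.results` schema, in which `start_time` is assumed to be a column name:

```shell script
# Show the ten most recent validation results (the start_time column is an
# assumption based on the default results schema).
bq --format=pretty query --use_legacy_sql=false \
    "SELECT * FROM \`your-project.pso_data_validator.results\` ORDER BY start_time DESC LIMIT 10"
```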