From 14b506b34ef23fce6764ad62e763b15aac881ccc Mon Sep 17 00:00:00 2001
From: Neha Nene
Date: Sun, 4 Dec 2022 11:00:27 -0600
Subject: [PATCH] feat: Support custom calculated fields (#637)

* feat: support custom calc fields

* add consts

* fix: bug where logmech was deleted from TD connection
---
 README.md                          | 41 ++++++++++++++--
 data_validation/cli_tools.py       |  1 +
 data_validation/consts.py          |  2 +
 .../query_builder/query_builder.py | 13 +++--
 docs/examples.md                   | 47 +++++++++++++++++++
 5 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 68975d8bf..62fe90630 100644
--- a/README.md
+++ b/README.md
@@ -467,9 +467,9 @@ Grouped Columns contain the fields you want your aggregations to be broken
 out by, e.g. `SELECT last_updated::DATE, COUNT(*) FROM my.table` will produce a
 resultset that breaks down the count of rows per calendar date.
 
-### Hash and Comparison Fields
+### Hash, Concat, and Comparison Fields
 
-Row level validations can involve either a hash/checksum or comparison fields.
+Row level validations can involve a hash/checksum, concat, or comparison fields.
 A hash validation (`--hash '*'`) will first sanitize the data with the following
 operations on all or selected columns: CAST to string, IFNULL replace with a default
 replacement string, RSTRIP, and UPPER. Then, it will CONCAT() the results
@@ -477,8 +477,11 @@ and run a SHA256() hash and compare the source and target results. Since each row will
 be returned in the result set, it is recommended to utilize the `--use-random-row` feature
 to validate a subset of the table.
 
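+For intuition, the sanitization pipeline above maps to an Ibis expression of roughly
+this shape (a sketch only: `my_table`, `col_a`, `col_b`, and the `<default>` replacement
+string are placeholders, and the final SHA256() step is applied on top where the engine
+supports it):
+
+```python
+import ibis
+
+# Hypothetical two-column table standing in for the table being validated.
+t = ibis.table([("col_a", "string"), ("col_b", "string")], name="my_table")
+
+def sanitize(col):
+    # CAST to string -> IFNULL default -> RSTRIP -> UPPER, per the steps above.
+    return col.cast("string").fillna("<default>").rstrip().upper()
+
+# CONCAT() the sanitized columns; DVT then hashes the result with SHA256().
+concatenated = sanitize(t.col_a) + sanitize(t.col_b)
+```
+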
-Please note that SHA256 is not a supported function on teradata systems. If you wish to perform
-this comparison on teradata you will need to [deploy a UDF to perform the conversion](https://github.com/akuroda/teradata-udf-sha2/blob/master/src/sha256.c).
+Please note that SHA256 is not a supported function on Teradata systems. If you wish to perform
+this comparison on Teradata, you will need to [deploy a UDF to perform the conversion](https://github.com/akuroda/teradata-udf-sha2/blob/master/src/sha256.c).
+
+The concat validation (`--concat '*'`) will do everything up to, but not including, the hash.
+It will sanitize and concatenate the specified columns, and then compare the resulting values.
 
 Comparison field validations (`--comp-fields column`) involve a value comparison of the
 column values. These values will be compared via a JOIN on their corresponding primary
@@ -499,7 +502,7 @@ Once a calculated field is defined, it can be referenced by other calculated
 fields at any "depth" or higher. Depth controls how many subqueries are executed
 in the resulting query. For example, with the following YAML config...
 
-```
+```yaml
 - calculated_fields:
   - field_alias: rtrim_col_a
     source_calculated_columns: ['col_a']
@@ -539,6 +542,34 @@ a INT field to BIGINT for aggregations. See the
 [Examples page](https://github.com/GoogleCloudPlatform/professional-services-data-validator/blob/develop/docs/examples.md#sample-yaml-with-calc-fields-cast-to-numeric-before-aggregation)
 for a sample cast to NUMERIC.
 
+#### Custom Calculated Fields
+
+DVT supports certain functions required for row hash validation natively (e.g. CAST() and CONCAT()),
+which are listed in the CalculatedField() class methods in the [QueryBuilder](https://github.com/GoogleCloudPlatform/professional-services-data-validator/blob/develop/data_validation/query_builder/query_builder.py).
+
+You can also specify custom functions (e.g. replace() or truncate()) from the Ibis expression
+[API reference](https://ibis-project.org/docs/3.2.0/api/expressions/). Keep in mind these will run
+on both source and target systems. You will need to specify the Ibis API expression and the parameters
+required, if any, with the `params` block as shown below:
+
+```yaml
+- calculated_fields:
+  - depth: 0
+    field_alias: format_start_time
+    source_calculated_columns:
+    - start_time
+    target_calculated_columns:
+    - start_time
+    type: custom
+    ibis_expr: ibis.expr.api.TimestampValue.strftime
+    params:
+    - format_str: '%m%d%Y'
+```
+
+The above block references the [TimestampValue.strftime](https://ibis-project.org/docs/3.2.0/api/expressions/timestamps/#ibis.expr.types.temporal.TemporalValue.strftime) Ibis API expression.
+See the [Examples page](https://github.com/GoogleCloudPlatform/professional-services-data-validator/blob/develop/docs/examples.md#sample-row-validation-yaml-with-custom-calc-field)
+for a sample YAML with a custom calculated field.
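+
+Under the hood (see the `query_builder.py` change below), DVT resolves the `ibis_expr`
+string with Python's `eval()` and applies the unbound method to the column, passing the
+flattened `params` as keyword arguments. A minimal sketch of the equivalence, assuming
+a hypothetical table with a timestamp column `start_time` and that the pinned Ibis
+version exposes `TimestampValue` under `ibis.expr.api`:
+
+```python
+import ibis
+
+# Unbound table standing in for the table being validated (hypothetical).
+t = ibis.table([("start_time", "timestamp")], name="my_table")
+
+# What DVT effectively evaluates for the YAML block above:
+fn = eval("ibis.expr.api.TimestampValue.strftime")
+expr = fn(t.start_time, format_str="%m%d%Y")
+
+# The same expression, written as a direct method call:
+assert expr.equals(t.start_time.strftime("%m%d%Y"))
+```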
+
 ## Contributing
 
 Contributions are welcome. See the [Contributing guide](https://github.com/GoogleCloudPlatform/professional-services-data-validator/blob/develop/CONTRIBUTING.md) for details.
diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py
index 057808d23..a21744498 100644
--- a/data_validation/cli_tools.py
+++ b/data_validation/cli_tools.py
@@ -62,6 +62,7 @@
         ["port", "Teradata port to connect on"],
         ["user_name", "User used to connect"],
         ["password", "Password for supplied user"],
+        ["logmech", "Logon mechanism"],
     ],
     "Oracle": [
         ["host", "Desired Oracle host"],
diff --git a/data_validation/consts.py b/data_validation/consts.py
index a02339098..da2495a98 100644
--- a/data_validation/consts.py
+++ b/data_validation/consts.py
@@ -22,6 +22,8 @@
 CONFIG_TARGET_CONN = "target_conn"
 CONFIG_TYPE = "type"
 CONFIG_DEFAULT_CAST = "default_cast"
+CONFIG_CUSTOM_IBIS_EXPR = "ibis_expr"
+CONFIG_CUSTOM_PARAMS = "params"
 CONFIG_SCHEMA_NAME = "schema_name"
 CONFIG_TABLE_NAME = "table_name"
 CONFIG_TARGET_SCHEMA_NAME = "target_schema_name"
diff --git a/data_validation/query_builder/query_builder.py b/data_validation/query_builder/query_builder.py
index 894b72fa1..de8a9e5d5 100644
--- a/data_validation/query_builder/query_builder.py
+++ b/data_validation/query_builder/query_builder.py
@@ -368,12 +368,17 @@ def cast(config, fields):
         )
 
     @staticmethod
-    def custom(expr):
-        """Returns a CalculatedField instance built for any custom SQL using a supported operator.
+    def custom(config, fields):
+        """Returns a CalculatedField instance built for any custom ibis expression,
+        e.g. 'ibis.expr.api.StringValue.replace'. For a list of supported functions,
+        see https://github.com/ibis-project/ibis/blob/1.4.0/ibis/expr/api.py
 
         Args:
-            expr (Str): A custom SQL expression used to filter a query
+            config (dict): Calc field config including the custom ibis expression and its params
+            fields (list): Fields the custom expression is applied to
         """
-        return CalculatedField(expr)
+        ibis_expr = config.get(consts.CONFIG_CUSTOM_IBIS_EXPR)
+        expr_params = config.get(consts.CONFIG_CUSTOM_PARAMS, [])
+        # Flatten the 'params' list of single-key dicts into one kwargs dict, e.g.
+        # [{'pattern': '/'}, {'replacement': '-'}] -> {'pattern': '/', 'replacement': '-'}.
+        params = {k: v for d in expr_params for k, v in d.items()}
+        return CalculatedField(eval(ibis_expr), config, fields, **params)
 
     def _compile_fields(self, ibis_table, fields):
         compiled_fields = []
diff --git a/docs/examples.md b/docs/examples.md
index addc80a71..85b6c974e 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -231,6 +231,53 @@ validations:
   use_random_rows: false
 ```
 
+#### Sample Row Validation YAML with Custom Calc Field
+
+This is a comparison field validation where DVT will first apply the
+calculated field and then compare the resulting values.
+
+```yaml
+result_handler: {}
+source: my_bq_conn
+target: my_bq_conn
+validations:
+- calculated_fields:
+  - depth: 0
+    field_alias: replace_name
+    source_calculated_columns:
+    - name
+    target_calculated_columns:
+    - name
+    type: custom
+    ibis_expr: ibis.expr.api.StringValue.replace
+    params:
+    - pattern: '/'
+    - replacement: '-'
+  comparison_fields:
+  - cast: null
+    field_alias: replace_name
+    source_column: replace_name
+    target_column: replace_name
+  filter_status: null
+  filters: []
+  format: table
+  labels: []
+  primary_keys:
+  - cast: null
+    field_alias: station_id
+    source_column: station_id
+    target_column: station_id
+  random_row_batch_size: '5'
+  schema_name: bigquery-public-data.new_york_citibike
+  table_name: citibike_stations
+  target_schema_name: bigquery-public-data.new_york_citibike
+  target_table_name: citibike_stations
+  threshold: 0.0
+  type: Row
+  use_random_rows: true
+```
+
+
 #### Run a custom query column validation
 
 ````shell script
 data-validation validate custom-query --custom-query-type column --source-query-file source_query.sql --target-query-file target_query.sql -sc my_bq_conn -tc my_bq_conn
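+# A column-type custom query validation compares aggregates computed over each
+# query's result set. --source-query-file/--target-query-file point to the SQL
+# run on each side; -sc/-tc name the saved source and target connections.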