diff --git a/README.md b/README.md
index 2d9842f94..9e90aca80 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Data Validation Tool

-The Data Validation Tool (DVT) is an open sourced Python CLI tool based on the
+The Data Validation Tool (Beta) is an open sourced Python CLI tool based on the
 [Ibis framework](https://ibis-project.org/docs/tutorial/01-Introduction-to-Ibis.html)
 that compares heterogeneous data source tables with multi-leveled validation
 functions.
@@ -9,7 +9,7 @@ Data validation is a critical step in a Data Warehouse, Database or Data Lake
 migration project, where structured or semi-structured data from both the
 source and the destination tables are compared to ensure they are matched and
 correct after each migration step (e.g. data and schema migration, SQL script
-translation, ETL migration, etc.). The Data Validation Tool provides an
+translation, ETL migration, etc.). The Data Validation Tool (DVT) provides an
 automated and repeatable solution to perform this task.

 DVT supports the following validation types:
 *   Table level
     *   Table row count
@@ -136,11 +136,20 @@ used to run powerful validations without writing any queries.

 #### Row Validations

+(Note: Row hash validation is currently only supported for BigQuery, Teradata, and Impala/Hive)
+
 Below is the command syntax for row validations. In order to run row level
 validations you need to pass a `--primary-key` flag which defines what field(s)
-the validation will be compared along, as well as a `--comparison-fields` flag
-which specifies the values (e.g. columns) whose raw values will be compared
-based on the primary key join. Additionally you can use
+the validation will be compared on, as well as either the `--comparison-fields` flag
+or the `--hash` flag.
+
+The `--comparison-fields` flag specifies the values (e.g. columns) whose raw values will be compared
+based on the primary key join. The `--hash` flag will run a checksum across all columns in
+the table. This will include casting to string, sanitizing the data, concatenating, and finally
+hashing the row. To exclude columns from the checksum, use the YAML config to customize the validation.
+
+
+Additionally you can use
 [Calculated Fields](#calculated-fields) to compare derived values such as
 string counts and hashes of multiple columns.

@@ -156,12 +165,12 @@ data-validation (--verbose or -v) validate row
                         Comma separated list of tables in the form schema.table=target_schema.target_table
                         Target schema name and table name are optional.
                         i.e 'bigquery-public-data.new_york_citibike.citibike_trips'
-  [--primary-keys or -pk PRIMARY_KEYS]
+  --primary-keys or -pk PRIMARY_KEYS
                         Comma separated list of columns to use as primary keys
-  [--comparison-fields or -fields comparison-fields]
+  --comparison-fields or -comp-fields FIELDS
                         Comma separated list of columns to compare. Can either be a physical column or an alias
                         See: *Calculated Fields* section for details
-  [--hash COLUMNS]      Comma separated list of columns to perform a hash operation on or * for all columns
+  --hash '*'            '*' to hash all columns. To exclude columns, use the YAML config.
   [--bq-result-handler or -bqrh PROJECT_ID.DATASET.TABLE]
                         BigQuery destination for validation results. Defaults to stdout.
                         See: *Validation Reports* section
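For context, the row-hash flow documented in the README hunk above could be invoked roughly as follows. This is an illustrative sketch, not part of the patch: the connection names and table identifiers are placeholders, and the `-sc`/`-tc`/`-tbls` flags (source connection, target connection, tables list) are assumed from DVT's standard CLI rather than shown in this diff.

```
data-validation validate row \
  -sc my_source_conn -tc my_target_conn \
  -tbls project.dataset.source_table=project.dataset.target_table \
  --primary-keys id \
  --hash '*'
```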
diff --git a/data_validation/combiner.py b/data_validation/combiner.py
index 44710dd9b..328607e8a 100644
--- a/data_validation/combiner.py
+++ b/data_validation/combiner.py
@@ -115,7 +115,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
         )
     else:
         difference = (target_value - source_value).cast("float64")
-        pct_difference = (
+
+        pct_difference_nonzero = (
             ibis.literal(100.0)
             * difference
             / (
@@ -126,6 +127,14 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
             ).cast("float64")
         ).cast("float64")

+        # Handle the case where the source and target aggregate values are both 0
+        pct_difference = (
+            ibis.case()
+            .when(difference == ibis.literal(0), ibis.literal(0).cast("float64"))
+            .else_(pct_difference_nonzero)
+            .end()
+        )
+
     th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
     status = (
         ibis.case()
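In plain terms, the change above reports a 0% difference whenever the source and target values are equal, instead of the NaN that a 0 / 0 division produced before (the unit-test update further down reflects the same behavior). A rough pure-Python sketch of the intended rule follows; the zero-denominator guard is assumed from the surrounding combiner code, which is not shown in this hunk.

```python
def pct_difference(source_value: float, target_value: float) -> float:
    """Sketch of the percent-difference rule after this change."""
    difference = float(target_value) - float(source_value)
    if difference == 0:
        # Covers source == target, including the 0-vs-0 case that used to yield NaN.
        return 0.0
    # Assumed guard: a zero target value is replaced with 1 to avoid division by zero.
    denominator = float(target_value) if target_value != 0 else 1.0
    return 100.0 * difference / denominator
```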
diff --git a/data_validation/consts.py b/data_validation/consts.py
index 2dc811e3d..09ffbf2b7 100644
--- a/data_validation/consts.py
+++ b/data_validation/consts.py
@@ -120,7 +120,7 @@ RESULT_TYPE_TARGET = "target"

 # Ibis Object Info
-NUMERIC_DATA_TYPES = ["float64", "int32", "int64", "decimal"]
+NUMERIC_DATA_TYPES = ["float64", "int8", "int16", "int32", "int64", "decimal"]

 FORMAT_TYPES = ["csv", "json", "table", "text"]
diff --git a/docs/connections.md b/docs/connections.md
index 5b4da1a0d..2e2fef8cd 100644
--- a/docs/connections.md
+++ b/docs/connections.md
@@ -268,10 +268,10 @@ Please note that for Group By validations, the following property must be set
 in `set hive:hive.groupby.orderby.position.alias=true`

-If you are running Hive on Dataproc, you will also need to run
-`pip install ibis-framework[impala]`
-
 Currently only INT, BIGINT, FLOAT, and DOUBLE data types are supported for Hive aggregation.

+If you are running Hive on Dataproc, you will also need to install the following:
+```
+pip install ibis-framework[impala]
+```

 ```
 {
diff --git a/tests/unit/test_data_validation.py b/tests/unit/test_data_validation.py
index 45d93a7ed..ced133055 100644
--- a/tests/unit/test_data_validation.py
+++ b/tests/unit/test_data_validation.py
@@ -13,7 +13,6 @@
 # limitations under the License.

 import json
-import numpy
 import pandas
 import pytest
 import random
@@ -501,7 +500,7 @@ def test_zero_both_values(module_under_test, fs):
     col_a_result_df = result_df[result_df.validation_name == "count_col_a"]
     col_a_pct_diff = col_a_result_df.pct_difference.values[0]

-    assert numpy.isnan(col_a_pct_diff)
+    assert col_a_pct_diff == 0.0


 def test_status_success_validation(module_under_test, fs):
diff --git a/third_party/ibis/ibis_addon/operations.py b/third_party/ibis/ibis_addon/operations.py
index c505ff223..1795c0dd5 100644
--- a/third_party/ibis/ibis_addon/operations.py
+++ b/third_party/ibis/ibis_addon/operations.py
@@ -43,7 +43,6 @@
 # from third_party.ibis.ibis_snowflake.compiler import SnowflakeExprTranslator
 # from third_party.ibis.ibis_oracle.compiler import OracleExprTranslator <<<<<< DB2

-

 class BitXor(Reduction):
     """Aggregate bitwise XOR operation."""
@@ -124,7 +123,6 @@ def format_hashbytes_teradata(translator, expr):
     else:
         raise ValueError(f"unexpected value for 'how': {how}")

-
 def format_hashbytes_hive(translator, expr):
     arg, how = expr.op().args
     compiled_arg = translator.translate(arg)
diff --git a/third_party/ibis/ibis_impala/api.py b/third_party/ibis/ibis_impala/api.py
index 52e870c3f..bb0046112 100644
--- a/third_party/ibis/ibis_impala/api.py
+++ b/third_party/ibis/ibis_impala/api.py
@@ -22,7 +22,6 @@

 _impala_to_ibis_type = udf._impala_to_ibis_type

-
 def impala_connect(
     host=None,
     port=10000,