fix: Status when source and target agg values are 0 (#393)
* fix: doc updates, status bug fix

* fix:merge conflicts

* fix: lint

* fix: update unit test

* lint
nehanene15 committed Mar 18, 2022
1 parent ed46295 commit 6a41f68
Showing 7 changed files with 33 additions and 19 deletions.
25 changes: 17 additions & 8 deletions README.md
@@ -1,6 +1,6 @@
# Data Validation Tool

-The Data Validation Tool (DVT) is an open sourced Python CLI tool based on the
+The Data Validation Tool (Beta) is an open sourced Python CLI tool based on the
[Ibis framework](https://ibis-project.org/docs/tutorial/01-Introduction-to-Ibis.html)
that compares heterogeneous data source tables with multi-leveled validation
functions.
@@ -9,7 +9,7 @@
Data validation is a critical step in a Data Warehouse, Database or Data Lake
migration project, where structured or semi-structured data from both the source
and the destination tables are compared to ensure they are matched and correct
after each migration step (e.g. data and schema migration, SQL script
-translation, ETL migration, etc.). The Data Validation Tool provides an
+translation, ETL migration, etc.). The Data Validation Tool (DVT) provides an
automated and repeatable solution to perform this task.

DVT supports the following validation types: * Table level * Table row count *
@@ -136,11 +136,20 @@
used to run powerful validations without writing any queries.

#### Row Validations

+(Note: Row hash validation is currently only supported for BigQuery, Teradata, and Impala/Hive)
+
Below is the command syntax for row validations. In order to run row level
validations you need to pass a `--primary-key` flag which defines what field(s)
-the validation will be compared along, as well as a `--comparison-fields` flag
-which specifies the values (e.g. columns) whose raw values will be compared
-based on the primary key join. Additionally you can use
+the validation will be compared on, as well as either the `--comparison-fields` flag
+or the `--hash` flag.
+
+The `--comparison-fields` flag specifies the values (e.g. columns) whose raw values will be compared
+based on the primary key join. The `--hash` flag will run a checksum across all columns in
+the table. This will include casting to string, sanitizing the data, concatenating, and finally
+hashing the row. To exclude columns from the checksum, use the YAML config to customize the validation.
+
+
+Additionally you can use
[Calculated Fields](#calculated-fields) to compare derived values such as string
counts and hashes of multiple columns.
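To make the checksum described above concrete, here is a minimal sketch of the cast, sanitize, concatenate, and hash steps behind `--hash`; it is an illustration only, not DVT's internal implementation (the helper name and the exact sanitization rules are assumptions):

```python
import hashlib

def row_hash(row: dict) -> str:
    # Hypothetical illustration of the --hash pipeline: cast each value to
    # string, sanitize it, concatenate in a stable column order, then hash.
    parts = []
    for column in sorted(row):
        value = row[column]
        text = "" if value is None else str(value)  # cast to string
        parts.append(text.strip())                  # sanitize (trim whitespace)
    concatenated = "|".join(parts)                  # concatenate
    return hashlib.sha256(concatenated.encode("utf-8")).hexdigest()  # hash

# Rows whose sanitized string values match produce the same digest:
assert row_hash({"a": 1, "b": " x "}) == row_hash({"a": "1", "b": "x"})
```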

@@ -156,12 +165,12 @@
data-validation (--verbose or -v) validate row
Comma separated list of tables in the form schema.table=target_schema.target_table
Target schema name and table name are optional.
i.e 'bigquery-public-data.new_york_citibike.citibike_trips'
-[--primary-keys or -pk PRIMARY_KEYS]
+--primary-keys or -pk PRIMARY_KEYS
Comma separated list of columns to use as primary keys
-[--comparison-fields or -fields comparison-fields]
+--comparison-fields or -comp-fields FIELDS
Comma separated list of columns to compare. Can either be a physical column or an alias
See: *Calculated Fields* section for details
-[--hash COLUMNS] Comma separated list of columns to perform a hash operation on or * for all columns
+--hash '*' '*' to hash all columns. To exclude columns, use the YAML config.
[--bq-result-handler or -bqrh PROJECT_ID.DATASET.TABLE]
BigQuery destination for validation results. Defaults to stdout.
See: *Validation Reports* section
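For orientation, a row validation using the new hash flag might be invoked as follows; the connection names and the `-sc`/`-tc`/`-tbls` flags are assumptions based on the tool's full usage text, which this excerpt truncates:

```
data-validation validate row \
  -sc my_source_conn \
  -tc my_target_conn \
  -tbls bigquery-public-data.new_york_citibike.citibike_trips \
  --primary-keys bikeid \
  --hash '*'
```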
11 changes: 10 additions & 1 deletion data_validation/combiner.py
@@ -115,7 +115,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
)
else:
difference = (target_value - source_value).cast("float64")
-pct_difference = (
+
+pct_difference_nonzero = (
ibis.literal(100.0)
* difference
/ (
@@ -126,6 +127,14 @@
).cast("float64")
).cast("float64")

+# Considers case that source and target agg values can both be 0
+pct_difference = (
+    ibis.case()
+    .when(difference == ibis.literal(0), ibis.literal(0).cast("float64"))
+    .else_(pct_difference_nonzero)
+    .end()
+)

th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
status = (
ibis.case()
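The effect of the new `ibis.case()` guard is easier to see in plain Python. Below is a minimal sketch of the same logic, assuming the elided denominator is derived from the source aggregate; it is an illustration, not the module's API:

```python
def pct_difference(source_value: float, target_value: float) -> float:
    # When difference == 0 (including source == target == 0), report 0.0
    # instead of evaluating 0/0, which previously surfaced as NaN.
    difference = target_value - source_value
    if difference == 0:
        return 0.0
    return 100.0 * difference / source_value

assert pct_difference(0, 0) == 0.0       # the case this commit fixes
assert pct_difference(100, 110) == 10.0  # nonzero path is unchanged
```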
2 changes: 1 addition & 1 deletion data_validation/consts.py
@@ -120,7 +120,7 @@
RESULT_TYPE_TARGET = "target"

# Ibis Object Info
NUMERIC_DATA_TYPES = ["float64", "int32", "int64", "decimal"]
NUMERIC_DATA_TYPES = ["float64", "int8", "int16", "int32", "int64", "decimal"]

FORMAT_TYPES = ["csv", "json", "table", "text"]

8 changes: 4 additions & 4 deletions docs/connections.md
@@ -268,10 +268,10 @@
Please note that for Group By validations, the following property must be set in

`set hive:hive.groupby.orderby.position.alias=true`

-If you are running Hive on Dataproc, you will also need to run
-`pip install ibis-framework[impala]`
-
-Currently only INT, BIGINT, FLOAT, and DOUBLE data types are supported for Hive aggregation.
+If you are running Hive on Dataproc, you will also need to install the following:
+```
+pip install ibis-framework[impala]
+```

```
{
3 changes: 1 addition & 2 deletions tests/unit/test_data_validation.py
@@ -13,7 +13,6 @@
# limitations under the License.

import json
-import numpy
import pandas
import pytest
import random
@@ -501,7 +500,7 @@ def test_zero_both_values(module_under_test, fs):
col_a_result_df = result_df[result_df.validation_name == "count_col_a"]
col_a_pct_diff = col_a_result_df.pct_difference.values[0]

-assert numpy.isnan(col_a_pct_diff)
+assert col_a_pct_diff == 0.0


def test_status_success_validation(module_under_test, fs):
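The updated assertion mirrors the combiner change above. As a self-contained sketch of the expectation (a hypothetical test, not the repository's fixture setup):

```python
def test_pct_difference_zero_when_both_aggregates_are_zero():
    # With source and target counts both 0, the difference is 0, so the
    # reported percentage difference should be 0.0 rather than NaN.
    source_count, target_count = 0, 0
    difference = target_count - source_count
    pct = 0.0 if difference == 0 else 100.0 * difference / source_count
    assert pct == 0.0
```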
Expand Down
2 changes: 0 additions & 2 deletions third_party/ibis/ibis_addon/operations.py
@@ -43,7 +43,6 @@
# from third_party.ibis.ibis_snowflake.compiler import SnowflakeExprTranslator
# from third_party.ibis.ibis_oracle.compiler import OracleExprTranslator <<<<<< DB2

-
class BitXor(Reduction):
"""Aggregate bitwise XOR operation."""

@@ -124,7 +123,6 @@ def format_hashbytes_teradata(translator, expr):
else:
raise ValueError(f"unexpected value for 'how': {how}")

-
def format_hashbytes_hive(translator, expr):
arg, how = expr.op().args
compiled_arg = translator.translate(arg)
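These translator helpers turn an Ibis hash expression into engine-specific SQL. A hedged sketch of the pattern visible in this file follows; the exact Hive function call is an assumption based on Hive's built-in `sha2`, not this file's verbatim code:

```python
def format_hashbytes_hive_sketch(translator, expr):
    # Unpack the hashed argument and the requested algorithm name.
    arg, how = expr.op().args
    compiled_arg = translator.translate(arg)
    if how == "sha256":
        # Hive exposes SHA-2 digests via sha2(input, bit_length).
        return f"sha2({compiled_arg}, 256)"
    raise ValueError(f"unexpected value for 'how': {how}")
```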
1 change: 0 additions & 1 deletion third_party/ibis/ibis_impala/api.py
@@ -22,7 +22,6 @@

_impala_to_ibis_type = udf._impala_to_ibis_type

-
def impala_connect(
host=None,
port=10000,
