fix: Hash all bug, noxfile updates (GoogleCloudPlatform#413)
* fix: hash '*', updated docs, fixed nox failures due to new click release, cleaned up arg parser

* docs: update row level validation
nehanene15 committed Mar 31, 2022
1 parent 783cdf8 commit fc73e21
Showing 5 changed files with 43 additions and 23 deletions.
19 changes: 13 additions & 6 deletions README.md
@@ -1,4 +1,4 @@
# Data Validation Tool
# Data Validation Tool (Beta)

The Data Validation Tool (Beta) is an open-source Python CLI tool based on the
[Ibis framework](https://ibis-project.org/docs/tutorial/01-Introduction-to-Ibis.html)
@@ -12,10 +12,12 @@ after each migration step (e.g. data and schema migration, SQL script
translation, ETL migration, etc.). The Data Validation Tool (DVT) provides an
automated and repeatable solution to perform this task.

DVT supports the following validation types: * Table level * Table row count *
Group by row count * Column aggregation * Filters and limits * Column level *
Full column data type * Row level hash comparison (BigQuery tables only) * Raw
SQL exploration * Run custom queries on different data sources
DVT supports the following validations:
* Column validation (count, sum, avg, min/max, group_by)
* Row level validation
* Schema validation
* Custom Query validation
* Raw SQL exploration

DVT supports the following connection types:

@@ -35,6 +37,10 @@ DVT supports the following connection types:
The [Connections](docs/connections.md) page provides details about how to create
and list connections for the validation tool.

### Disclaimer
This is not an officially supported Google product. Please be aware that bugs may lurk, and that we reserve the right to make small backwards-incompatible changes. Feel free to open bugs or feature requests, or contribute directly
(see [CONTRIBUTING.md](CONTRIBUTING.md) for details).

## Installation

The [Installation](docs/installation.md) page describes the prerequisites and
@@ -136,7 +142,8 @@ used to run powerful validations without writing any queries.

#### Row Validations

(Note: Row hash validation is currently only supported for BigQuery, Teradata, and Impala/Hive)
(Note: Row hash validation is currently only supported for BigQuery, Teradata, and Impala/Hive. Struct and array
data types are not currently supported.)

Below is the command syntax for row validations. In order to run row level
validations you need to pass a `--primary-key` flag which defines what field(s)
11 changes: 7 additions & 4 deletions data_validation/__main__.py
@@ -136,6 +136,7 @@ def build_config_from_args(args, config_manager):
config_manager (ConfigManager): Validation config manager instance.
"""
config_manager.append_calculated_fields(get_calculated_config(args, config_manager))

if config_manager.validation_type == consts.COLUMN_VALIDATION:
config_manager.append_aggregates(get_aggregate_config(args, config_manager))
if args.grouped_columns is not None:
@@ -151,15 +152,16 @@
config_manager.append_comparison_fields(
config_manager.build_config_comparison_fields(comparison_fields)
)
config_manager.append_dependent_aliases(comparison_fields)
if args.hash != "*":
config_manager.append_dependent_aliases(comparison_fields)

if args.primary_keys is not None:
primary_keys = cli_tools.get_arg_list(args.primary_keys)
config_manager.append_primary_keys(
config_manager.build_config_comparison_fields(primary_keys)
)
config_manager.append_dependent_aliases(primary_keys)

# TODO(GH#18): Add query filter config logic
if args.hash != "*":
config_manager.append_dependent_aliases(primary_keys)

if config_manager.validation_type == consts.CUSTOM_QUERY:
config_manager.append_aggregates(get_aggregate_config(args, config_manager))
@@ -169,6 +171,7 @@
if args.target_query_file is not None:
query_file = cli_tools.get_arg_list(args.target_query_file)
config_manager.append_target_query_file(query_file)

return config_manager


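The `args.hash != "*"` guard above is the core of the "hash all" fix. Below is a minimal sketch of the resulting control flow; `FakeConfigManager` and `apply_hash_all_guard` are hypothetical stand-ins (the real `ConfigManager` lives elsewhere in the package), and the stated rationale — that `--hash '*'` already folds every column into the hash expression, so those columns must not be registered again as dependent aliases — is an inference from the commit, not documented behavior.

```python
# Minimal sketch, not the real implementation: FakeConfigManager is a
# hypothetical stand-in for data_validation's ConfigManager.
class FakeConfigManager:
    def __init__(self):
        self.dependent_aliases = []

    def append_dependent_aliases(self, columns):
        self.dependent_aliases.extend(columns)


def apply_hash_all_guard(manager, primary_keys, hash_arg):
    # Assumption: with --hash '*' every column is already swept into the
    # generated hash expression, so appending the keys again as dependent
    # aliases would register them twice -- the bug this commit fixes.
    if hash_arg != "*":
        manager.append_dependent_aliases(primary_keys)
    return manager


# With an explicit column list the aliases are still appended ...
assert apply_hash_all_guard(FakeConfigManager(), ["id"], "col_a,col_b").dependent_aliases == ["id"]
# ... but hashing '*' now skips the duplicate registration.
assert apply_hash_all_guard(FakeConfigManager(), ["id"], "*").dependent_aliases == []
```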
19 changes: 7 additions & 12 deletions data_validation/cli_tools.py
@@ -403,14 +403,10 @@ def _configure_row_parser(row_parser):
"-comp-fields",
help="Individual columns to compare. If comparing a calculated field use the column alias.",
)
row_parser.add_argument(
"--calculated-fields",
"-calc-fields",
help="list of calculated fields to generate.",
)
row_parser.add_argument(
"--primary-keys",
"-pk",
required=True,
help="Comma separated list of primary key columns 'col_a,col_b'",
)
row_parser.add_argument(
@@ -488,11 +484,6 @@ def _configure_column_parser(column_parser):
"-comp-fields",
help="list of fields to perform exact comparisons to. Use column aliases if this is calculated.",
)
column_parser.add_argument(
"--calculated-fields",
"-calc-fields",
help="list of calculated fields to generate.",
)
column_parser.add_argument(
"--grouped-columns",
"-gc",
@@ -612,8 +603,12 @@ def _configure_custom_query_parser(custom_query_parser):


def _add_common_arguments(parser):
parser.add_argument("--source-conn", "-sc", help="Source connection name")
parser.add_argument("--target-conn", "-tc", help="Target connection name")
parser.add_argument(
"--source-conn", "-sc", required=True, help="Source connection name"
)
parser.add_argument(
"--target-conn", "-tc", required=True, help="Target connection name"
)
parser.add_argument(
"--tables-list",
"-tbls",
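The changes in this file tighten the CLI contract: `--primary-keys` becomes mandatory for row validations, the connection flags become mandatory for every command, and the unused `--calculated-fields` arguments are dropped. A small, self-contained sketch of what `required=True` buys — plain argparse behavior, not project code:

```python
import argparse

parser = argparse.ArgumentParser(prog="data-validation")
parser.add_argument("--source-conn", "-sc", required=True, help="Source connection name")
parser.add_argument("--target-conn", "-tc", required=True, help="Target connection name")

# Omitting either flag now fails fast with a usage message instead of
# passing a None connection name deeper into the validation run.
try:
    parser.parse_args([])
except SystemExit:
    print("argparse rejected the call: --source-conn/--target-conn are required")
```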
4 changes: 3 additions & 1 deletion noxfile.py
@@ -93,7 +93,9 @@ def lint(session):
serious code quality issues.
"""

_setup_session_requirements(session, extra_packages=["flake8", "black==19.10b0"])
_setup_session_requirements(
session, extra_packages=["flake8", "black==19.10b0", "click==8.0.4"]
)
session.install("--upgrade", "pip", "wheel")
session.run("flake8", "data_validation")
session.run("flake8", "tests")
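The `click==8.0.4` pin addresses the nox failures mentioned in the commit message: click 8.1.0, released days before this commit, removed the private `_unicodefun` helper that black 19.10b0 imports, so unpinned lint sessions began failing. A condensed sketch of a pinned session, assuming `_setup_session_requirements` simply installs the listed extra packages:

```python
import nox


@nox.session
def lint(session):
    # Pin click so black 19.10b0 can still import click._unicodefun;
    # click 8.1.0 removed it and broke older black releases.
    session.install("flake8", "black==19.10b0", "click==8.0.4")
    session.run("flake8", "data_validation")
    session.run("black", "--check", "data_validation")
```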
13 changes: 13 additions & 0 deletions third_party/ibis/ibis_addon/datatypes.py
@@ -15,6 +15,7 @@

from google.cloud import bigquery
from ibis_bigquery.client import _DTYPE_TO_IBIS_TYPE
from ibis_bigquery.datatypes import ibis_type_to_bigquery_type, TypeTranslationContext, trans_numeric_udf
import ibis.expr.datatypes as dt
from ibis.backends.pandas.client import _inferable_pandas_dtypes
import pyarrow
@@ -33,3 +34,15 @@
_inferable_pandas_dtypes['datetime64'] = dt.timestamp
_inferable_pandas_dtypes['datetime'] = dt.timestamp
_inferable_pandas_dtypes['time'] = dt.time

# Patch a bug in Ibis BigQuery's Decimal translation that was fixed upstream in version 2.1.1
@ibis_type_to_bigquery_type.register(dt.Decimal, TypeTranslationContext)
def trans_numeric(t, context):
if (t.precision, t.scale) != (38, 9):
raise TypeError(
'BigQuery only supports decimal types with precision of 38 and '
'scale of 9'
)
return 'NUMERIC'

trans_numeric_udf = trans_numeric
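Because `ibis_type_to_bigquery_type` is a multiple-dispatch function, the registration above overrides Decimal translation globally once this module is imported. A hedged usage sketch — it assumes ibis-bigquery < 2.1.1 (where the upstream bug exists) and that `TypeTranslationContext` can be instantiated directly:

```python
import ibis.expr.datatypes as dt

context = TypeTranslationContext()

# BigQuery's NUMERIC is exactly Decimal(38, 9) ...
assert ibis_type_to_bigquery_type(dt.Decimal(38, 9), context) == "NUMERIC"

# ... and any other precision/scale is rejected before a query is built.
try:
    ibis_type_to_bigquery_type(dt.Decimal(20, 4), context)
except TypeError as exc:
    print(exc)
```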
