Commit c0cdf9c

Merge remote-tracking branch 'upstream/develop' into develop

FX-HAO committed Apr 15, 2022
2 parents 9427272 + cd88b25

Showing 40 changed files with 964 additions and 228 deletions.

36 changes: 36 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,42 @@

## Untagged

### [1.7.1](https://github.com/GoogleCloudPlatform/professional-services-data-validator/compare/v1.7.0...v1.7.1) (2022-04-14)


### ⚠ BREAKING CHANGES

* Changed result schema 'status' column to 'validation_status' (#420)

### Features

* added timestamp to supported types for min and max ([#431](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/431)) ([e8b4860](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/e8b48603ff851d771c01794262ab1281192dea0e))
* Allow aggregation over length of string columns ([#430](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/430)) ([201f0a2](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/201f0a2a67e79190400aba668fc34df54bf87b66))
* Changed result schema 'status' column to 'validation_status' ([#420](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/420)) ([dfcd0d5](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/dfcd0d5000fb33e64a8580a0747319f5ddbec2ca))
* hash filter support ([#408](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/408)) ([46b3723](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/46b3723390bf050ae33868517478906d63a43304))
* Hash selective columns ([#407](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/407)) ([88b6620](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/88b66201750db3e1a0577f12965551f8955a2525))
* Implement sum/avg/bit_xor aggs for Timestamp ([#442](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/442)) ([51f3af3](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/51f3af3c6bb11beaf7bc08e6b2766e0b5a6b8600))
* improve postgres tests ([#443](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/443)) ([6a54527](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/6a54527b758c56cc1aa0a1a15b355970115fa16f))
* Random Sort for Pandas Queries ([#404](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/404)) ([2051039](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/20510395193e831d8e3350842ce606f4adb80fcb))
* Support for custom query ([#390](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/390)) ([7a218d2](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/7a218d2b516d480dad05f6fb52ed6347aef429ed))


### Bug Fixes

* bug introduced with new pr ([#429](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/429)) ([a6cf3f0](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/a6cf3f03e8f28eb6b3c2cfca41880d839f301637))
* Hash all bug, noxfile updates ([#413](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/413)) ([fc73e21](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/fc73e21810ffe74459a90b79b956a91168c0dc1c))
* Hive boolean nan to None, Unsupported ibis data types in structs and arrays ([#444](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/444)) ([e94a1da](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/e94a1daabf6c5df04720e20b764a8ccf9bc63050))
* ibis default sql option limits query results at 10k rows ([#418](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/418)) ([7539efe](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/7539efe266ecea22102d775dc9ad4c8bbb9dba84))
* Impala strings/objects now return None instead of NaN ([#406](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/406)) ([9d3c5ec](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/9d3c5ecf1babae2c811a30d0820701b124ae1c50))
* issue 265 add cloud spanner functionality ([#394](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/394)) ([783cdf8](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/783cdf8810c29755b26e4894555b6dd03f4c9025))
* support labels for schema validation ([#260](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/260)) ([#381](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/381)) ([f787701](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/f787701dcb505fbced3e12b996c845148bbc1af0))
* Treat both source and target values being NULL as a success ([#437](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/437)) ([c4da5ca](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/c4da5ca18f47af3e5ebadce97d35c25ca66d4003))


### Miscellaneous Chores

* release 1.7.1 ([#446](https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/446)) ([99916ba](https://github.com/GoogleCloudPlatform/professional-services-data-validator/commit/99916ba2c76b8370cabd648e4d1f3c4ec15b93d7))

## [1.7.0](https://github.com/GoogleCloudPlatform/professional-services-data-validator/compare/v1.6.0...v1.7.0) (2022-03-23)


15 changes: 11 additions & 4 deletions README.md
Expand Up @@ -93,6 +93,11 @@ grouped column validations a step further by providing the `--primary-key` flag.
With this flag, if a mismatch is found, DVT will drill into the slice
containing the error and find the row (primary key value) with the inconsistency.

You can specify a list of string columns for aggregations in order to calculate
an aggregation over `length(string_col)`. Running an aggregation
over all columns ('*') will only include numeric columns unless the
`--wildcard-include-string-len` flag is present.
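
For example, a command along these lines (the connection and table names are
illustrative) computes min/max over the numeric columns and over the length of
the string columns:

```
data-validation validate column \
  -sc my_source_conn -tc my_target_conn \
  -tbls my_schema.my_table \
  --min '*' --max '*' \
  --wildcard-include-string-len
```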

```
data-validation (--verbose or -v) validate column
--source-conn or -sc SOURCE_CONN
@@ -120,6 +125,8 @@ data-validation (--verbose or -v) validate column
See: *Validation Reports* section
[--service-account or -sa PATH_TO_SA_KEY]
Service account to use for BigQuery result handler output.
[--wildcard-include-string-len or -wis]
If flag is present, include string columns in aggregation as len(string_col)
[--filters SOURCE_FILTER:TARGET_FILTER]
Colon separated string values of source and target filters.
If target filter is not provided, the source filter will run on source and target tables.
```

@@ -151,9 +158,9 @@ the validation will be compared on, as well as either the `--comparison-fields`
or the `--hash` flag.

The `--comparison-fields` flag specifies the values (e.g. columns) whose raw values will be compared
based on the primary key join. The `--hash` flag will run a checksum across all columns in
the table. This will include casting to string, sanitizing the data, concatenating, and finally
hashing the row. To exclude columns from the checksum, use the YAML config to customize the validation.
based on the primary key join. The `--hash` flag will run a checksum across specified columns in
the table. This will include casting to string, sanitizing the data (ifnull, rtrim, upper), concatenating,
and finally hashing the row.
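
As a rough sketch of that pipeline (illustrative only; the exact hash
function, null placeholder, and separators are backend-specific, so this is
not DVT's actual implementation):

```
import hashlib

def row_checksum(row, columns):
    """Mimic the cast -> ifnull -> rtrim -> upper -> concat -> hash steps."""
    parts = []
    for col in columns:
        value = row.get(col)
        text = "DEFAULT" if value is None else str(value)  # cast to string, ifnull
        parts.append(text.rstrip().upper())                # rtrim, upper
    return hashlib.sha256("".join(parts).encode()).hexdigest()  # concat, hash

# Trailing whitespace and letter case do not change the checksum:
assert row_checksum({"a": "x "}, ["a"]) == row_checksum({"a": "X"}, ["a"])
```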


Additionally, you can use
@@ -177,7 +184,7 @@ data-validation (--verbose or -v) validate row
```
--comparison-fields or -comp-fields FIELDS
Comma separated list of columns to compare. Can either be a physical column or an alias
See: *Calculated Fields* section for details
--hash '*' '*' to hash all columns. To exclude columns, use the YAML config.
--hash COLUMNS Comma separated list of columns to hash or * for all columns
[--bq-result-handler or -bqrh PROJECT_ID.DATASET.TABLE]
BigQuery destination for validation results. Defaults to stdout.
See: *Validation Reports* section
```
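
For instance, a hash-based row validation might look like this (again, the
connection and table names are illustrative):

```
data-validation validate row \
  -sc my_source_conn -tc my_target_conn \
  -tbls my_schema.my_table \
  --primary-key id \
  --hash '*'
```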
30 changes: 21 additions & 9 deletions data_validation/__main__.py
@@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import json
import os
import sys

from yaml import Dumper, dump

from data_validation import (
@@ -27,7 +28,6 @@
from data_validation.config_manager import ConfigManager
from data_validation.data_validation import DataValidation


# by default yaml dumps lists as pointers. This disables that feature
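# (e.g. a config dict referencing the same list twice would otherwise be
# dumped with YAML anchors/aliases such as "&id001" and "*id001")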
Dumper.ignore_aliases = lambda *args: True

@@ -53,6 +53,18 @@ def get_aggregate_config(args, config_manager):
config_manager (ConfigManager): Validation config manager instance.
"""
aggregate_configs = [config_manager.build_config_count_aggregate()]
supported_data_types = [
"float64",
"int8",
"int16",
"int32",
"int64",
"decimal",
"timestamp",
]

if args.wildcard_include_string_len:
supported_data_types.append("string")

if args.count:
col_args = None if args.count == "*" else cli_tools.get_arg_list(args.count)
@@ -62,27 +74,27 @@
if args.sum:
col_args = None if args.sum == "*" else cli_tools.get_arg_list(args.sum)
aggregate_configs += config_manager.build_config_column_aggregates(
"sum", col_args, consts.NUMERIC_DATA_TYPES
"sum", col_args, supported_data_types
)
if args.avg:
col_args = None if args.avg == "*" else cli_tools.get_arg_list(args.avg)
aggregate_configs += config_manager.build_config_column_aggregates(
"avg", col_args, consts.NUMERIC_DATA_TYPES
"avg", col_args, supported_data_types
)
if args.min:
col_args = None if args.min == "*" else cli_tools.get_arg_list(args.min)
aggregate_configs += config_manager.build_config_column_aggregates(
"min", col_args, consts.NUMERIC_DATA_TYPES
"min", col_args, supported_data_types
)
if args.max:
col_args = None if args.max == "*" else cli_tools.get_arg_list(args.max)
aggregate_configs += config_manager.build_config_column_aggregates(
"max", col_args, consts.NUMERIC_DATA_TYPES
"max", col_args, supported_data_types
)
if args.bit_xor:
col_args = None if args.bit_xor == "*" else cli_tools.get_arg_list(args.bit_xor)
aggregate_configs += config_manager.build_config_column_aggregates(
"bit_xor", col_args, consts.NUMERIC_DATA_TYPES
"bit_xor", col_args, supported_data_types
)
return aggregate_configs

@@ -211,7 +223,7 @@ def build_config_managers_from_args(args):
filter_config = cli_tools.get_filters(args.filters)
if args.threshold:
threshold = args.threshold
labels = cli_tools.get_labels(args.labels)

mgr = state_manager.StateManager()
source_client = clients.get_data_client(mgr.get_connection_config(args.source_conn))
@@ -459,7 +471,7 @@ def run_validation_configs(args):


def validate(args):
""" Run commands related to data validation."""
"""Run commands related to data validation."""
if args.validate_cmd in ["column", "row", "schema", "custom-query"]:
run(args)
else:
4 changes: 2 additions & 2 deletions data_validation/app.py
@@ -44,9 +44,9 @@ def validate(config):


def main(request):
""" Handle incoming Data Validation requests.
"""Handle incoming Data Validation requests.
request (flask.Request): HTTP request object.
"""
try:
config = _get_request_content(request)["config"]
36 changes: 24 additions & 12 deletions data_validation/cli_tools.py
@@ -279,7 +279,9 @@ def _configure_run_parser(subparsers):
help="Store the validation in the YAML Config File Path specified",
)
run_parser.add_argument(
"--labels", "-l", help="Key value pair labels for validation run",
"--labels",
"-l",
help="Key value pair labels for validation run",
)
run_parser.add_argument(
"--hash",
@@ -320,6 +322,12 @@ def _configure_run_parser(subparsers):
"-rbs",
help="Row batch size used for random row filters (default 10,000).",
)
run_parser.add_argument(
"--wildcard-include-string-len",
"-wis",
action="store_true",
help="Include string fields for wildcard aggregations.",
)


def _configure_connection_parser(subparsers):
@@ -409,9 +417,6 @@ def _configure_row_parser(row_parser):
required=True,
help="Comma separated list of primary key columns 'col_a,col_b'",
)
row_parser.add_argument(
"--labels", "-l", help="Key value pair labels for validation run"
)
row_parser.add_argument(
"--threshold",
"-th",
@@ -494,9 +499,6 @@ def _configure_column_parser(column_parser):
"-pk",
help="Comma separated list of primary key columns 'col_a,col_b'",
)
column_parser.add_argument(
"--labels", "-l", help="Key value pair labels for validation run"
)
column_parser.add_argument(
"--threshold",
"-th",
@@ -519,6 +521,12 @@
"-rbs",
help="Row batch size used for random row filters (default 10,000).",
)
column_parser.add_argument(
"--wildcard-include-string-len",
"-wis",
action="store_true",
help="Include string fields for wildcard aggregations.",
)


def _configure_schema_parser(schema_parser):
@@ -530,10 +538,14 @@ def _configure_custom_query_parser(custom_query_parser):
"""Configure arguments to run custom-query validations."""
_add_common_arguments(custom_query_parser)
custom_query_parser.add_argument(
"--source-query-file", "-sqf", help="File containing the source sql query",
"--source-query-file",
"-sqf",
help="File containing the source sql query",
)
custom_query_parser.add_argument(
"--target-query-file", "-tqf", help="File containing the target sql query",
"--target-query-file",
"-tqf",
help="File containing the target sql query",
)
custom_query_parser.add_argument(
"--count",
@@ -575,9 +587,6 @@ def _configure_custom_query_parser(custom_query_parser):
"-filters",
help="Filters in the format source_filter:target_filter",
)
custom_query_parser.add_argument(
"--labels", "-l", help="Key value pair labels for validation run"
)
custom_query_parser.add_argument(
"--threshold",
"-th",
Expand Down Expand Up @@ -617,6 +626,9 @@ def _add_common_arguments(parser):
parser.add_argument(
"--bq-result-handler", "-bqrh", help="BigQuery result handler config details"
)
parser.add_argument(
"--labels", "-l", help="Key value pair labels for validation run"
)
parser.add_argument(
"--service-account",
"-sa",
9 changes: 4 additions & 5 deletions data_validation/clients.py
@@ -33,6 +33,8 @@
from data_validation import client_info
from data_validation import consts, exceptions

ibis.options.sql.default_limit = None
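# The line above removes Ibis's default 10,000 row cap on SQL query results so
# validations aggregate over entire tables (changelog #418).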

# Our customized Ibis Datatype logic adds support for new types
third_party.ibis.ibis_addon.datatypes

@@ -146,10 +148,7 @@ def get_ibis_table_schema(client, schema_name, table_name):
table_name (str): Table name of table object
database_name (str): Database name (generally default is used)
"""
if type(client) in [
MySQLClient,
PostgreSQLClient
]:
if type(client) in [MySQLClient, PostgreSQLClient]:
return client.schema(schema_name).table(table_name).schema()
else:
return client.get_schema(table_name, schema_name)
@@ -208,7 +207,7 @@ def get_all_tables(client, allowed_schemas=None):


def get_data_client(connection_config):
""" Return DataClient client from given configuration """
"""Return DataClient client from given configuration"""
connection_config = copy.deepcopy(connection_config)
source_type = connection_config.pop(consts.SOURCE_TYPE)

22 changes: 14 additions & 8 deletions data_validation/combiner.py
@@ -27,7 +27,6 @@

from data_validation import consts


DEFAULT_SOURCE = "source"
DEFAULT_TARGET = "target"

@@ -90,8 +89,7 @@ def generate_report(
print(documented.compile())

result_df = client.execute(documented)
result_df.status.fillna(consts.VALIDATION_STATUS_FAIL, inplace=True)

result_df.validation_status.fillna(consts.VALIDATION_STATUS_FAIL, inplace=True)
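# (after the outer join, rows missing from one side have no computed
# validation_status; the fillna above defaults them to fail)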
return result_df


@@ -101,14 +99,18 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
if isinstance(datatype, ibis.expr.datatypes.Timestamp):
source_value = field_differences["differences_source_value"].epoch_seconds()
target_value = field_differences["differences_target_value"].epoch_seconds()
elif isinstance(datatype, ibis.expr.datatypes.Float64):
# Float64 type results from AVG() aggregation
source_value = field_differences["differences_source_value"].round(digits=4)
target_value = field_differences["differences_target_value"].round(digits=4)
else:
source_value = field_differences["differences_source_value"]
target_value = field_differences["differences_target_value"]

# Does not calculate difference between agg values for row hash due to int64 overflow
if is_value_comparison:
difference = pct_difference = ibis.null()
status = (
validation_status = (
ibis.case()
.when(target_value == source_value, consts.VALIDATION_STATUS_SUCCESS)
.else_(consts.VALIDATION_STATUS_FAIL)
@@ -137,8 +139,12 @@
)

th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
status = (
validation_status = (
ibis.case()
.when(
source_value.isnull() & target_value.isnull(),
consts.VALIDATION_STATUS_SUCCESS,
)
.when(th_diff.isnan() | (th_diff > 0.0), consts.VALIDATION_STATUS_FAIL)
.else_(consts.VALIDATION_STATUS_SUCCESS)
.end()
Expand All @@ -148,7 +154,7 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
difference.name("difference"),
pct_difference.name("pct_difference"),
pct_threshold.name("pct_threshold"),
status.name("status"),
validation_status.name("validation_status"),
)


@@ -282,7 +288,7 @@ def _join_pivots(source, target, differences, join_on_fields):
differences["difference"],
differences["pct_difference"],
differences["pct_threshold"],
differences["status"],
differences["validation_status"],
]
]
joined = source_difference.join(target, join_keys, how="outer")[
@@ -303,7 +309,7 @@ def _join_pivots(source, target, differences, join_on_fields):
source_difference["difference"],
source_difference["pct_difference"],
source_difference["pct_threshold"],
source_difference["status"],
source_difference["validation_status"],
]
return joined
