From acaa0f5554d79e31e5bb1d1e3ef3e8fca96f6360 Mon Sep 17 00:00:00 2001
From: AJ <95496513+ajwelch4@users.noreply.github.com>
Date: Tue, 1 Mar 2022 10:36:49 -0500
Subject: [PATCH] fix: use an appropriate column filter list for schema validation (#350) (#371)

---
 data_validation/__main__.py                | 11 +++++++++--
 data_validation/config_manager.py          |  8 +++++++-
 data_validation/consts.py                  |  8 ++++++++
 tests/system/data_sources/test_bigquery.py | 20 ++++++++++++++++++++
 4 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/data_validation/__main__.py b/data_validation/__main__.py
index 35c5758ae..c93d09322 100644
--- a/data_validation/__main__.py
+++ b/data_validation/__main__.py
@@ -195,6 +195,13 @@ def build_config_managers_from_args(args):

     format = args.format if args.format else "table"

+    use_random_rows = (
+        None if config_type == consts.SCHEMA_VALIDATION else args.use_random_row
+    )
+    random_row_batch_size = (
+        None if config_type == consts.SCHEMA_VALIDATION else args.random_row_batch_size
+    )
+
     is_filesystem = source_client._source_type == "FileSystem"
     tables_list = cli_tools.get_tables_list(
         args.tables_list, default_value=[], is_filesystem=is_filesystem
@@ -209,8 +216,8 @@ def build_config_managers_from_args(args):
             labels,
             threshold,
             format,
-            use_random_rows=args.use_random_row,
-            random_row_batch_size=args.random_row_batch_size,
+            use_random_rows=use_random_rows,
+            random_row_batch_size=random_row_batch_size,
             source_client=source_client,
             target_client=target_client,
             result_handler_config=result_handler_config,
diff --git a/data_validation/config_manager.py b/data_validation/config_manager.py
index 9969bd6e5..c384672e5 100644
--- a/data_validation/config_manager.py
+++ b/data_validation/config_manager.py
@@ -283,7 +283,13 @@ def get_yaml_validation_block(self):
     def get_result_handler(self):
         """Return ResultHandler instance from supplied config."""
         if not self.result_handler_config:
-            return TextResultHandler(self._config.get(consts.CONFIG_FORMAT, "table"))
+            if self.config[consts.CONFIG_TYPE] == consts.SCHEMA_VALIDATION:
+                cols_filter_list = consts.SCHEMA_VALIDATION_COLUMN_FILTER_LIST
+            else:
+                cols_filter_list = consts.COLUMN_FILTER_LIST
+            return TextResultHandler(
+                self._config.get(consts.CONFIG_FORMAT, "table"), cols_filter_list
+            )

         result_type = self.result_handler_config[consts.CONFIG_TYPE]
         if result_type == "BigQuery":
diff --git a/data_validation/consts.py b/data_validation/consts.py
index 021b67d83..1d98561cc 100644
--- a/data_validation/consts.py
+++ b/data_validation/consts.py
@@ -140,3 +140,11 @@
     "run_id",
     "start_time",
 ]
+SCHEMA_VALIDATION_COLUMN_FILTER_LIST = [
+    "run_id",
+    "start_time",
+    "end_time",
+    "aggregation_type",
+    "source_agg_value",
+    "target_agg_value",
+]
diff --git a/tests/system/data_sources/test_bigquery.py b/tests/system/data_sources/test_bigquery.py
index 68a37bc7a..215020bde 100644
--- a/tests/system/data_sources/test_bigquery.py
+++ b/tests/system/data_sources/test_bigquery.py
@@ -133,6 +133,18 @@
     consts.CONFIG_FORMAT: "table",
 }

+CONFIG_SCHEMA_VALIDATION = {
+    # BigQuery Specific Connection Config
+    consts.CONFIG_SOURCE_CONN: BQ_CONN,
+    consts.CONFIG_TARGET_CONN: BQ_CONN,
+    # Validation Type
+    consts.CONFIG_TYPE: "Schema",
+    # Configuration Required Depending on Validator Type
+    consts.CONFIG_SCHEMA_NAME: "bigquery-public-data.new_york_citibike",
+    consts.CONFIG_TABLE_NAME: "citibike_trips",
+    consts.CONFIG_FORMAT: "table",
+}
+
 BQ_CONN_NAME = "bq-integration-test"
 CLI_CONFIG_FILE = "example_test.yaml"

@@ -237,6 +249,14 @@ def test_numeric_types():
     )


+def test_schema_validation():
+    validator = data_validation.DataValidation(CONFIG_SCHEMA_VALIDATION, verbose=True)
+    df = validator.execute()
+
+    for validation in df.to_dict(orient="records"):
+        assert validation["status"] == "Pass"
+
+
 def test_cli_store_yaml_then_run_gcs():
     """Test storing and retrieving validation YAML when GCS env var is set."""
     # Store BQ Connection