From 18d83be0da6f9c549f6db73f29694aa441a99310 Mon Sep 17 00:00:00 2001 From: "A.J. Welch" Date: Fri, 18 Feb 2022 04:22:31 +0000 Subject: [PATCH] fix: use an appropriate column filter list for schema validation (#350) --- data_validation/__main__.py | 11 +++++++++-- data_validation/config_manager.py | 8 +++++++- data_validation/consts.py | 8 ++++++++ tests/system/data_sources/test_bigquery.py | 20 ++++++++++++++++++++ 4 files changed, 44 insertions(+), 3 deletions(-) diff --git a/data_validation/__main__.py b/data_validation/__main__.py index ffe933457..23c8509f5 100644 --- a/data_validation/__main__.py +++ b/data_validation/__main__.py @@ -153,6 +153,13 @@ def build_config_managers_from_args(args): format = args.format if args.format else "table" + use_random_rows = ( + None if config_type == consts.SCHEMA_VALIDATION else args.use_random_row + ) + random_row_batch_size = ( + None if config_type == consts.SCHEMA_VALIDATION else args.random_row_batch_size + ) + is_filesystem = source_client._source_type == "FileSystem" tables_list = cli_tools.get_tables_list( args.tables_list, default_value=[], is_filesystem=is_filesystem @@ -167,8 +174,8 @@ def build_config_managers_from_args(args): labels, threshold, format, - use_random_rows=args.use_random_row, - random_row_batch_size=args.random_row_batch_size, + use_random_rows=use_random_rows, + random_row_batch_size=random_row_batch_size, source_client=source_client, target_client=target_client, result_handler_config=result_handler_config, diff --git a/data_validation/config_manager.py b/data_validation/config_manager.py index d63ecbfe9..a8359c648 100644 --- a/data_validation/config_manager.py +++ b/data_validation/config_manager.py @@ -277,7 +277,13 @@ def get_yaml_validation_block(self): def get_result_handler(self): """Return ResultHandler instance from supplied config.""" if not self.result_handler_config: - return TextResultHandler(self._config.get(consts.CONFIG_FORMAT, "table")) + if self.config[consts.CONFIG_TYPE] == consts.SCHEMA_VALIDATION: + cols_filter_list = consts.SCHEMA_VALIDATION_COLUMN_FILTER_LIST + else: + cols_filter_list = consts.COLUMN_FILTER_LIST + return TextResultHandler( + self._config.get(consts.CONFIG_FORMAT, "table"), cols_filter_list + ) result_type = self.result_handler_config[consts.CONFIG_TYPE] if result_type == "BigQuery": diff --git a/data_validation/consts.py b/data_validation/consts.py index 6766992e8..c58e8f38a 100644 --- a/data_validation/consts.py +++ b/data_validation/consts.py @@ -129,3 +129,11 @@ "run_id", "start_time", ] +SCHEMA_VALIDATION_COLUMN_FILTER_LIST = [ + "run_id", + "start_time", + "end_time", + "aggregation_type", + "source_agg_value", + "target_agg_value", +] diff --git a/tests/system/data_sources/test_bigquery.py b/tests/system/data_sources/test_bigquery.py index 5b7d71ea6..04ef4a949 100644 --- a/tests/system/data_sources/test_bigquery.py +++ b/tests/system/data_sources/test_bigquery.py @@ -134,6 +134,18 @@ consts.CONFIG_FORMAT: "table", } +CONFIG_SCHEMA_VALIDATION = { + # BigQuery Specific Connection Config + consts.CONFIG_SOURCE_CONN: BQ_CONN, + consts.CONFIG_TARGET_CONN: BQ_CONN, + # Validation Type + consts.CONFIG_TYPE: "Schema", + # Configuration Required Depending on Validator Type + consts.CONFIG_SCHEMA_NAME: "bigquery-public-data.new_york_citibike", + consts.CONFIG_TABLE_NAME: "citibike_trips", + consts.CONFIG_FORMAT: "table", +} + BQ_CONN_NAME = "bq-integration-test" CLI_CONFIG_FILE = "example_test.yaml" @@ -237,6 +249,14 @@ def test_numeric_types(): ) +def test_schema_validation(): + validator = data_validation.DataValidation(CONFIG_SCHEMA_VALIDATION, verbose=True) + df = validator.execute() + + for validation in df.to_dict(orient="records"): + assert validation["status"] == "Pass" + + def test_cli_store_yaml_then_run(): # Store BQ Connection _store_bq_conn()