From acaa0f5554d79e31e5bb1d1e3ef3e8fca96f6360 Mon Sep 17 00:00:00 2001
From: AJ <95496513+ajwelch4@users.noreply.github.com>
Date: Tue, 1 Mar 2022 10:36:49 -0500
Subject: [PATCH] fix: use an appropriate column filter list for schema
 validation (#350) (#371)

---
 data_validation/__main__.py                | 11 +++++++++--
 data_validation/config_manager.py          |  8 +++++++-
 data_validation/consts.py                  |  8 ++++++++
 tests/system/data_sources/test_bigquery.py | 20 ++++++++++++++++++++
 4 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/data_validation/__main__.py b/data_validation/__main__.py
index 35c5758ae..c93d09322 100644
--- a/data_validation/__main__.py
+++ b/data_validation/__main__.py
@@ -195,6 +195,13 @@ def build_config_managers_from_args(args):
 
     format = args.format if args.format else "table"
 
+    use_random_rows = (
+        None if config_type == consts.SCHEMA_VALIDATION else args.use_random_row
+    )
+    random_row_batch_size = (
+        None if config_type == consts.SCHEMA_VALIDATION else args.random_row_batch_size
+    )
+
     is_filesystem = source_client._source_type == "FileSystem"
     tables_list = cli_tools.get_tables_list(
         args.tables_list, default_value=[], is_filesystem=is_filesystem
@@ -209,8 +216,8 @@ def build_config_managers_from_args(args):
             labels,
             threshold,
             format,
-            use_random_rows=args.use_random_row,
-            random_row_batch_size=args.random_row_batch_size,
+            use_random_rows=use_random_rows,
+            random_row_batch_size=random_row_batch_size,
             source_client=source_client,
             target_client=target_client,
             result_handler_config=result_handler_config,
diff --git a/data_validation/config_manager.py b/data_validation/config_manager.py
index 9969bd6e5..c384672e5 100644
--- a/data_validation/config_manager.py
+++ b/data_validation/config_manager.py
@@ -283,7 +283,13 @@ def get_yaml_validation_block(self):
     def get_result_handler(self):
         """Return ResultHandler instance from supplied config."""
         if not self.result_handler_config:
-            return TextResultHandler(self._config.get(consts.CONFIG_FORMAT, "table"))
+            if self.config[consts.CONFIG_TYPE] == consts.SCHEMA_VALIDATION:
+                cols_filter_list = consts.SCHEMA_VALIDATION_COLUMN_FILTER_LIST
+            else:
+                cols_filter_list = consts.COLUMN_FILTER_LIST
+            return TextResultHandler(
+                self._config.get(consts.CONFIG_FORMAT, "table"), cols_filter_list
+            )
 
         result_type = self.result_handler_config[consts.CONFIG_TYPE]
         if result_type == "BigQuery":
diff --git a/data_validation/consts.py b/data_validation/consts.py
index 021b67d83..1d98561cc 100644
--- a/data_validation/consts.py
+++ b/data_validation/consts.py
@@ -140,3 +140,11 @@
     "run_id",
     "start_time",
 ]
+SCHEMA_VALIDATION_COLUMN_FILTER_LIST = [  # passed to TextResultHandler for schema runs
+    "run_id",
+    "start_time",
+    "end_time",
+    "aggregation_type",
+    "source_agg_value",
+    "target_agg_value",
+]
diff --git a/tests/system/data_sources/test_bigquery.py b/tests/system/data_sources/test_bigquery.py
index 68a37bc7a..215020bde 100644
--- a/tests/system/data_sources/test_bigquery.py
+++ b/tests/system/data_sources/test_bigquery.py
@@ -133,6 +133,18 @@
     consts.CONFIG_FORMAT: "table",
 }
 
+CONFIG_SCHEMA_VALIDATION = {
+    # BigQuery Specific Connection Config (same connection for source and target)
+    consts.CONFIG_SOURCE_CONN: BQ_CONN,
+    consts.CONFIG_TARGET_CONN: BQ_CONN,
+    # Validation Type (presumably the value of consts.SCHEMA_VALIDATION; TODO confirm)
+    consts.CONFIG_TYPE: "Schema",
+    # Configuration Required Depending on Validator Type
+    consts.CONFIG_SCHEMA_NAME: "bigquery-public-data.new_york_citibike",
+    consts.CONFIG_TABLE_NAME: "citibike_trips",
+    consts.CONFIG_FORMAT: "table",
+}
+
 BQ_CONN_NAME = "bq-integration-test"
 CLI_CONFIG_FILE = "example_test.yaml"
 
@@ -237,6 +249,14 @@ def test_numeric_types():
         )
 
 
+def test_schema_validation():
+    """Schema validation with BigQuery as source and target passes on every row."""
+    validator = data_validation.DataValidation(CONFIG_SCHEMA_VALIDATION, verbose=True)
+    df = validator.execute()
+    # Set comparison also fails on an empty result, which a per-row loop would miss.
+    assert set(df["status"]) == {"Pass"}
+
+
 def test_cli_store_yaml_then_run_gcs():
     """Test storing and retrieving validation YAML when GCS env var is set."""
     # Store BQ Connection