From 28f983ff6aac9876010430bd82262b0bafe55cfb Mon Sep 17 00:00:00 2001 From: Yogesh Tewari Date: Fri, 30 Jul 2021 11:32:18 -0400 Subject: [PATCH] feat: Allow user to specify a format for stdout (#242) --- data_validation/__main__.py | 2 ++ data_validation/cli_tools.py | 6 ++++++ data_validation/config_manager.py | 7 ++++++ data_validation/consts.py | 1 + data_validation/data_validation.py | 5 ++++- data_validation/result_handlers/text.py | 25 ++++++++++++++++++++-- tests/system/data_sources/test_bigquery.py | 23 ++++++++++++++------ 7 files changed, 59 insertions(+), 10 deletions(-) diff --git a/data_validation/__main__.py b/data_validation/__main__.py index f2426f941..beaf0ecbf 100644 --- a/data_validation/__main__.py +++ b/data_validation/__main__.py @@ -146,6 +146,7 @@ def build_config_managers_from_args(args): result_handler_config=result_handler_config, filter_config=filter_config, verbose=args.verbose, + format=args.format ) configs.append(build_config_from_args(args, config_manager)) @@ -281,6 +282,7 @@ def run_validation(config_manager, verbose=False): """ validator = DataValidation( config_manager.config, + format=config_manager.format, validation_builder=None, result_handler=None, verbose=verbose, diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py index 84c5224e4..41e189e8f 100644 --- a/data_validation/cli_tools.py +++ b/data_validation/cli_tools.py @@ -291,6 +291,12 @@ def _configure_run_parser(subparsers): "-filters", help="Filters in the format source_filter:target_filter", ) + run_parser.add_argument( + "--format", + "-format", + default="table", + help="Set the format for printing command output", + ) def _configure_connection_parser(subparsers): diff --git a/data_validation/config_manager.py b/data_validation/config_manager.py index cc37fee12..7adae8c5e 100644 --- a/data_validation/config_manager.py +++ b/data_validation/config_manager.py @@ -188,6 +188,11 @@ def threshold(self): """Return threshold from Config """ return 
self._config.get(consts.CONFIG_THRESHOLD, 0.0) + @property + def format(self): + """Return format from Config """ + return self._config.get(consts.CONFIG_FORMAT, "table") + def get_source_ibis_table(self): """Return IbisTable from source.""" if not hasattr(self, "_source_ibis_table"): @@ -269,6 +274,7 @@ def build_config_manager( table_obj, labels, threshold, + format, result_handler_config=None, filter_config=None, verbose=False, @@ -289,6 +295,7 @@ def build_config_manager( consts.CONFIG_THRESHOLD: threshold, consts.CONFIG_RESULT_HANDLER: result_handler_config, consts.CONFIG_FILTERS: filter_config, + consts.CONFIG_FORMAT: format, } # Only FileSystem connections do not require schemas diff --git a/data_validation/consts.py b/data_validation/consts.py index 6b55eafbb..0b7daa019 100644 --- a/data_validation/consts.py +++ b/data_validation/consts.py @@ -34,6 +34,7 @@ CONFIG_SOURCE_COLUMN = "source_column" CONFIG_TARGET_COLUMN = "target_column" CONFIG_THRESHOLD = "threshold" +CONFIG_FORMAT = "format" CONFIG_CAST = "cast" CONFIG_LIMIT = "limit" CONFIG_FILTERS = "filters" diff --git a/data_validation/data_validation.py b/data_validation/data_validation.py index 2675dfdc0..105ec80c6 100644 --- a/data_validation/data_validation.py +++ b/data_validation/data_validation.py @@ -39,6 +39,7 @@ class DataValidation(object): def __init__( self, config, + format, validation_builder=None, schema_validator=None, result_handler=None, @@ -58,6 +59,8 @@ def __init__( # Data Client Management self.config = config + self.format = format + self.source_client = clients.get_data_client( self.config[consts.CONFIG_SOURCE_CONN] ) @@ -102,7 +105,7 @@ def execute(self): ) # Call Result Handler to Manage Results - return self.result_handler.execute(self.config, result_df) + return self.result_handler.execute(self.config, self.format, result_df) def query_too_large(self, rows_df, grouped_fields): """ Return bool to dictate if another level of recursion diff --git 
a/data_validation/result_handlers/text.py b/data_validation/result_handlers/text.py index 58cbb8c76..6e28d7e8a 100644 --- a/data_validation/result_handlers/text.py +++ b/data_validation/result_handlers/text.py @@ -23,8 +23,29 @@ """ -class TextResultHandler(object): - def execute(self, config, result_df): +def print_formatted_(format, result_df): + """ + Utility for printing formatted results + :param result_df + :param format + """ + if format == "text": print(result_df.to_string(index=False)) + elif format == "csv": + print(result_df.to_csv(index=False)) + elif format == "json": + print(result_df.to_json(orient="index")) + elif format == "table": + print(result_df.to_markdown(tablefmt="fancy_grid")) + else: + error_msg = f"format [{format}] not supported, results printed in default(table) mode. " \ + f"Supported formats are [text, csv, json, table]" + print(result_df.to_markdown(tablefmt="fancy_grid")) + raise ValueError(error_msg) + + +class TextResultHandler(object): + def execute(self, config, format, result_df): + print_formatted_(format, result_df) return result_df diff --git a/tests/system/data_sources/test_bigquery.py b/tests/system/data_sources/test_bigquery.py index d9e980b73..b90c5a52b 100644 --- a/tests/system/data_sources/test_bigquery.py +++ b/tests/system/data_sources/test_bigquery.py @@ -14,10 +14,11 @@ import os +import pytest + from data_validation import cli_tools, consts, data_validation from data_validation import __main__ as main - BQ_CONN = {"source_type": "BigQuery", "project_id": os.environ["PROJECT_ID"]} CONFIG_COUNT_VALID = { # BigQuery Specific Connection Name @@ -178,7 +179,7 @@ def test_count_validator(): - validator = data_validation.DataValidation(CONFIG_COUNT_VALID, verbose=True) + validator = data_validation.DataValidation(CONFIG_COUNT_VALID, format="text", verbose=True) df = validator.execute() count_value = df[df["validation_name"] == "count"]["source_agg_value"].values[0] @@ -201,13 +202,13 @@ def test_count_validator(): assert 
float(max_birth_year_value) > 0 assert float(min_birth_year_value) > 0 assert ( - df["source_agg_value"].astype(float).sum() - == df["target_agg_value"].astype(float).sum() + df["source_agg_value"].astype(float).sum() + == df["target_agg_value"].astype(float).sum() ) def test_grouped_count_validator(): - validator = data_validation.DataValidation(CONFIG_GROUPED_COUNT_VALID, verbose=True) + validator = data_validation.DataValidation(CONFIG_GROUPED_COUNT_VALID, format="csv", verbose=True) df = validator.execute() rows = list(df[df["validation_name"] == "count"].iterrows()) @@ -223,7 +224,7 @@ def test_grouped_count_validator(): def test_numeric_types(): - validator = data_validation.DataValidation(CONFIG_NUMERIC_AGG_VALID, verbose=True) + validator = data_validation.DataValidation(CONFIG_NUMERIC_AGG_VALID, format="json", verbose=True) df = validator.execute() for validation in df.to_dict(orient="records"): @@ -246,7 +247,7 @@ def test_cli_store_yaml_then_run(): # The number of lines is not significant, except that it represents # the exact file expected to be created. Any change to this value # is likely to be a breaking change and must be assessed. - assert len(yaml_file.readlines()) == 32 + assert len(yaml_file.readlines()) == 33 # Run generated config run_config_args = parser.parse_args(CLI_RUN_CONFIG_ARGS) @@ -278,3 +279,11 @@ def _store_bq_conn(): def _remove_bq_conn(): file_path = cli_tools._get_connection_file(BQ_CONN_NAME) os.remove(file_path) + + +def test_unsupported_result_format(): + with pytest.raises(ValueError) as exp: + validator = data_validation.DataValidation(CONFIG_GROUPED_COUNT_VALID, format="foobar", verbose=True) + df = validator.execute() + rows = list(df[df["validation_name"] == "count"].iterrows()) + assert len(rows) > 1