feat: Allow user to specify a format for stdout (#242) #293

Merged · 14 commits · Aug 3, 2021
2 changes: 2 additions & 0 deletions README.md
@@ -110,6 +110,8 @@ data-validation run
--labels or -l KEY1=VALUE1,KEY2=VALUE2
(Optional) Comma-separated key value pair labels for the run.
--verbose or -v Verbose logging will print queries executed
--format or -fmt Format for stdout output. Supported formats are text, csv, json, and table.
Defaults to table.
```

The default aggregation type is a 'COUNT *'. If no aggregation flag (i.e count,
2 changes: 2 additions & 0 deletions data_validation/__main__.py
@@ -146,6 +146,7 @@ def build_config_managers_from_args(args):
result_handler_config=result_handler_config,
filter_config=filter_config,
verbose=args.verbose,
format=args.format,
)
configs.append(build_config_from_args(args, config_manager))

@@ -281,6 +282,7 @@ def run_validation(config_manager, verbose=False):
"""
validator = DataValidation(
config_manager.config,
format=config_manager.format,
validation_builder=None,
result_handler=None,
verbose=verbose,
6 changes: 6 additions & 0 deletions data_validation/cli_tools.py
@@ -291,6 +291,12 @@ def _configure_run_parser(subparsers):
"-filters",
help="Filters in the format source_filter:target_filter",
)
run_parser.add_argument(
"--format",
"-fmt",
default="table",
help="Set the format for printing command output. Supported formats are text, csv, json, and table",
)


def _configure_connection_parser(subparsers):
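For illustration, here is a minimal standalone argparse sketch of the new flag. The real option is registered in `cli_tools._configure_run_parser`; the parser object and sample invocations below are hypothetical, not part of this PR:

```python
import argparse

# Hypothetical standalone parser mirroring the run-parser option added above.
parser = argparse.ArgumentParser(prog="data-validation run")
parser.add_argument(
    "--format",
    "-fmt",
    default="table",
    help="Set the format for printing command output. Supported formats are text, csv, json, and table",
)

print(parser.parse_args([]).format)                    # "table" (default)
print(parser.parse_args(["--format", "csv"]).format)   # "csv"
print(parser.parse_args(["-fmt", "json"]).format)      # "json"
```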
7 changes: 7 additions & 0 deletions data_validation/config_manager.py
@@ -188,6 +188,11 @@ def threshold(self):
"""Return threshold from Config """
return self._config.get(consts.CONFIG_THRESHOLD, 0.0)

@property
def format(self):
"""Return format from Config"""
return self._config.get(consts.CONFIG_FORMAT, "table")

def get_source_ibis_table(self):
"""Return IbisTable from source."""
if not hasattr(self, "_source_ibis_table"):
@@ -269,6 +274,7 @@ def build_config_manager(
table_obj,
labels,
threshold,
format,
result_handler_config=None,
filter_config=None,
verbose=False,
@@ -289,6 +295,7 @@
consts.CONFIG_THRESHOLD: threshold,
consts.CONFIG_RESULT_HANDLER: result_handler_config,
consts.CONFIG_FILTERS: filter_config,
consts.CONFIG_FORMAT: format,
}

# Only FileSystem connections do not require schemas
1 change: 1 addition & 0 deletions data_validation/consts.py
@@ -34,6 +34,7 @@
CONFIG_SOURCE_COLUMN = "source_column"
CONFIG_TARGET_COLUMN = "target_column"
CONFIG_THRESHOLD = "threshold"
CONFIG_FORMAT = "format"
CONFIG_CAST = "cast"
CONFIG_LIMIT = "limit"
CONFIG_FILTERS = "filters"
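Together with the `ConfigManager.format` property above, the lookup simply falls back to `"table"` when `CONFIG_FORMAT` is absent from the validation config. A tiny sketch of that behaviour, using plain dicts in place of a real `ConfigManager` (for illustration only):

```python
from data_validation import consts

# Stand-in config dicts; a real run builds these via build_config_manager.
config_with_format = {consts.CONFIG_FORMAT: "csv"}
config_without_format = {}

print(config_with_format.get(consts.CONFIG_FORMAT, "table"))     # csv
print(config_without_format.get(consts.CONFIG_FORMAT, "table"))  # table
```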
5 changes: 4 additions & 1 deletion data_validation/data_validation.py
@@ -39,6 +39,7 @@ class DataValidation(object):
def __init__(
self,
config,
format="table",
validation_builder=None,
schema_validator=None,
result_handler=None,
@@ -58,6 +59,8 @@ def __init__(
# Data Client Management
self.config = config

self.format = format

self.source_client = clients.get_data_client(
self.config[consts.CONFIG_SOURCE_CONN]
)
@@ -102,7 +105,7 @@ def execute(self):
)

# Call Result Handler to Manage Results
return self.result_handler.execute(self.config, result_df)
return self.result_handler.execute(self.config, self.format, result_df)

def query_too_large(self, rows_df, grouped_fields):
""" Return bool to dictate if another level of recursion
27 changes: 25 additions & 2 deletions data_validation/result_handlers/text.py
@@ -23,8 +23,31 @@
"""


class TextResultHandler(object):
def execute(self, config, result_df):
def print_formatted_(format, result_df):
"""
Utility for printing formatted results to stdout
:param format: output format (text, csv, json, or table)
:param result_df: validation results as a pandas DataFrame
"""
if format == "text":
print(result_df.to_string(index=False))
elif format == "csv":
print(result_df.to_csv(index=False))
elif format == "json":
print(result_df.to_json(orient="index"))
elif format == "table":
print(result_df.to_markdown(tablefmt="fancy_grid"))
else:
error_msg = (
f"format [{format}] not supported; results printed in default (table) mode. "
f"Supported formats are [text, csv, json, table]"
)
print(result_df.to_markdown(tablefmt="fancy_grid"))
raise ValueError(error_msg)


class TextResultHandler(object):
def execute(self, config, format, result_df):
print_formatted_(format, result_df)

return result_df
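A rough usage sketch of the updated handler (assumes the package plus its pandas and tabulate dependencies are installed; the sample DataFrame below is made up for illustration):

```python
import pandas as pd

from data_validation.result_handlers.text import TextResultHandler

result_df = pd.DataFrame(
    {"validation_name": ["count", "count"], "source_agg_value": [10, 12]}
)

handler = TextResultHandler()
handler.execute({}, "text", result_df)   # plain to_string output without the index
handler.execute({}, "csv", result_df)    # comma-separated rows
handler.execute({}, "json", result_df)   # JSON keyed by row index
handler.execute({}, "table", result_df)  # fancy_grid table (requires tabulate)
```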
1 change: 1 addition & 0 deletions setup.py
@@ -48,6 +48,7 @@
"google-cloud-spanner==3.1.0",
"setuptools>=34.0.0",
"jellyfish==0.8.2",
"tabulate==0.8.9",
]

extras_require = {
29 changes: 23 additions & 6 deletions tests/system/data_sources/test_bigquery.py
@@ -14,9 +14,10 @@

import os

from data_validation import cli_tools, consts, data_validation
from data_validation import __main__ as main
import pytest

from data_validation import __main__ as main
from data_validation import cli_tools, consts, data_validation

BQ_CONN = {"source_type": "BigQuery", "project_id": os.environ["PROJECT_ID"]}
CONFIG_COUNT_VALID = {
@@ -178,7 +179,9 @@


def test_count_validator():
validator = data_validation.DataValidation(CONFIG_COUNT_VALID, verbose=True)
validator = data_validation.DataValidation(
CONFIG_COUNT_VALID, format="text", verbose=True
)
df = validator.execute()

count_value = df[df["validation_name"] == "count"]["source_agg_value"].values[0]
@@ -207,7 +210,9 @@ def test_count_validator():


def test_grouped_count_validator():
validator = data_validation.DataValidation(CONFIG_GROUPED_COUNT_VALID, verbose=True)
validator = data_validation.DataValidation(
CONFIG_GROUPED_COUNT_VALID, format="csv", verbose=True
)
df = validator.execute()
rows = list(df[df["validation_name"] == "count"].iterrows())

@@ -223,7 +228,9 @@ def test_grouped_count_validator():


def test_numeric_types():
validator = data_validation.DataValidation(CONFIG_NUMERIC_AGG_VALID, verbose=True)
validator = data_validation.DataValidation(
CONFIG_NUMERIC_AGG_VALID, format="json", verbose=True
)
df = validator.execute()

for validation in df.to_dict(orient="records"):
@@ -246,7 +253,7 @@ def test_numeric_types():
# The number of lines is not significant, except that it represents
# the exact file expected to be created. Any change to this value
# is likely to be a breaking change and must be assessed.
assert len(yaml_file.readlines()) == 32
assert len(yaml_file.readlines()) == 33

# Run generated config
run_config_args = parser.parse_args(CLI_RUN_CONFIG_ARGS)
@@ -278,3 +285,13 @@ def _store_bq_conn():
def _remove_bq_conn():
file_path = cli_tools._get_connection_file(BQ_CONN_NAME)
os.remove(file_path)


def test_unsupported_result_format():
with pytest.raises(ValueError):
validator = data_validation.DataValidation(
CONFIG_GROUPED_COUNT_VALID, format="foobar", verbose=True
)
df = validator.execute()
rows = list(df[df["validation_name"] == "count"].iterrows())
assert len(rows) > 1
3 changes: 2 additions & 1 deletion tests/unit/result_handlers/test_text.py
@@ -38,8 +38,9 @@ def test_import(module_under_test):

def test_basic_result_handler(module_under_test):
"""Test basic handler executes """
format = "json"
result_df = DataFrame(SAMPLE_RESULT_DATA)
result_handler = module_under_test.TextResultHandler()

handler_output = result_handler.execute(SAMPLE_CONFIG, result_df)
handler_output = result_handler.execute(SAMPLE_CONFIG, format, result_df)
assert handler_output["count"].sum() == result_df["count"].sum()
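Not part of this PR, but a hypothetical unit test could also exercise the unsupported-format branch of `print_formatted_` directly; a sketch:

```python
import pandas as pd
import pytest

from data_validation.result_handlers import text


def test_unsupported_format_raises():
    result_df = pd.DataFrame({"count": [1, 2]})
    # print_formatted_ falls back to printing the default table, then raises.
    with pytest.raises(ValueError):
        text.print_formatted_("foobar", result_df)
```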