GoogleCloudPlatform · nehanene15 · Mar 2, 2022 · Feb 26, 2022 · Mar 2, 2022
@@ -89,7 +89,7 @@ def generate_report(
         print(documented.compile())
 
     result_df = client.execute(documented)
-    result_df.status.fillna("fail", inplace=True)
+    result_df.status.fillna(consts.VALIDATION_STATUS_FAIL, inplace=True)
 
     return result_df
 
@@ -109,8 +109,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
         difference = pct_difference = ibis.null()
         status = (
             ibis.case()
-            .when(target_value == source_value, "success")
-            .else_("fail")
+            .when(target_value == source_value, consts.VALIDATION_STATUS_SUCCESS)
+            .else_(consts.VALIDATION_STATUS_FAIL)
             .end()
         )
     else:
@@ -129,8 +129,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
         th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
         status = (
             ibis.case()
-            .when(th_diff.isnan() | (th_diff > 0.0), "fail")
-            .else_("success")
+            .when(th_diff.isnan() | (th_diff > 0.0), consts.VALIDATION_STATUS_FAIL)
+            .else_(consts.VALIDATION_STATUS_SUCCESS)
             .end()
         )
 

@@ -108,6 +108,8 @@
 TARGET_AGG_VALUE = "target_agg_value"
 
 VALIDATION_STATUS = "status"
+VALIDATION_STATUS_SUCCESS = "success"
+VALIDATION_STATUS_FAIL = "fail"
 
 # SQL Template Formatting
 # TODO: should this be managed in query_builder if that is the only place its used?

@@ -15,7 +15,7 @@
 import datetime
 import pandas
 
-from data_validation import metadata
+from data_validation import metadata, consts
 
 
 class SchemaValidation(object):
@@ -100,7 +100,7 @@ def schema_validation_matching(source_fields, target_fields):
                         source_field_name,
                         "1",
                         "1",
-                        "Pass",
+                        consts.VALIDATION_STATUS_SUCCESS,
                         "Source_type:{} Target_type:{}".format(
                             source_field_type, target_fields[source_field_name]
                         ),
@@ -114,7 +114,7 @@ def schema_validation_matching(source_fields, target_fields):
                         source_field_name,
                         "1",
                         "1",
-                        "Fail",
+                        consts.VALIDATION_STATUS_FAIL,
                         "Data type mismatch between source and target. Source_type:{} Target_type:{}".format(
                             source_field_type, target_fields[source_field_name]
                         ),
@@ -128,7 +128,7 @@ def schema_validation_matching(source_fields, target_fields):
                     "N/A",
                     "1",
                     "0",
-                    "Fail",
+                    consts.VALIDATION_STATUS_FAIL,
                     "Target doesn't have a matching field name",
                 ]
             )
@@ -142,7 +142,7 @@ def schema_validation_matching(source_fields, target_fields):
                     target_field_name,
                     "0",
                     "1",
-                    "Fail",
+                    consts.VALIDATION_STATUS_FAIL,
                     "Source doesn't have a matching field name",
                 ]
             )

@@ -254,7 +254,7 @@ def test_schema_validation():
     df = validator.execute()
 
     for validation in df.to_dict(orient="records"):
-        assert validation["status"] == "Pass"
+        assert validation["status"] == consts.VALIDATION_STATUS_SUCCESS
 
 
 def test_cli_store_yaml_then_run_gcs():

@@ -20,6 +20,8 @@
 import pandas
 import pandas.testing
 
+from data_validation import consts
+
 
 REPO_ROOT = pathlib.Path(__file__).parent.parent.parent.parent
 SCHEMA_PATH = REPO_ROOT / "terraform" / "results_schema.json"
@@ -134,7 +136,14 @@ def test_execute_with_nan(bigquery_client, bigquery_dataset_id):
             "difference": [-1.0, -1.0, _NAN, _NAN, _NAN, _NAN],
             "pct_difference": [-50.0, -25.0, _NAN, _NAN, _NAN, _NAN],
             "pct_threshold": [25.0, 25.0, _NAN, _NAN, _NAN, _NAN],
-            "status": ["fail", "success", _NAN, _NAN, _NAN, _NAN],
+            "status": [
+                consts.VALIDATION_STATUS_FAIL,
+                consts.VALIDATION_STATUS_SUCCESS,
+                _NAN,
+                _NAN,
+                _NAN,
+                _NAN,
+            ],
             "labels": [[{"key": "name", "value": "test_label"}]] * 6,
         }
     )

@@ -19,7 +19,7 @@
 import pandas.testing
 import pytest
 
-from data_validation import metadata
+from data_validation import metadata, consts
 
 
 _NAN = float("nan")
@@ -149,7 +149,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
                     "difference": [1.0],
                     "pct_difference": [100.0],
                     "pct_threshold": [0.0],
-                    "status": ["fail"],
+                    "status": [consts.VALIDATION_STATUS_FAIL],
                     "labels": [[("name", "test_label")]],
                 }
             ),
@@ -198,7 +198,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
                     "difference": [0.0],
                     "pct_difference": [0.0],
                     "pct_threshold": [0.0],
-                    "status": ["success"],
+                    "status": [consts.VALIDATION_STATUS_SUCCESS],
                     "labels": [[("name", "test_label")]],
                 }
             ),
@@ -255,7 +255,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
                     "difference": [400000000.0],
                     "pct_difference": [25.0],
                     "pct_threshold": [0.0],
-                    "status": ["fail"],
+                    "status": [consts.VALIDATION_STATUS_FAIL],
                     "labels": [[("name", "test_label")]],
                 }
             ),
@@ -317,7 +317,10 @@ def test_generate_report_with_too_many_rows(module_under_test):
                     "difference": [1.0, 2.0],
                     "pct_difference": [12.5, -200.0],
                     "pct_threshold": [30.0, 0.0],
-                    "status": ["success", "fail"],
+                    "status": [
+                        consts.VALIDATION_STATUS_SUCCESS,
+                        consts.VALIDATION_STATUS_FAIL,
+                    ],
                     "labels": [[("name", "test_label")]] * 2,
                 }
             ),
@@ -413,7 +416,12 @@ def test_generate_report_without_group_by(
                     "difference": [-1.0, -1.0, -1.0, 1.0],
                     "pct_difference": [-50.0, -25.0, -12.5, 6.25],
                     "pct_threshold": [7.0, 7.0, 7.0, 7.0],
-                    "status": ["fail", "fail", "fail", "success"],
+                    "status": [
+                        consts.VALIDATION_STATUS_FAIL,
+                        consts.VALIDATION_STATUS_FAIL,
+                        consts.VALIDATION_STATUS_FAIL,
+                        consts.VALIDATION_STATUS_SUCCESS,
+                    ],
                     "labels": [[("name", "group_label")]] * 4,
                 }
             ),
@@ -459,7 +467,10 @@ def test_generate_report_without_group_by(
                     "difference": [2.0, 2.0],
                     "pct_difference": [200.0, 100.0],
                     "pct_threshold": [100.0, 100.0],
-                    "status": ["fail", "success"],
+                    "status": [
+                        consts.VALIDATION_STATUS_FAIL,
+                        consts.VALIDATION_STATUS_SUCCESS,
+                    ],
                     "labels": [[("name", "group_label")]] * 2,
                 }
             ),
@@ -538,7 +549,14 @@ def test_generate_report_without_group_by(
                     "difference": [-1.0, -1.0, _NAN, _NAN, _NAN, _NAN],
                     "pct_difference": [-50.0, -25.0, _NAN, _NAN, _NAN, _NAN],
                     "pct_threshold": [25.0, 25.0, _NAN, _NAN, _NAN, _NAN],
-                    "status": ["fail", "success", "fail", "fail", "fail", "fail"],
+                    "status": [
+                        consts.VALIDATION_STATUS_FAIL,
+                        consts.VALIDATION_STATUS_SUCCESS,
+                        consts.VALIDATION_STATUS_FAIL,
+                        consts.VALIDATION_STATUS_FAIL,
+                        consts.VALIDATION_STATUS_FAIL,
+                        consts.VALIDATION_STATUS_FAIL,
+                    ],
                     "labels": [[("name", "group_label")]] * 6,
                 }
             ),
@@ -625,7 +643,7 @@ def test_generate_report_with_group_by(
                     "difference": [_NAN],
                     "pct_difference": [_NAN],
                     "pct_threshold": [0.0],
-                    "status": ["fail"],
+                    "status": [consts.VALIDATION_STATUS_FAIL],
                     "labels": [[("name", "test_label")]],
                 }
             ),
@@ -670,7 +688,7 @@ def test_generate_report_with_group_by(
                     "difference": [_NAN],
                     "pct_difference": [_NAN],
                     "pct_threshold": [0.0],
-                    "status": ["fail"],
+                    "status": [consts.VALIDATION_STATUS_FAIL],
                     "labels": [[("name", "test_label")]],
                 }
             ),
@@ -715,7 +733,7 @@ def test_generate_report_with_group_by(
                     "difference": [_NAN],
                     "pct_difference": [_NAN],
                     "pct_threshold": [0.0],
-                    "status": ["fail"],
+                    "status": [consts.VALIDATION_STATUS_FAIL],
                     "labels": [[("name", "test_label")]],
                 }
             ),
@@ -777,7 +795,10 @@ def test_generate_report_with_group_by(
                     "difference": [1.0, _NAN],
                     "pct_difference": [12.5, _NAN],
                     "pct_threshold": [30.0, 0.0],
-                    "status": ["success", "fail"],
+                    "status": [
+                        consts.VALIDATION_STATUS_SUCCESS,
+                        consts.VALIDATION_STATUS_FAIL,
+                    ],
                     "labels": [[("name", "test_label")]] * 2,
                 }
             ),

@@ -516,7 +516,7 @@ def test_status_success_validation(module_under_test, fs):
     col_a_status = col_a_result_df.status.values[0]
 
     assert col_a_pct_threshold == 0.0
-    assert col_a_status == "success"
+    assert col_a_status == consts.VALIDATION_STATUS_SUCCESS
 
 
 def test_status_fail_validation(module_under_test, fs):
@@ -530,7 +530,7 @@ def test_status_fail_validation(module_under_test, fs):
     col_a_status = col_a_result_df.status.values[0]
 
     assert col_a_pct_threshold == 0.0
-    assert col_a_status == "fail"
+    assert col_a_status == consts.VALIDATION_STATUS_FAIL
 
 
 def test_threshold_equals_diff(module_under_test, fs):
@@ -546,7 +546,7 @@ def test_threshold_equals_diff(module_under_test, fs):
 
     assert col_a_pct_diff == 150.0
     assert col_a_pct_threshold == 150.0
-    assert col_a_status == "success"
+    assert col_a_status == consts.VALIDATION_STATUS_SUCCESS
 
 
 def test_grouped_column_level_validation_perfect_match(module_under_test, fs):
@@ -674,7 +674,7 @@ def test_fail_row_level_validation(module_under_test, fs):
     result_df = client.execute()
 
     # based on shared keys
-    fail_df = result_df[result_df["status"] == "fail"]
+    fail_df = result_df[result_df["status"] == consts.VALIDATION_STATUS_FAIL]
     assert len(fail_df) == 5
 
 
@@ -691,7 +691,7 @@ def test_bad_join_row_level_validation(module_under_test, fs):
     client = module_under_test.DataValidation(SAMPLE_ROW_CONFIG)
     result_df = client.execute()
 
-    comparison_df = result_df[result_df["status"] == "fail"]
+    comparison_df = result_df[result_df["status"] == consts.VALIDATION_STATUS_FAIL]
     # 2 validations * (100 source + 1 target)
     assert len(result_df) == 202
     assert len(comparison_df) == 202
@@ -143,13 +143,20 @@ def test_schema_validation_matching(module_under_test):
     target_fields = {"field1": "string", "field2": "timestamp", "field_3": "string"}
 
     expected_results = [
-        ["field1", "field1", "1", "1", "Pass", "Source_type:string Target_type:string"],
+        [
+            "field1",
+            "field1",
+            "1",
+            "1",
+            consts.VALIDATION_STATUS_SUCCESS,
+            "Source_type:string Target_type:string",
+        ],
         [
             "field2",
             "field2",
             "1",
             "1",
-            "Fail",
+            consts.VALIDATION_STATUS_FAIL,
             "Data type mismatch between source and target. "
             "Source_type:datetime Target_type:timestamp",
         ],
@@ -158,15 +165,15 @@ def test_schema_validation_matching(module_under_test):
             "N/A",
             "1",
             "0",
-            "Fail",
+            consts.VALIDATION_STATUS_FAIL,
             "Target doesn't have a matching field name",
         ],
         [
             "N/A",
             "field_3",
             "0",
             "1",
-            "Fail",
+            consts.VALIDATION_STATUS_FAIL,
             "Source doesn't have a matching field name",
         ],
     ]
@@ -188,7 +195,9 @@ def test_execute(module_under_test, fs):
 
     dv_client = data_validation.DataValidation(SAMPLE_SCHEMA_CONFIG, verbose=True)
     result_df = dv_client.schema_validator.execute()
-    failures = result_df[result_df["status"].str.contains("Fail")]
+    failures = result_df[
+        result_df["status"].str.contains(consts.VALIDATION_STATUS_FAIL)
+    ]
 
     assert len(result_df) == len(source_data[0]) + 1
     assert result_df["source_agg_value"].astype(float).sum() == 7