diff --git a/data_validation/combiner.py b/data_validation/combiner.py index bd446621d..44710dd9b 100644 --- a/data_validation/combiner.py +++ b/data_validation/combiner.py @@ -89,7 +89,7 @@ def generate_report( print(documented.compile()) result_df = client.execute(documented) - result_df.status.fillna("fail", inplace=True) + result_df.status.fillna(consts.VALIDATION_STATUS_FAIL, inplace=True) return result_df @@ -109,8 +109,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp difference = pct_difference = ibis.null() status = ( ibis.case() - .when(target_value == source_value, "success") - .else_("fail") + .when(target_value == source_value, consts.VALIDATION_STATUS_SUCCESS) + .else_(consts.VALIDATION_STATUS_FAIL) .end() ) else: @@ -129,8 +129,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp th_diff = (pct_difference.abs() - pct_threshold).cast("float64") status = ( ibis.case() - .when(th_diff.isnan() | (th_diff > 0.0), "fail") - .else_("success") + .when(th_diff.isnan() | (th_diff > 0.0), consts.VALIDATION_STATUS_FAIL) + .else_(consts.VALIDATION_STATUS_SUCCESS) .end() ) diff --git a/data_validation/consts.py b/data_validation/consts.py index 43a436c44..2dc811e3d 100644 --- a/data_validation/consts.py +++ b/data_validation/consts.py @@ -108,6 +108,8 @@ TARGET_AGG_VALUE = "target_agg_value" VALIDATION_STATUS = "status" +VALIDATION_STATUS_SUCCESS = "success" +VALIDATION_STATUS_FAIL = "fail" # SQL Template Formatting # TODO: should this be managed in query_builder if that is the only place its used? diff --git a/data_validation/schema_validation.py b/data_validation/schema_validation.py index cdb60087e..7596ab65a 100644 --- a/data_validation/schema_validation.py +++ b/data_validation/schema_validation.py @@ -15,7 +15,7 @@ import datetime import pandas -from data_validation import metadata +from data_validation import metadata, consts class SchemaValidation(object): @@ -100,7 +100,7 @@ def schema_validation_matching(source_fields, target_fields): source_field_name, "1", "1", - "Pass", + consts.VALIDATION_STATUS_SUCCESS, "Source_type:{} Target_type:{}".format( source_field_type, target_fields[source_field_name] ), @@ -114,7 +114,7 @@ def schema_validation_matching(source_fields, target_fields): source_field_name, "1", "1", - "Fail", + consts.VALIDATION_STATUS_FAIL, "Data type mismatch between source and target. Source_type:{} Target_type:{}".format( source_field_type, target_fields[source_field_name] ), @@ -128,7 +128,7 @@ def schema_validation_matching(source_fields, target_fields): "N/A", "1", "0", - "Fail", + consts.VALIDATION_STATUS_FAIL, "Target doesn't have a matching field name", ] ) @@ -142,7 +142,7 @@ def schema_validation_matching(source_fields, target_fields): target_field_name, "0", "1", - "Fail", + consts.VALIDATION_STATUS_FAIL, "Source doesn't have a matching field name", ] ) diff --git a/tests/system/data_sources/test_bigquery.py b/tests/system/data_sources/test_bigquery.py index 215020bde..094ff1161 100644 --- a/tests/system/data_sources/test_bigquery.py +++ b/tests/system/data_sources/test_bigquery.py @@ -254,7 +254,7 @@ def test_schema_validation(): df = validator.execute() for validation in df.to_dict(orient="records"): - assert validation["status"] == "Pass" + assert validation["status"] == consts.VALIDATION_STATUS_SUCCESS def test_cli_store_yaml_then_run_gcs(): diff --git a/tests/system/result_handlers/test_bigquery.py b/tests/system/result_handlers/test_bigquery.py index fa02bd5f8..6b023c6c8 100644 --- a/tests/system/result_handlers/test_bigquery.py +++ b/tests/system/result_handlers/test_bigquery.py @@ -20,6 +20,8 @@ import pandas import pandas.testing +from data_validation import consts + REPO_ROOT = pathlib.Path(__file__).parent.parent.parent.parent SCHEMA_PATH = REPO_ROOT / "terraform" / "results_schema.json" @@ -134,7 +136,14 @@ def test_execute_with_nan(bigquery_client, bigquery_dataset_id): "difference": [-1.0, -1.0, _NAN, _NAN, _NAN, _NAN], "pct_difference": [-50.0, -25.0, _NAN, _NAN, _NAN, _NAN], "pct_threshold": [25.0, 25.0, _NAN, _NAN, _NAN, _NAN], - "status": ["fail", "success", _NAN, _NAN, _NAN, _NAN], + "status": [ + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_SUCCESS, + _NAN, + _NAN, + _NAN, + _NAN, + ], "labels": [[{"key": "name", "value": "test_label"}]] * 6, } ) diff --git a/tests/unit/test_combiner.py b/tests/unit/test_combiner.py index fc57e30bf..6d4d84076 100644 --- a/tests/unit/test_combiner.py +++ b/tests/unit/test_combiner.py @@ -19,7 +19,7 @@ import pandas.testing import pytest -from data_validation import metadata +from data_validation import metadata, consts _NAN = float("nan") @@ -149,7 +149,7 @@ def test_generate_report_with_too_many_rows(module_under_test): "difference": [1.0], "pct_difference": [100.0], "pct_threshold": [0.0], - "status": ["fail"], + "status": [consts.VALIDATION_STATUS_FAIL], "labels": [[("name", "test_label")]], } ), @@ -198,7 +198,7 @@ def test_generate_report_with_too_many_rows(module_under_test): "difference": [0.0], "pct_difference": [0.0], "pct_threshold": [0.0], - "status": ["success"], + "status": [consts.VALIDATION_STATUS_SUCCESS], "labels": [[("name", "test_label")]], } ), @@ -255,7 +255,7 @@ def test_generate_report_with_too_many_rows(module_under_test): "difference": [400000000.0], "pct_difference": [25.0], "pct_threshold": [0.0], - "status": ["fail"], + "status": [consts.VALIDATION_STATUS_FAIL], "labels": [[("name", "test_label")]], } ), @@ -317,7 +317,10 @@ def test_generate_report_with_too_many_rows(module_under_test): "difference": [1.0, 2.0], "pct_difference": [12.5, -200.0], "pct_threshold": [30.0, 0.0], - "status": ["success", "fail"], + "status": [ + consts.VALIDATION_STATUS_SUCCESS, + consts.VALIDATION_STATUS_FAIL, + ], "labels": [[("name", "test_label")]] * 2, } ), @@ -413,7 +416,12 @@ def test_generate_report_without_group_by( "difference": [-1.0, -1.0, -1.0, 1.0], "pct_difference": [-50.0, -25.0, -12.5, 6.25], "pct_threshold": [7.0, 7.0, 7.0, 7.0], - "status": ["fail", "fail", "fail", "success"], + "status": [ + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_SUCCESS, + ], "labels": [[("name", "group_label")]] * 4, } ), @@ -459,7 +467,10 @@ def test_generate_report_without_group_by( "difference": [2.0, 2.0], "pct_difference": [200.0, 100.0], "pct_threshold": [100.0, 100.0], - "status": ["fail", "success"], + "status": [ + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_SUCCESS, + ], "labels": [[("name", "group_label")]] * 2, } ), @@ -538,7 +549,14 @@ def test_generate_report_without_group_by( "difference": [-1.0, -1.0, _NAN, _NAN, _NAN, _NAN], "pct_difference": [-50.0, -25.0, _NAN, _NAN, _NAN, _NAN], "pct_threshold": [25.0, 25.0, _NAN, _NAN, _NAN, _NAN], - "status": ["fail", "success", "fail", "fail", "fail", "fail"], + "status": [ + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_SUCCESS, + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_FAIL, + consts.VALIDATION_STATUS_FAIL, + ], "labels": [[("name", "group_label")]] * 6, } ), @@ -625,7 +643,7 @@ def test_generate_report_with_group_by( "difference": [_NAN], "pct_difference": [_NAN], "pct_threshold": [0.0], - "status": ["fail"], + "status": [consts.VALIDATION_STATUS_FAIL], "labels": [[("name", "test_label")]], } ), @@ -670,7 +688,7 @@ def test_generate_report_with_group_by( "difference": [_NAN], "pct_difference": [_NAN], "pct_threshold": [0.0], - "status": ["fail"], + "status": [consts.VALIDATION_STATUS_FAIL], "labels": [[("name", "test_label")]], } ), @@ -715,7 +733,7 @@ def test_generate_report_with_group_by( "difference": [_NAN], "pct_difference": [_NAN], "pct_threshold": [0.0], - "status": ["fail"], + "status": [consts.VALIDATION_STATUS_FAIL], "labels": [[("name", "test_label")]], } ), @@ -777,7 +795,10 @@ def test_generate_report_with_group_by( "difference": [1.0, _NAN], "pct_difference": [12.5, _NAN], "pct_threshold": [30.0, 0.0], - "status": ["success", "fail"], + "status": [ + consts.VALIDATION_STATUS_SUCCESS, + consts.VALIDATION_STATUS_FAIL, + ], "labels": [[("name", "test_label")]] * 2, } ), diff --git a/tests/unit/test_data_validation.py b/tests/unit/test_data_validation.py index cca6fbf69..45d93a7ed 100644 --- a/tests/unit/test_data_validation.py +++ b/tests/unit/test_data_validation.py @@ -516,7 +516,7 @@ def test_status_success_validation(module_under_test, fs): col_a_status = col_a_result_df.status.values[0] assert col_a_pct_threshold == 0.0 - assert col_a_status == "success" + assert col_a_status == consts.VALIDATION_STATUS_SUCCESS def test_status_fail_validation(module_under_test, fs): @@ -530,7 +530,7 @@ def test_status_fail_validation(module_under_test, fs): col_a_status = col_a_result_df.status.values[0] assert col_a_pct_threshold == 0.0 - assert col_a_status == "fail" + assert col_a_status == consts.VALIDATION_STATUS_FAIL def test_threshold_equals_diff(module_under_test, fs): @@ -546,7 +546,7 @@ def test_threshold_equals_diff(module_under_test, fs): assert col_a_pct_diff == 150.0 assert col_a_pct_threshold == 150.0 - assert col_a_status == "success" + assert col_a_status == consts.VALIDATION_STATUS_SUCCESS def test_grouped_column_level_validation_perfect_match(module_under_test, fs): @@ -674,7 +674,7 @@ def test_fail_row_level_validation(module_under_test, fs): result_df = client.execute() # based on shared keys - fail_df = result_df[result_df["status"] == "fail"] + fail_df = result_df[result_df["status"] == consts.VALIDATION_STATUS_FAIL] assert len(fail_df) == 5 @@ -691,7 +691,7 @@ def test_bad_join_row_level_validation(module_under_test, fs): client = module_under_test.DataValidation(SAMPLE_ROW_CONFIG) result_df = client.execute() - comparison_df = result_df[result_df["status"] == "fail"] + comparison_df = result_df[result_df["status"] == consts.VALIDATION_STATUS_FAIL] # 2 validations * (100 source + 1 target) assert len(result_df) == 202 assert len(comparison_df) == 202 diff --git a/tests/unit/test_schema_validation.py b/tests/unit/test_schema_validation.py index f7987fa36..9ebbf9409 100644 --- a/tests/unit/test_schema_validation.py +++ b/tests/unit/test_schema_validation.py @@ -143,13 +143,20 @@ def test_schema_validation_matching(module_under_test): target_fields = {"field1": "string", "field2": "timestamp", "field_3": "string"} expected_results = [ - ["field1", "field1", "1", "1", "Pass", "Source_type:string Target_type:string"], + [ + "field1", + "field1", + "1", + "1", + consts.VALIDATION_STATUS_SUCCESS, + "Source_type:string Target_type:string", + ], [ "field2", "field2", "1", "1", - "Fail", + consts.VALIDATION_STATUS_FAIL, "Data type mismatch between source and target. " "Source_type:datetime Target_type:timestamp", ], @@ -158,7 +165,7 @@ def test_schema_validation_matching(module_under_test): "N/A", "1", "0", - "Fail", + consts.VALIDATION_STATUS_FAIL, "Target doesn't have a matching field name", ], [ @@ -166,7 +173,7 @@ def test_schema_validation_matching(module_under_test): "field_3", "0", "1", - "Fail", + consts.VALIDATION_STATUS_FAIL, "Source doesn't have a matching field name", ], ] @@ -188,7 +195,9 @@ def test_execute(module_under_test, fs): dv_client = data_validation.DataValidation(SAMPLE_SCHEMA_CONFIG, verbose=True) result_df = dv_client.schema_validator.execute() - failures = result_df[result_df["status"].str.contains("Fail")] + failures = result_df[ + result_df["status"].str.contains(consts.VALIDATION_STATUS_FAIL) + ] assert len(result_df) == len(source_data[0]) + 1 assert result_df["source_agg_value"].astype(float).sum() == 7