Skip to content

Commit

Permalink
fix: make status values consistent across validation types (#377) (#378)
Browse files Browse the repository at this point in the history
* fix: make status values consistent across validation types (#377)

* fix: make validation status values consts (#377)
  • Loading branch information
ajwelch4 committed Mar 2, 2022
1 parent 806151a commit 5c08463
Show file tree
Hide file tree
Showing 8 changed files with 75 additions and 34 deletions.
10 changes: 5 additions & 5 deletions data_validation/combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def generate_report(
print(documented.compile())

result_df = client.execute(documented)
result_df.status.fillna("fail", inplace=True)
result_df.status.fillna(consts.VALIDATION_STATUS_FAIL, inplace=True)

return result_df

Expand All @@ -109,8 +109,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
difference = pct_difference = ibis.null()
status = (
ibis.case()
.when(target_value == source_value, "success")
.else_("fail")
.when(target_value == source_value, consts.VALIDATION_STATUS_SUCCESS)
.else_(consts.VALIDATION_STATUS_FAIL)
.end()
)
else:
Expand All @@ -129,8 +129,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
status = (
ibis.case()
.when(th_diff.isnan() | (th_diff > 0.0), "fail")
.else_("success")
.when(th_diff.isnan() | (th_diff > 0.0), consts.VALIDATION_STATUS_FAIL)
.else_(consts.VALIDATION_STATUS_SUCCESS)
.end()
)

Expand Down
2 changes: 2 additions & 0 deletions data_validation/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@
TARGET_AGG_VALUE = "target_agg_value"

VALIDATION_STATUS = "status"
VALIDATION_STATUS_SUCCESS = "success"
VALIDATION_STATUS_FAIL = "fail"

# SQL Template Formatting
# TODO: should this be managed in query_builder if that is the only place its used?
Expand Down
10 changes: 5 additions & 5 deletions data_validation/schema_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import datetime
import pandas

from data_validation import metadata
from data_validation import metadata, consts


class SchemaValidation(object):
Expand Down Expand Up @@ -100,7 +100,7 @@ def schema_validation_matching(source_fields, target_fields):
source_field_name,
"1",
"1",
"Pass",
consts.VALIDATION_STATUS_SUCCESS,
"Source_type:{} Target_type:{}".format(
source_field_type, target_fields[source_field_name]
),
Expand All @@ -114,7 +114,7 @@ def schema_validation_matching(source_fields, target_fields):
source_field_name,
"1",
"1",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Data type mismatch between source and target. Source_type:{} Target_type:{}".format(
source_field_type, target_fields[source_field_name]
),
Expand All @@ -128,7 +128,7 @@ def schema_validation_matching(source_fields, target_fields):
"N/A",
"1",
"0",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Target doesn't have a matching field name",
]
)
Expand All @@ -142,7 +142,7 @@ def schema_validation_matching(source_fields, target_fields):
target_field_name,
"0",
"1",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Source doesn't have a matching field name",
]
)
Expand Down
2 changes: 1 addition & 1 deletion tests/system/data_sources/test_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def test_schema_validation():
df = validator.execute()

for validation in df.to_dict(orient="records"):
assert validation["status"] == "Pass"
assert validation["status"] == consts.VALIDATION_STATUS_SUCCESS


def test_cli_store_yaml_then_run_gcs():
Expand Down
11 changes: 10 additions & 1 deletion tests/system/result_handlers/test_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import pandas
import pandas.testing

from data_validation import consts


REPO_ROOT = pathlib.Path(__file__).parent.parent.parent.parent
SCHEMA_PATH = REPO_ROOT / "terraform" / "results_schema.json"
Expand Down Expand Up @@ -134,7 +136,14 @@ def test_execute_with_nan(bigquery_client, bigquery_dataset_id):
"difference": [-1.0, -1.0, _NAN, _NAN, _NAN, _NAN],
"pct_difference": [-50.0, -25.0, _NAN, _NAN, _NAN, _NAN],
"pct_threshold": [25.0, 25.0, _NAN, _NAN, _NAN, _NAN],
"status": ["fail", "success", _NAN, _NAN, _NAN, _NAN],
"status": [
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_SUCCESS,
_NAN,
_NAN,
_NAN,
_NAN,
],
"labels": [[{"key": "name", "value": "test_label"}]] * 6,
}
)
Expand Down
45 changes: 33 additions & 12 deletions tests/unit/test_combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pandas.testing
import pytest

from data_validation import metadata
from data_validation import metadata, consts


_NAN = float("nan")
Expand Down Expand Up @@ -149,7 +149,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
"difference": [1.0],
"pct_difference": [100.0],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -198,7 +198,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
"difference": [0.0],
"pct_difference": [0.0],
"pct_threshold": [0.0],
"status": ["success"],
"status": [consts.VALIDATION_STATUS_SUCCESS],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -255,7 +255,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
"difference": [400000000.0],
"pct_difference": [25.0],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -317,7 +317,10 @@ def test_generate_report_with_too_many_rows(module_under_test):
"difference": [1.0, 2.0],
"pct_difference": [12.5, -200.0],
"pct_threshold": [30.0, 0.0],
"status": ["success", "fail"],
"status": [
consts.VALIDATION_STATUS_SUCCESS,
consts.VALIDATION_STATUS_FAIL,
],
"labels": [[("name", "test_label")]] * 2,
}
),
Expand Down Expand Up @@ -413,7 +416,12 @@ def test_generate_report_without_group_by(
"difference": [-1.0, -1.0, -1.0, 1.0],
"pct_difference": [-50.0, -25.0, -12.5, 6.25],
"pct_threshold": [7.0, 7.0, 7.0, 7.0],
"status": ["fail", "fail", "fail", "success"],
"status": [
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_SUCCESS,
],
"labels": [[("name", "group_label")]] * 4,
}
),
Expand Down Expand Up @@ -459,7 +467,10 @@ def test_generate_report_without_group_by(
"difference": [2.0, 2.0],
"pct_difference": [200.0, 100.0],
"pct_threshold": [100.0, 100.0],
"status": ["fail", "success"],
"status": [
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_SUCCESS,
],
"labels": [[("name", "group_label")]] * 2,
}
),
Expand Down Expand Up @@ -538,7 +549,14 @@ def test_generate_report_without_group_by(
"difference": [-1.0, -1.0, _NAN, _NAN, _NAN, _NAN],
"pct_difference": [-50.0, -25.0, _NAN, _NAN, _NAN, _NAN],
"pct_threshold": [25.0, 25.0, _NAN, _NAN, _NAN, _NAN],
"status": ["fail", "success", "fail", "fail", "fail", "fail"],
"status": [
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_SUCCESS,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
],
"labels": [[("name", "group_label")]] * 6,
}
),
Expand Down Expand Up @@ -625,7 +643,7 @@ def test_generate_report_with_group_by(
"difference": [_NAN],
"pct_difference": [_NAN],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -670,7 +688,7 @@ def test_generate_report_with_group_by(
"difference": [_NAN],
"pct_difference": [_NAN],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -715,7 +733,7 @@ def test_generate_report_with_group_by(
"difference": [_NAN],
"pct_difference": [_NAN],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -777,7 +795,10 @@ def test_generate_report_with_group_by(
"difference": [1.0, _NAN],
"pct_difference": [12.5, _NAN],
"pct_threshold": [30.0, 0.0],
"status": ["success", "fail"],
"status": [
consts.VALIDATION_STATUS_SUCCESS,
consts.VALIDATION_STATUS_FAIL,
],
"labels": [[("name", "test_label")]] * 2,
}
),
Expand Down
10 changes: 5 additions & 5 deletions tests/unit/test_data_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ def test_status_success_validation(module_under_test, fs):
col_a_status = col_a_result_df.status.values[0]

assert col_a_pct_threshold == 0.0
assert col_a_status == "success"
assert col_a_status == consts.VALIDATION_STATUS_SUCCESS


def test_status_fail_validation(module_under_test, fs):
Expand All @@ -530,7 +530,7 @@ def test_status_fail_validation(module_under_test, fs):
col_a_status = col_a_result_df.status.values[0]

assert col_a_pct_threshold == 0.0
assert col_a_status == "fail"
assert col_a_status == consts.VALIDATION_STATUS_FAIL


def test_threshold_equals_diff(module_under_test, fs):
Expand All @@ -546,7 +546,7 @@ def test_threshold_equals_diff(module_under_test, fs):

assert col_a_pct_diff == 150.0
assert col_a_pct_threshold == 150.0
assert col_a_status == "success"
assert col_a_status == consts.VALIDATION_STATUS_SUCCESS


def test_grouped_column_level_validation_perfect_match(module_under_test, fs):
Expand Down Expand Up @@ -674,7 +674,7 @@ def test_fail_row_level_validation(module_under_test, fs):
result_df = client.execute()

# based on shared keys
fail_df = result_df[result_df["status"] == "fail"]
fail_df = result_df[result_df["status"] == consts.VALIDATION_STATUS_FAIL]
assert len(fail_df) == 5


Expand All @@ -691,7 +691,7 @@ def test_bad_join_row_level_validation(module_under_test, fs):
client = module_under_test.DataValidation(SAMPLE_ROW_CONFIG)
result_df = client.execute()

comparison_df = result_df[result_df["status"] == "fail"]
comparison_df = result_df[result_df["status"] == consts.VALIDATION_STATUS_FAIL]
# 2 validations * (100 source + 1 target)
assert len(result_df) == 202
assert len(comparison_df) == 202
19 changes: 14 additions & 5 deletions tests/unit/test_schema_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,20 @@ def test_schema_validation_matching(module_under_test):
target_fields = {"field1": "string", "field2": "timestamp", "field_3": "string"}

expected_results = [
["field1", "field1", "1", "1", "Pass", "Source_type:string Target_type:string"],
[
"field1",
"field1",
"1",
"1",
consts.VALIDATION_STATUS_SUCCESS,
"Source_type:string Target_type:string",
],
[
"field2",
"field2",
"1",
"1",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Data type mismatch between source and target. "
"Source_type:datetime Target_type:timestamp",
],
Expand All @@ -158,15 +165,15 @@ def test_schema_validation_matching(module_under_test):
"N/A",
"1",
"0",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Target doesn't have a matching field name",
],
[
"N/A",
"field_3",
"0",
"1",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Source doesn't have a matching field name",
],
]
Expand All @@ -188,7 +195,9 @@ def test_execute(module_under_test, fs):

dv_client = data_validation.DataValidation(SAMPLE_SCHEMA_CONFIG, verbose=True)
result_df = dv_client.schema_validator.execute()
failures = result_df[result_df["status"].str.contains("Fail")]
failures = result_df[
result_df["status"].str.contains(consts.VALIDATION_STATUS_FAIL)
]

assert len(result_df) == len(source_data[0]) + 1
assert result_df["source_agg_value"].astype(float).sum() == 7
Expand Down

0 comments on commit 5c08463

Please sign in to comment.