Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: make status values consistent across validation types (#377) #378

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions data_validation/combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def generate_report(
print(documented.compile())

result_df = client.execute(documented)
result_df.status.fillna("fail", inplace=True)
result_df.status.fillna(consts.VALIDATION_STATUS_FAIL, inplace=True)

return result_df

Expand All @@ -109,8 +109,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
difference = pct_difference = ibis.null()
status = (
ibis.case()
.when(target_value == source_value, "success")
.else_("fail")
.when(target_value == source_value, consts.VALIDATION_STATUS_SUCCESS)
.else_(consts.VALIDATION_STATUS_FAIL)
.end()
)
else:
Expand All @@ -129,8 +129,8 @@ def _calculate_difference(field_differences, datatype, validation, is_value_comp
th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
status = (
ibis.case()
.when(th_diff.isnan() | (th_diff > 0.0), "fail")
.else_("success")
.when(th_diff.isnan() | (th_diff > 0.0), consts.VALIDATION_STATUS_FAIL)
.else_(consts.VALIDATION_STATUS_SUCCESS)
.end()
)

Expand Down
2 changes: 2 additions & 0 deletions data_validation/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@
TARGET_AGG_VALUE = "target_agg_value"

VALIDATION_STATUS = "status"
VALIDATION_STATUS_SUCCESS = "success"
VALIDATION_STATUS_FAIL = "fail"

# SQL Template Formatting
# TODO: should this be managed in query_builder if that is the only place its used?
Expand Down
10 changes: 5 additions & 5 deletions data_validation/schema_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import datetime
import pandas

from data_validation import metadata
from data_validation import metadata, consts


class SchemaValidation(object):
Expand Down Expand Up @@ -100,7 +100,7 @@ def schema_validation_matching(source_fields, target_fields):
source_field_name,
"1",
"1",
"Pass",
consts.VALIDATION_STATUS_SUCCESS,
"Source_type:{} Target_type:{}".format(
source_field_type, target_fields[source_field_name]
),
Expand All @@ -114,7 +114,7 @@ def schema_validation_matching(source_fields, target_fields):
source_field_name,
"1",
"1",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Data type mismatch between source and target. Source_type:{} Target_type:{}".format(
source_field_type, target_fields[source_field_name]
),
Expand All @@ -128,7 +128,7 @@ def schema_validation_matching(source_fields, target_fields):
"N/A",
"1",
"0",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Target doesn't have a matching field name",
]
)
Expand All @@ -142,7 +142,7 @@ def schema_validation_matching(source_fields, target_fields):
target_field_name,
"0",
"1",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Source doesn't have a matching field name",
]
)
Expand Down
2 changes: 1 addition & 1 deletion tests/system/data_sources/test_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def test_schema_validation():
df = validator.execute()

for validation in df.to_dict(orient="records"):
assert validation["status"] == "Pass"
assert validation["status"] == consts.VALIDATION_STATUS_SUCCESS


def test_cli_store_yaml_then_run_gcs():
Expand Down
11 changes: 10 additions & 1 deletion tests/system/result_handlers/test_bigquery.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import pandas
import pandas.testing

from data_validation import consts


REPO_ROOT = pathlib.Path(__file__).parent.parent.parent.parent
SCHEMA_PATH = REPO_ROOT / "terraform" / "results_schema.json"
Expand Down Expand Up @@ -134,7 +136,14 @@ def test_execute_with_nan(bigquery_client, bigquery_dataset_id):
"difference": [-1.0, -1.0, _NAN, _NAN, _NAN, _NAN],
"pct_difference": [-50.0, -25.0, _NAN, _NAN, _NAN, _NAN],
"pct_threshold": [25.0, 25.0, _NAN, _NAN, _NAN, _NAN],
"status": ["fail", "success", _NAN, _NAN, _NAN, _NAN],
"status": [
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_SUCCESS,
_NAN,
_NAN,
_NAN,
_NAN,
],
"labels": [[{"key": "name", "value": "test_label"}]] * 6,
}
)
Expand Down
45 changes: 33 additions & 12 deletions tests/unit/test_combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pandas.testing
import pytest

from data_validation import metadata
from data_validation import metadata, consts


_NAN = float("nan")
Expand Down Expand Up @@ -149,7 +149,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
"difference": [1.0],
"pct_difference": [100.0],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -198,7 +198,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
"difference": [0.0],
"pct_difference": [0.0],
"pct_threshold": [0.0],
"status": ["success"],
"status": [consts.VALIDATION_STATUS_SUCCESS],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -255,7 +255,7 @@ def test_generate_report_with_too_many_rows(module_under_test):
"difference": [400000000.0],
"pct_difference": [25.0],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -317,7 +317,10 @@ def test_generate_report_with_too_many_rows(module_under_test):
"difference": [1.0, 2.0],
"pct_difference": [12.5, -200.0],
"pct_threshold": [30.0, 0.0],
"status": ["success", "fail"],
"status": [
consts.VALIDATION_STATUS_SUCCESS,
consts.VALIDATION_STATUS_FAIL,
],
"labels": [[("name", "test_label")]] * 2,
}
),
Expand Down Expand Up @@ -413,7 +416,12 @@ def test_generate_report_without_group_by(
"difference": [-1.0, -1.0, -1.0, 1.0],
"pct_difference": [-50.0, -25.0, -12.5, 6.25],
"pct_threshold": [7.0, 7.0, 7.0, 7.0],
"status": ["fail", "fail", "fail", "success"],
"status": [
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_SUCCESS,
],
"labels": [[("name", "group_label")]] * 4,
}
),
Expand Down Expand Up @@ -459,7 +467,10 @@ def test_generate_report_without_group_by(
"difference": [2.0, 2.0],
"pct_difference": [200.0, 100.0],
"pct_threshold": [100.0, 100.0],
"status": ["fail", "success"],
"status": [
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_SUCCESS,
],
"labels": [[("name", "group_label")]] * 2,
}
),
Expand Down Expand Up @@ -538,7 +549,14 @@ def test_generate_report_without_group_by(
"difference": [-1.0, -1.0, _NAN, _NAN, _NAN, _NAN],
"pct_difference": [-50.0, -25.0, _NAN, _NAN, _NAN, _NAN],
"pct_threshold": [25.0, 25.0, _NAN, _NAN, _NAN, _NAN],
"status": ["fail", "success", "fail", "fail", "fail", "fail"],
"status": [
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_SUCCESS,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
consts.VALIDATION_STATUS_FAIL,
],
"labels": [[("name", "group_label")]] * 6,
}
),
Expand Down Expand Up @@ -625,7 +643,7 @@ def test_generate_report_with_group_by(
"difference": [_NAN],
"pct_difference": [_NAN],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -670,7 +688,7 @@ def test_generate_report_with_group_by(
"difference": [_NAN],
"pct_difference": [_NAN],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -715,7 +733,7 @@ def test_generate_report_with_group_by(
"difference": [_NAN],
"pct_difference": [_NAN],
"pct_threshold": [0.0],
"status": ["fail"],
"status": [consts.VALIDATION_STATUS_FAIL],
"labels": [[("name", "test_label")]],
}
),
Expand Down Expand Up @@ -777,7 +795,10 @@ def test_generate_report_with_group_by(
"difference": [1.0, _NAN],
"pct_difference": [12.5, _NAN],
"pct_threshold": [30.0, 0.0],
"status": ["success", "fail"],
"status": [
consts.VALIDATION_STATUS_SUCCESS,
consts.VALIDATION_STATUS_FAIL,
],
"labels": [[("name", "test_label")]] * 2,
}
),
Expand Down
10 changes: 5 additions & 5 deletions tests/unit/test_data_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ def test_status_success_validation(module_under_test, fs):
col_a_status = col_a_result_df.status.values[0]

assert col_a_pct_threshold == 0.0
assert col_a_status == "success"
assert col_a_status == consts.VALIDATION_STATUS_SUCCESS


def test_status_fail_validation(module_under_test, fs):
Expand All @@ -530,7 +530,7 @@ def test_status_fail_validation(module_under_test, fs):
col_a_status = col_a_result_df.status.values[0]

assert col_a_pct_threshold == 0.0
assert col_a_status == "fail"
assert col_a_status == consts.VALIDATION_STATUS_FAIL


def test_threshold_equals_diff(module_under_test, fs):
Expand All @@ -546,7 +546,7 @@ def test_threshold_equals_diff(module_under_test, fs):

assert col_a_pct_diff == 150.0
assert col_a_pct_threshold == 150.0
assert col_a_status == "success"
assert col_a_status == consts.VALIDATION_STATUS_SUCCESS


def test_grouped_column_level_validation_perfect_match(module_under_test, fs):
Expand Down Expand Up @@ -674,7 +674,7 @@ def test_fail_row_level_validation(module_under_test, fs):
result_df = client.execute()

# based on shared keys
fail_df = result_df[result_df["status"] == "fail"]
fail_df = result_df[result_df["status"] == consts.VALIDATION_STATUS_FAIL]
assert len(fail_df) == 5


Expand All @@ -691,7 +691,7 @@ def test_bad_join_row_level_validation(module_under_test, fs):
client = module_under_test.DataValidation(SAMPLE_ROW_CONFIG)
result_df = client.execute()

comparison_df = result_df[result_df["status"] == "fail"]
comparison_df = result_df[result_df["status"] == consts.VALIDATION_STATUS_FAIL]
# 2 validations * (100 source + 1 target)
assert len(result_df) == 202
assert len(comparison_df) == 202
19 changes: 14 additions & 5 deletions tests/unit/test_schema_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,20 @@ def test_schema_validation_matching(module_under_test):
target_fields = {"field1": "string", "field2": "timestamp", "field_3": "string"}

expected_results = [
["field1", "field1", "1", "1", "Pass", "Source_type:string Target_type:string"],
[
"field1",
"field1",
"1",
"1",
consts.VALIDATION_STATUS_SUCCESS,
"Source_type:string Target_type:string",
],
[
"field2",
"field2",
"1",
"1",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Data type mismatch between source and target. "
"Source_type:datetime Target_type:timestamp",
],
Expand All @@ -158,15 +165,15 @@ def test_schema_validation_matching(module_under_test):
"N/A",
"1",
"0",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Target doesn't have a matching field name",
],
[
"N/A",
"field_3",
"0",
"1",
"Fail",
consts.VALIDATION_STATUS_FAIL,
"Source doesn't have a matching field name",
],
]
Expand All @@ -188,7 +195,9 @@ def test_execute(module_under_test, fs):

dv_client = data_validation.DataValidation(SAMPLE_SCHEMA_CONFIG, verbose=True)
result_df = dv_client.schema_validator.execute()
failures = result_df[result_df["status"].str.contains("Fail")]
failures = result_df[
result_df["status"].str.contains(consts.VALIDATION_STATUS_FAIL)
]

assert len(result_df) == len(source_data[0]) + 1
assert result_df["source_agg_value"].astype(float).sum() == 7
Expand Down