Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Schema validations ignore not null on Teradata and BigQuery #935

Merged
merged 13 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions tests/resources/bigquery_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,11 @@ INSERT INTO `pso_data_validator`.`dvt_core_types` VALUES
,'Hello DVT','C ','Hello DVT'
,DATE '1970-01-03',DATETIME '1970-01-03 00:00:03'
,TIMESTAMP '1970-01-03 00:00:03-03:00');

-- Nullability test fixture: BigQuery is the DVT *target* in these tests.
-- col_nn / col_nullable match the source engines' nullability (validation
-- should succeed); col_src_nn_trg_n / col_src_n_trg_nn deliberately invert
-- the source setting so schema validation must report a mismatch.
-- IF EXISTS makes the fixture script idempotent on a fresh dataset.
DROP TABLE IF EXISTS `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
( col_nn DATETIME NOT NULL
, col_nullable DATETIME
, col_src_nn_trg_n DATETIME
, col_src_n_trg_nn DATETIME NOT NULL
) OPTIONS (description='Nullable integration test table, BigQuery is assumed to be a DVT target (not source)');
9 changes: 9 additions & 0 deletions tests/resources/hive_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,12 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,12345678901234567890,1234567890123456789012345,123.33,123456.3,12345678.3
,'Hello DVT','C ','Hello DVT'
,'1970-01-03','1970-01-03 00:00:03','1970-01-03 03:00:03');


-- Nullability test fixture: Hive is assumed to be a DVT *source*.
-- col_src_nn_trg_n / col_src_n_trg_nn deliberately invert the BigQuery
-- target's NOT NULL settings so schema validation must flag them.
-- IF EXISTS makes the fixture script idempotent on a fresh schema.
DROP TABLE IF EXISTS `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
( col_nn timestamp NOT NULL
, col_nullable timestamp
, col_src_nn_trg_n timestamp NOT NULL
, col_src_n_trg_nn timestamp
) COMMENT 'Nullable integration test table, Hive is assumed to be a DVT source (not target).';
8 changes: 8 additions & 0 deletions tests/resources/mysql_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,11 @@ INSERT INTO `pso_data_validator`.`dvt_core_types` VALUES
,12345678901234567890,1234567890123456789012345,123.33,123456.3,12345678.3
,'Hello DVT','C ','Hello DVT'
,'1970-01-03','1970-01-03 00:00:03','1970-01-03 03:00:03');

-- Nullability test fixture: MySQL is assumed to be a DVT *source*.
-- col_src_nn_trg_n / col_src_n_trg_nn deliberately invert the BigQuery
-- target's NOT NULL settings so schema validation must flag them.
-- IF EXISTS makes the fixture script idempotent on a fresh schema.
DROP TABLE IF EXISTS `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
( col_nn datetime(0) NOT NULL
, col_nullable datetime(0)
, col_src_nn_trg_n datetime(0) NOT NULL
, col_src_n_trg_nn datetime(0)
) COMMENT 'Nullable integration test table, MySQL is assumed to be a DVT source (not target).';
9 changes: 9 additions & 0 deletions tests/resources/oracle_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,12 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03'
,to_timestamp_tz('1970-01-03 00:00:03 -03:00','YYYY-MM-DD HH24:MI:SS TZH:TZM'));
COMMIT;

-- Nullability test fixture: Oracle is assumed to be a DVT *source*.
-- col_src_nn_trg_n / col_src_n_trg_nn deliberately invert the BigQuery
-- target's NOT NULL settings so schema validation must flag them.
-- NOTE(review): plain DROP raises ORA-00942 if the table does not yet
-- exist; Oracle (pre-23c) has no DROP TABLE IF EXISTS equivalent.
DROP TABLE pso_data_validator.dvt_null_not_null;
CREATE TABLE pso_data_validator.dvt_null_not_null
( col_nn TIMESTAMP(0) NOT NULL
, col_nullable TIMESTAMP(0)
, col_src_nn_trg_n TIMESTAMP(0) NOT NULL
, col_src_n_trg_nn TIMESTAMP(0)
);
COMMENT ON TABLE pso_data_validator.dvt_null_not_null IS 'Nullable integration test table, Oracle is assumed to be a DVT source (not target).';
11 changes: 10 additions & 1 deletion tests/resources/postgresql_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,13 @@ INSERT INTO public.test_generate_partitions (course_id, quarter_id, student_id,
('TRI001', 2, 9012, 3.5),
('TRI001', 3, 1234, 2.7),
('TRI001', 3, 5678, 3.5),
('TRI001', 3, 9012, 2.8);
('TRI001', 3, 9012, 2.8);

-- Nullability test fixture: PostgreSQL is assumed to be a DVT *source*.
-- col_src_nn_trg_n / col_src_n_trg_nn deliberately invert the BigQuery
-- target's NOT NULL settings so schema validation must flag them.
-- IF EXISTS makes the fixture script idempotent on a fresh schema.
DROP TABLE IF EXISTS pso_data_validator.dvt_null_not_null;
CREATE TABLE pso_data_validator.dvt_null_not_null
( col_nn TIMESTAMP(0) NOT NULL
, col_nullable TIMESTAMP(0)
, col_src_nn_trg_n TIMESTAMP(0) NOT NULL
, col_src_n_trg_nn TIMESTAMP(0)
);
COMMENT ON TABLE pso_data_validator.dvt_null_not_null IS 'Nullable integration test table, PostgreSQL is assumed to be a DVT source (not target).';
11 changes: 10 additions & 1 deletion tests/resources/snowflake_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,13 @@ INSERT INTO PSO_DATA_VALIDATOR.PUBLIC.TEST_GENERATE_PARTITIONS (COURSE_ID, QUART
('TRI001', 2, 9012, 3.5),
('TRI001', 3, 1234, 2.7),
('TRI001', 3, 5678, 3.5),
('TRI001', 3, 9012, 2.8);
('TRI001', 3, 9012, 2.8);

-- Nullability test fixture: Snowflake is assumed to be a DVT *source*.
-- col_src_nn_trg_n / col_src_n_trg_nn deliberately invert the BigQuery
-- target's NOT NULL settings so schema validation must flag them.
-- IF EXISTS makes the fixture script idempotent on a fresh schema.
DROP TABLE IF EXISTS PSO_DATA_VALIDATOR.PUBLIC.DVT_NULL_NOT_NULL;
CREATE TABLE PSO_DATA_VALIDATOR.PUBLIC.DVT_NULL_NOT_NULL
( col_nn TIMESTAMP(0) NOT NULL
, col_nullable TIMESTAMP(0)
, col_src_nn_trg_n TIMESTAMP(0) NOT NULL
, col_src_n_trg_nn TIMESTAMP(0)
);
-- Fixed copy/paste defect: the comment previously said "Oracle" in the
-- Snowflake fixture file.
COMMENT ON TABLE PSO_DATA_VALIDATOR.PUBLIC.DVT_NULL_NOT_NULL IS 'Nullable integration test table, Snowflake is assumed to be a DVT source (not target).';
10 changes: 9 additions & 1 deletion tests/resources/sqlserver_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,12 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,12345678901234567890,1234567890123456789012345,123.33,123456.3,12345678.3
,'Hello DVT','C ','Hello DVT'
,'1970-01-03','1970-01-03 00:00:03'
,cast('1970-01-03 00:00:03 -03:00' as datetimeoffset(3)));
,cast('1970-01-03 00:00:03 -03:00' as datetimeoffset(3)));

-- Nullability test fixture: SQL Server is assumed to be a DVT *source*.
-- col_src_nn_trg_n / col_src_n_trg_nn deliberately invert the BigQuery
-- target's NOT NULL settings so schema validation must flag them.
-- NOTE(review): plain DROP errors if the table is absent; DROP TABLE IF
-- EXISTS needs SQL Server 2016+ — confirm minimum supported version
-- before adopting it here.
DROP TABLE pso_data_validator.dvt_null_not_null;
CREATE TABLE pso_data_validator.dvt_null_not_null
( col_nn datetime2(0) NOT NULL
, col_nullable datetime2(0)
, col_src_nn_trg_n datetime2(0) NOT NULL
, col_src_n_trg_nn datetime2(0)
);
10 changes: 10 additions & 0 deletions tests/resources/teradata_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ CREATE TABLE udf.dvt_core_types
, col_datetime TIMESTAMP(3)
, col_tstz TIMESTAMP(3) WITH TIME ZONE
);
COMMENT ON TABLE udf.dvt_core_types AS 'Core data types integration test table';

INSERT INTO udf.dvt_core_types VALUES
(1,1,1,1,1
Expand All @@ -50,3 +51,12 @@ INSERT INTO udf.dvt_core_types VALUES
,'Hello DVT','C ','Hello DVT'
,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03'
,CAST('1970-01-03 00:00:03.000-03:00' AS TIMESTAMP(3) WITH TIME ZONE));

-- Nullability test fixture: Teradata is assumed to be a DVT *source*.
-- col_src_nn_trg_n / col_src_n_trg_nn deliberately invert the BigQuery
-- target's NOT NULL settings so schema validation must flag them.
-- NOTE(review): Teradata has no DROP TABLE IF EXISTS; this DROP fails
-- with error 3807 when the table does not yet exist.
DROP TABLE udf.dvt_null_not_null;
CREATE TABLE udf.dvt_null_not_null
( col_nn TIMESTAMP(0) NOT NULL
, col_nullable TIMESTAMP(0)
, col_src_nn_trg_n TIMESTAMP(0) NOT NULL
, col_src_n_trg_nn TIMESTAMP(0)
);
COMMENT ON TABLE udf.dvt_null_not_null AS 'Nullable integration test table, Teradata is assumed to be a DVT source (not target).';
35 changes: 35 additions & 0 deletions tests/system/data_sources/common_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def null_not_null_assertions(df):
    """Standard assertions for the null_not_null integration test.

    These tests use BigQuery as a fixed target with a deliberate mismatch of
    not null/nullable settings. All other engines are validated against
    BigQuery to check we get the correct status.

    Args:
        df: Schema-validation result DataFrame with one row per validated
            column, carrying "source_column_name" and "validation_status".
    """
    # One result row per column of dvt_null_not_null.
    assert len(df) == 4
    expected_status = {
        # Nullability matches on both engines: validation should succeed.
        "col_nn": "success",
        "col_nullable": "success",
        # Nullability deliberately differs between source and target: should fail.
        "col_src_nn_trg_n": "fail",
        "col_src_n_trg_nn": "fail",
    }
    for column_name, status in zip(df["source_column_name"], df["validation_status"]):
        assert column_name in expected_status
        expected = expected_status[column_name]
        assert (
            status == expected
        ), f"Column: {column_name}, status: {status} != '{expected}'"
30 changes: 30 additions & 0 deletions tests/system/data_sources/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from data_validation import __main__ as main
from data_validation import cli_tools, data_validation, consts
from data_validation.partition_builder import PartitionBuilder
from tests.system.data_sources.common_functions import null_not_null_assertions
from tests.system.data_sources.test_bigquery import BQ_CONN


Expand Down Expand Up @@ -161,6 +162,7 @@ def test_schema_validation_core_types_to_bigquery():
"-tc=bq-conn",
"-tbls=pso_data_validator.dvt_core_types",
"--filter-status=fail",
"--exclusion-columns=id",
(
# All Hive integrals go to BigQuery INT64.
"--allow-list=int8:int64,int16:int64,int32:int64,"
Expand All @@ -184,6 +186,34 @@ def test_schema_validation_core_types_to_bigquery():
assert len(df) == 0


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def disabled_test_schema_validation_not_null_vs_nullable():
    """
    Disabled this test because we don't currently pull nullable from Hive.
    https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/934
    Compares a source table with a BigQuery target and ensures we match/fail on not null/nullable correctly.
    """
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "schema",
            "-sc=hive-conn",
            "-tc=bq-conn",
            "-tbls=pso_data_validator.dvt_null_not_null=pso_data_validator.dvt_null_not_null",
        ]
    )
    config_managers = main.build_config_managers_from_args(args)
    assert len(config_managers) == 1
    config_manager = config_managers[0]
    validator = data_validation.DataValidation(config_manager.config, verbose=False)
    df = validator.execute()
    # Shared assertions: matching nullability succeeds, mismatches fail.
    null_not_null_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
39 changes: 32 additions & 7 deletions tests/system/data_sources/test_mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from data_validation import __main__ as main
from data_validation import cli_tools, data_validation, consts, exceptions
from data_validation.partition_builder import PartitionBuilder
from tests.system.data_sources.common_functions import null_not_null_assertions
from tests.system.data_sources.test_bigquery import BQ_CONN


Expand Down Expand Up @@ -64,6 +65,13 @@
}


def mock_get_connection_config(*args):
    """Test double for StateManager.get_connection_config.

    Maps the requested connection name (``args[1]``) to a canned connection
    dict: "mysql-conn"/"mock-conn" return the MySQL CONN, "bq-conn" returns
    BQ_CONN; any other name falls through and returns None.
    """
    connection_name = args[1]
    if connection_name == "bq-conn":
        return BQ_CONN
    if connection_name in ("mysql-conn", "mock-conn"):
        return CONN

def test_mysql_count_invalid_host():
try:
data_validator = data_validation.DataValidation(
Expand Down Expand Up @@ -428,13 +436,6 @@ def test_mysql_row():
pass


def mock_get_connection_config(*args):
if args[1] in ("mysql-conn", "mock-conn"):
return CONN
elif args[1] == "bq-conn":
return BQ_CONN


# Expected result from partitioning table on 3 keys
EXPECTED_PARTITION_FILTER = [
"course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))",
Expand Down Expand Up @@ -508,6 +509,30 @@ def test_schema_validation_core_types():
assert len(df) == 0


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_schema_validation_not_null_vs_nullable():
    """Compares a source table with a BigQuery target and ensures we match/fail on not null/nullable correctly."""
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "schema",
            "-sc=mysql-conn",
            "-tc=bq-conn",
            "-tbls=pso_data_validator.dvt_null_not_null=pso_data_validator.dvt_null_not_null",
        ]
    )
    config_managers = main.build_config_managers_from_args(args)
    assert len(config_managers) == 1
    config_manager = config_managers[0]
    validator = data_validation.DataValidation(config_manager.config, verbose=False)
    df = validator.execute()
    # Shared assertions: matching nullability succeeds, mismatches fail.
    null_not_null_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
28 changes: 27 additions & 1 deletion tests/system/data_sources/test_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from data_validation import __main__ as main
from data_validation import cli_tools, data_validation, consts
from data_validation.partition_builder import PartitionBuilder
from tests.system.data_sources.common_functions import null_not_null_assertions
from tests.system.data_sources.test_bigquery import BQ_CONN


Expand Down Expand Up @@ -160,9 +161,10 @@ def test_schema_validation_core_types_to_bigquery():
"-tc=bq-conn",
"-tbls=pso_data_validator.dvt_core_types",
"--filter-status=fail",
"--exclusion-columns=id",
(
# Integral Oracle NUMBERS go to BigQuery INT64.
"--allow-list=!decimal(8,0):int64,decimal(2,0):int64,decimal(4,0):int64,decimal(9,0):int64,decimal(18,0):int64,"
"--allow-list=decimal(2,0):int64,decimal(4,0):int64,decimal(9,0):int64,decimal(18,0):int64,"
# Oracle NUMBERS that map to BigQuery NUMERIC.
"decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9),"
# Oracle NUMBERS that map to BigQuery BIGNUMERIC.
Expand All @@ -181,6 +183,30 @@ def test_schema_validation_core_types_to_bigquery():
assert len(df) == 0


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_schema_validation_not_null_vs_nullable():
    """Compares a source table with a BigQuery target and ensures we match/fail on not null/nullable correctly."""
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "schema",
            "-sc=ora-conn",
            "-tc=bq-conn",
            "-tbls=pso_data_validator.dvt_null_not_null=pso_data_validator.dvt_null_not_null",
        ]
    )
    config_managers = main.build_config_managers_from_args(args)
    assert len(config_managers) == 1
    config_manager = config_managers[0]
    validator = data_validation.DataValidation(config_manager.config, verbose=False)
    df = validator.execute()
    # Shared assertions: matching nullability succeeds, mismatches fail.
    null_not_null_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
28 changes: 27 additions & 1 deletion tests/system/data_sources/test_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from tests.system.data_sources.deploy_cloudsql.cloudsql_resource_manager import (
CloudSQLResourceManager,
)
from tests.system.data_sources.common_functions import null_not_null_assertions
from tests.system.data_sources.test_bigquery import BQ_CONN


Expand Down Expand Up @@ -560,10 +561,11 @@ def test_schema_validation_core_types_to_bigquery():
"-sc=pg-conn",
"-tc=bq-conn",
"-tbls=pso_data_validator.dvt_core_types",
"--exclusion-columns=id",
"--filter-status=fail",
(
# PostgreSQL integrals go to BigQuery INT64.
"--allow-list=int16:int64,int32:int64,!int32:int64,"
"--allow-list=int16:int64,int32:int64,"
# Oracle NUMBERS that map to BigQuery NUMERIC.
"decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9),"
# Oracle NUMBERS that map to BigQuery BIGNUMERIC.
Expand All @@ -582,6 +584,30 @@ def test_schema_validation_core_types_to_bigquery():
assert len(df) == 0


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_schema_validation_not_null_vs_nullable():
    """Compares a source table with a BigQuery target and ensures we match/fail on not null/nullable correctly."""
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "schema",
            "-sc=pg-conn",
            "-tc=bq-conn",
            "-tbls=pso_data_validator.dvt_null_not_null=pso_data_validator.dvt_null_not_null",
        ]
    )
    config_managers = main.build_config_managers_from_args(args)
    assert len(config_managers) == 1
    config_manager = config_managers[0]
    validator = data_validation.DataValidation(config_manager.config, verbose=False)
    df = validator.execute()
    # Shared assertions: matching nullability succeeds, mismatches fail.
    null_not_null_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
Loading