Skip to content

Commit

Permalink
fix: Schema validations ignore not null on Teradata and BigQuery (#935)
Browse files Browse the repository at this point in the history
* tests: Add null/not null schema validation integration tests and supporting tables

* fix: Teradata now sets nullable correctly

* chore: Reformat code

* fix: BigQuery now sets nullable correctly

* tests: Add null/not null schema validation integration tests and supporting table for Hive

* chore: Edit comment

* chore: Disable 773da9c fix to understand build failure

* tests: Remove duplicate function from mysql tests

* Revert "chore: Disable 773da9c fix to understand build failure"

This reverts commit f66dc86.

* tests: Exclude dvt_core_types id column from schema validation test

* tests: Exclude dvt_core_types id column from schema validation test

* tests: Disable test_schema_validation_not_null_vs_nullable on Hive
  • Loading branch information
nj1973 committed Aug 14, 2023
1 parent ee7ae9a commit 936744b
Show file tree
Hide file tree
Showing 18 changed files with 426 additions and 53 deletions.
8 changes: 8 additions & 0 deletions tests/resources/bigquery_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,11 @@ INSERT INTO `pso_data_validator`.`dvt_core_types` VALUES
,'Hello DVT','C ','Hello DVT'
,DATE '1970-01-03',DATETIME '1970-01-03 00:00:03'
,TIMESTAMP '1970-01-03 00:00:03-03:00');

-- Nullability integration test table. BigQuery is the DVT *target* here, so
-- the two col_src_* columns deliberately invert the NOT NULL setting used by
-- the source-engine tables, and schema validation should flag both of them.
DROP TABLE `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
-- NOT NULL on both source and target: expected to validate as "success".
( col_nn DATETIME NOT NULL
-- Nullable on both source and target: expected to validate as "success".
, col_nullable DATETIME
-- NOT NULL on source engines, nullable here: expected to validate as "fail".
, col_src_nn_trg_n DATETIME
-- Nullable on source engines, NOT NULL here: expected to validate as "fail".
, col_src_n_trg_nn DATETIME NOT NULL
) OPTIONS (description='Nullable integration test table, BigQuery is assumed to be a DVT target (not source)');
9 changes: 9 additions & 0 deletions tests/resources/hive_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,12 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,12345678901234567890,1234567890123456789012345,123.33,123456.3,12345678.3
,'Hello DVT','C ','Hello DVT'
,'1970-01-03','1970-01-03 00:00:03','1970-01-03 03:00:03');


-- Nullability integration test table for Hive (a DVT source), validated
-- against the BigQuery target table of the same name, where the col_src_*
-- columns have their NOT NULL settings inverted.
DROP TABLE `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
( col_nn timestamp NOT NULL
, col_nullable timestamp
, col_src_nn_trg_n timestamp NOT NULL
, col_src_n_trg_nn timestamp
) COMMENT 'Nullable integration test table, Hive is assumed to be a DVT source (not target).';
8 changes: 8 additions & 0 deletions tests/resources/mysql_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,11 @@ INSERT INTO `pso_data_validator`.`dvt_core_types` VALUES
,12345678901234567890,1234567890123456789012345,123.33,123456.3,12345678.3
,'Hello DVT','C ','Hello DVT'
,'1970-01-03','1970-01-03 00:00:03','1970-01-03 03:00:03');

-- Nullability integration test table for MySQL (a DVT source), validated
-- against the BigQuery target table of the same name, where the col_src_*
-- columns have their NOT NULL settings inverted.
-- datetime(0): zero fractional-second precision to mirror BigQuery DATETIME.
DROP TABLE `pso_data_validator`.`dvt_null_not_null`;
CREATE TABLE `pso_data_validator`.`dvt_null_not_null`
( col_nn datetime(0) NOT NULL
, col_nullable datetime(0)
, col_src_nn_trg_n datetime(0) NOT NULL
, col_src_n_trg_nn datetime(0)
) COMMENT 'Nullable integration test table, MySQL is assumed to be a DVT source (not target).';
9 changes: 9 additions & 0 deletions tests/resources/oracle_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,12 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03'
,to_timestamp_tz('1970-01-03 00:00:03 -03:00','YYYY-MM-DD HH24:MI:SS TZH:TZM'));
COMMIT;

-- Nullability integration test table for Oracle (a DVT source), validated
-- against the BigQuery target table of the same name, where the col_src_*
-- columns have their NOT NULL settings inverted.
DROP TABLE pso_data_validator.dvt_null_not_null;
CREATE TABLE pso_data_validator.dvt_null_not_null
( col_nn TIMESTAMP(0) NOT NULL
, col_nullable TIMESTAMP(0)
, col_src_nn_trg_n TIMESTAMP(0) NOT NULL
, col_src_n_trg_nn TIMESTAMP(0)
);
COMMENT ON TABLE pso_data_validator.dvt_null_not_null IS 'Nullable integration test table, Oracle is assumed to be a DVT source (not target).';
11 changes: 10 additions & 1 deletion tests/resources/postgresql_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,13 @@ INSERT INTO public.test_generate_partitions (course_id, quarter_id, student_id,
('TRI001', 2, 9012, 3.5),
('TRI001', 3, 1234, 2.7),
('TRI001', 3, 5678, 3.5),
('TRI001', 3, 9012, 2.8);
('TRI001', 3, 9012, 2.8);

-- Nullability integration test table for PostgreSQL (a DVT source), validated
-- against the BigQuery target table of the same name, where the col_src_*
-- columns have their NOT NULL settings inverted.
DROP TABLE pso_data_validator.dvt_null_not_null;
CREATE TABLE pso_data_validator.dvt_null_not_null
( col_nn TIMESTAMP(0) NOT NULL
, col_nullable TIMESTAMP(0)
, col_src_nn_trg_n TIMESTAMP(0) NOT NULL
, col_src_n_trg_nn TIMESTAMP(0)
);
COMMENT ON TABLE pso_data_validator.dvt_null_not_null IS 'Nullable integration test table, PostgreSQL is assumed to be a DVT source (not target).';
11 changes: 10 additions & 1 deletion tests/resources/snowflake_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,13 @@ INSERT INTO PSO_DATA_VALIDATOR.PUBLIC.TEST_GENERATE_PARTITIONS (COURSE_ID, QUART
('TRI001', 2, 9012, 3.5),
('TRI001', 3, 1234, 2.7),
('TRI001', 3, 5678, 3.5),
('TRI001', 3, 9012, 2.8);
('TRI001', 3, 9012, 2.8);

-- Nullability integration test table for Snowflake (a DVT source), validated
-- against the BigQuery target table of the same name, where the col_src_*
-- columns have their NOT NULL settings inverted.
DROP TABLE PSO_DATA_VALIDATOR.PUBLIC.DVT_NULL_NOT_NULL;
CREATE TABLE PSO_DATA_VALIDATOR.PUBLIC.DVT_NULL_NOT_NULL
( col_nn TIMESTAMP(0) NOT NULL
, col_nullable TIMESTAMP(0)
, col_src_nn_trg_n TIMESTAMP(0) NOT NULL
, col_src_n_trg_nn TIMESTAMP(0)
);
-- Fixed copy/paste error: this is the Snowflake test-tables script, so the
-- table comment must name Snowflake, not Oracle.
COMMENT ON TABLE PSO_DATA_VALIDATOR.PUBLIC.DVT_NULL_NOT_NULL IS 'Nullable integration test table, Snowflake is assumed to be a DVT source (not target).';
10 changes: 9 additions & 1 deletion tests/resources/sqlserver_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -53,4 +53,12 @@ INSERT INTO pso_data_validator.dvt_core_types VALUES
,12345678901234567890,1234567890123456789012345,123.33,123456.3,12345678.3
,'Hello DVT','C ','Hello DVT'
,'1970-01-03','1970-01-03 00:00:03'
,cast('1970-01-03 00:00:03 -03:00' as datetimeoffset(3)));
,cast('1970-01-03 00:00:03 -03:00' as datetimeoffset(3)));

-- Nullability integration test table for SQL Server (a DVT source), validated
-- against the BigQuery target table of the same name, where the col_src_*
-- columns have their NOT NULL settings inverted.
-- NOTE(review): no table comment here — SQL Server has no COMMENT ON TABLE;
-- an extended property could be added if a description is wanted.
DROP TABLE pso_data_validator.dvt_null_not_null;
CREATE TABLE pso_data_validator.dvt_null_not_null
( col_nn datetime2(0) NOT NULL
, col_nullable datetime2(0)
, col_src_nn_trg_n datetime2(0) NOT NULL
, col_src_n_trg_nn datetime2(0)
);
10 changes: 10 additions & 0 deletions tests/resources/teradata_test_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ CREATE TABLE udf.dvt_core_types
, col_datetime TIMESTAMP(3)
, col_tstz TIMESTAMP(3) WITH TIME ZONE
);
COMMENT ON TABLE udf.dvt_core_types AS 'Core data types integration test table';

INSERT INTO udf.dvt_core_types VALUES
(1,1,1,1,1
Expand All @@ -50,3 +51,12 @@ INSERT INTO udf.dvt_core_types VALUES
,'Hello DVT','C ','Hello DVT'
,DATE'1970-01-03',TIMESTAMP'1970-01-03 00:00:03'
,CAST('1970-01-03 00:00:03.000-03:00' AS TIMESTAMP(3) WITH TIME ZONE));

-- Nullability integration test table for Teradata (a DVT source), validated
-- against the BigQuery target table of the same name, where the col_src_*
-- columns have their NOT NULL settings inverted.
DROP TABLE udf.dvt_null_not_null;
CREATE TABLE udf.dvt_null_not_null
( col_nn TIMESTAMP(0) NOT NULL
, col_nullable TIMESTAMP(0)
, col_src_nn_trg_n TIMESTAMP(0) NOT NULL
, col_src_n_trg_nn TIMESTAMP(0)
);
COMMENT ON TABLE udf.dvt_null_not_null AS 'Nullable integration test table, Teradata is assumed to be a DVT source (not target).';
35 changes: 35 additions & 0 deletions tests/system/data_sources/common_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def null_not_null_assertions(df):
    """Shared assertions for the null/not-null schema validation integration test.

    BigQuery acts as a fixed target whose NOT NULL/nullable settings partially
    mismatch the source table; every other engine is validated against it and
    this helper checks each column's validation status is as expected.

    Args:
        df: validation result DataFrame with "source_column_name" and
            "validation_status" columns, one row per validated column.

    Raises:
        AssertionError: on an unexpected column name or validation status.
    """
    # One result row per column of dvt_null_not_null: four rows in total.
    assert len(df) == 4
    # The first two columns agree between source and target; the col_src_*
    # pair deliberately mismatches and must be reported as failures.
    expected_status = {
        "col_nn": "success",
        "col_nullable": "success",
        "col_src_nn_trg_n": "fail",
        "col_src_n_trg_nn": "fail",
    }
    for column_name, status in zip(df["source_column_name"], df["validation_status"]):
        assert column_name in expected_status
        want = expected_status[column_name]
        assert status == want, f"Column: {column_name}, status: {status} != '{want}'"
30 changes: 30 additions & 0 deletions tests/system/data_sources/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from data_validation import __main__ as main
from data_validation import cli_tools, data_validation, consts
from data_validation.partition_builder import PartitionBuilder
from tests.system.data_sources.common_functions import null_not_null_assertions
from tests.system.data_sources.test_bigquery import BQ_CONN


Expand Down Expand Up @@ -161,6 +162,7 @@ def test_schema_validation_core_types_to_bigquery():
"-tc=bq-conn",
"-tbls=pso_data_validator.dvt_core_types",
"--filter-status=fail",
"--exclusion-columns=id",
(
# All Hive integrals go to BigQuery INT64.
"--allow-list=int8:int64,int16:int64,int32:int64,"
Expand All @@ -184,6 +186,34 @@ def test_schema_validation_core_types_to_bigquery():
assert len(df) == 0


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def disabled_test_schema_validation_not_null_vs_nullable():
    """
    Disabled (renamed so pytest skips it) because we don't currently pull
    nullable from Hive:
    https://github.com/GoogleCloudPlatform/professional-services-data-validator/issues/934
    Compares a source table with a BigQuery target and ensures we match/fail
    on not null/nullable correctly.
    """
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "schema",
            "-sc=hive-conn",
            "-tc=bq-conn",
            "-tbls=pso_data_validator.dvt_null_not_null=pso_data_validator.dvt_null_not_null",
        ]
    )
    # A single source=target table pair should yield exactly one config manager.
    config_managers = main.build_config_managers_from_args(args)
    assert len(config_managers) == 1
    config_manager = config_managers[0]
    validator = data_validation.DataValidation(config_manager.config, verbose=False)
    df = validator.execute()
    null_not_null_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
39 changes: 32 additions & 7 deletions tests/system/data_sources/test_mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from data_validation import __main__ as main
from data_validation import cli_tools, data_validation, consts, exceptions
from data_validation.partition_builder import PartitionBuilder
from tests.system.data_sources.common_functions import null_not_null_assertions
from tests.system.data_sources.test_bigquery import BQ_CONN


Expand Down Expand Up @@ -64,6 +65,13 @@
}


def mock_get_connection_config(*args):
    """Stand-in for StateManager.get_connection_config used by mock.patch.

    args[1] is the connection name; returns the module-level canned config
    for MySQL or BigQuery, or None for any unrecognised name.
    """
    connection_name = args[1]
    if connection_name == "bq-conn":
        return BQ_CONN
    if connection_name in ("mysql-conn", "mock-conn"):
        return CONN
    return None


def test_mysql_count_invalid_host():
try:
data_validator = data_validation.DataValidation(
Expand Down Expand Up @@ -428,13 +436,6 @@ def test_mysql_row():
pass


def mock_get_connection_config(*args):
if args[1] in ("mysql-conn", "mock-conn"):
return CONN
elif args[1] == "bq-conn":
return BQ_CONN


# Expected result from partitioning table on 3 keys
EXPECTED_PARTITION_FILTER = [
"course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))",
Expand Down Expand Up @@ -508,6 +509,30 @@ def test_schema_validation_core_types():
assert len(df) == 0


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_schema_validation_not_null_vs_nullable():
    """Compares a source table with a BigQuery target and ensures we match/fail
    on not null/nullable correctly."""
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "schema",
            "-sc=mysql-conn",
            "-tc=bq-conn",
            "-tbls=pso_data_validator.dvt_null_not_null=pso_data_validator.dvt_null_not_null",
        ]
    )
    # A single source=target table pair should yield exactly one config manager.
    config_managers = main.build_config_managers_from_args(args)
    assert len(config_managers) == 1
    config_manager = config_managers[0]
    validator = data_validation.DataValidation(config_manager.config, verbose=False)
    df = validator.execute()
    null_not_null_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
28 changes: 27 additions & 1 deletion tests/system/data_sources/test_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from data_validation import __main__ as main
from data_validation import cli_tools, data_validation, consts
from data_validation.partition_builder import PartitionBuilder
from tests.system.data_sources.common_functions import null_not_null_assertions
from tests.system.data_sources.test_bigquery import BQ_CONN


Expand Down Expand Up @@ -160,9 +161,10 @@ def test_schema_validation_core_types_to_bigquery():
"-tc=bq-conn",
"-tbls=pso_data_validator.dvt_core_types",
"--filter-status=fail",
"--exclusion-columns=id",
(
# Integral Oracle NUMBERS go to BigQuery INT64.
"--allow-list=!decimal(8,0):int64,decimal(2,0):int64,decimal(4,0):int64,decimal(9,0):int64,decimal(18,0):int64,"
"--allow-list=decimal(2,0):int64,decimal(4,0):int64,decimal(9,0):int64,decimal(18,0):int64,"
# Oracle NUMBERS that map to BigQuery NUMERIC.
"decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9),"
# Oracle NUMBERS that map to BigQuery BIGNUMERIC.
Expand All @@ -181,6 +183,30 @@ def test_schema_validation_core_types_to_bigquery():
assert len(df) == 0


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_schema_validation_not_null_vs_nullable():
    """Compares a source table with a BigQuery target and ensures we match/fail
    on not null/nullable correctly."""
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "schema",
            "-sc=ora-conn",
            "-tc=bq-conn",
            "-tbls=pso_data_validator.dvt_null_not_null=pso_data_validator.dvt_null_not_null",
        ]
    )
    # A single source=target table pair should yield exactly one config manager.
    config_managers = main.build_config_managers_from_args(args)
    assert len(config_managers) == 1
    config_manager = config_managers[0]
    validator = data_validation.DataValidation(config_manager.config, verbose=False)
    df = validator.execute()
    null_not_null_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
28 changes: 27 additions & 1 deletion tests/system/data_sources/test_postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from tests.system.data_sources.deploy_cloudsql.cloudsql_resource_manager import (
CloudSQLResourceManager,
)
from tests.system.data_sources.common_functions import null_not_null_assertions
from tests.system.data_sources.test_bigquery import BQ_CONN


Expand Down Expand Up @@ -560,10 +561,11 @@ def test_schema_validation_core_types_to_bigquery():
"-sc=pg-conn",
"-tc=bq-conn",
"-tbls=pso_data_validator.dvt_core_types",
"--exclusion-columns=id",
"--filter-status=fail",
(
# PostgreSQL integrals go to BigQuery INT64.
"--allow-list=int16:int64,int32:int64,!int32:int64,"
"--allow-list=int16:int64,int32:int64,"
# Oracle NUMBERS that map to BigQuery NUMERIC.
"decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9),"
# Oracle NUMBERS that map to BigQuery BIGNUMERIC.
Expand All @@ -582,6 +584,30 @@ def test_schema_validation_core_types_to_bigquery():
assert len(df) == 0


@mock.patch(
    "data_validation.state_manager.StateManager.get_connection_config",
    new=mock_get_connection_config,
)
def test_schema_validation_not_null_vs_nullable():
    """Compares a source table with a BigQuery target and ensures we match/fail
    on not null/nullable correctly."""
    parser = cli_tools.configure_arg_parser()
    args = parser.parse_args(
        [
            "validate",
            "schema",
            "-sc=pg-conn",
            "-tc=bq-conn",
            "-tbls=pso_data_validator.dvt_null_not_null=pso_data_validator.dvt_null_not_null",
        ]
    )
    # A single source=target table pair should yield exactly one config manager.
    config_managers = main.build_config_managers_from_args(args)
    assert len(config_managers) == 1
    config_manager = config_managers[0]
    validator = data_validation.DataValidation(config_manager.config, verbose=False)
    df = validator.execute()
    null_not_null_assertions(df)


@mock.patch(
"data_validation.state_manager.StateManager.get_connection_config",
new=mock_get_connection_config,
Expand Down
Loading

0 comments on commit 936744b

Please sign in to comment.