From ee7ae9a168bc7dfe3798d77ab238552c36670864 Mon Sep 17 00:00:00 2001 From: nj1973 Date: Wed, 9 Aug 2023 09:30:09 +0100 Subject: [PATCH] fix: Issues with validate column for time zoned timestamps (#930) * fix: Include AT TIME ZONE for Oracle time zoned timestamp * tests: Enable column validation col_tstz testing for PostgreSQL, Teradata and SQL Server * tests: Add cross engine column/row validation testing for MySQL * chore: Update comments in tests * tests: Add col_tstz to PostgreSQL column validation --sum option --- tests/system/data_sources/test_mysql.py | 88 +++++++++++++++++--- tests/system/data_sources/test_oracle.py | 8 +- tests/system/data_sources/test_postgres.py | 7 +- tests/system/data_sources/test_sql_server.py | 7 +- tests/system/data_sources/test_teradata.py | 6 +- third_party/ibis/ibis_oracle/registry.py | 6 ++ 6 files changed, 96 insertions(+), 26 deletions(-) diff --git a/tests/system/data_sources/test_mysql.py b/tests/system/data_sources/test_mysql.py index bb61b8b43..43ac96a0e 100644 --- a/tests/system/data_sources/test_mysql.py +++ b/tests/system/data_sources/test_mysql.py @@ -18,6 +18,8 @@ from data_validation import __main__ as main from data_validation import cli_tools, data_validation, consts, exceptions from data_validation.partition_builder import PartitionBuilder +from tests.system.data_sources.test_bigquery import BQ_CONN + MYSQL_HOST = os.getenv("MYSQL_HOST", "localhost") MYSQL_USER = os.getenv("MYSQL_USER", "dvt") @@ -426,6 +428,13 @@ def test_mysql_row(): pass +def mock_get_connection_config(*args): + if args[1] in ("mysql-conn", "mock-conn"): + return CONN + elif args[1] == "bq-conn": + return BQ_CONN + + # Expected result from partitioning table on 3 keys EXPECTED_PARTITION_FILTER = [ "course_id < 'ALG001' OR course_id = 'ALG001' AND (quarter_id < 3 OR quarter_id = 3 AND (student_id < 1234))", @@ -439,9 +448,9 @@ def test_mysql_row(): @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", - return_value=CONN, + new=mock_get_connection_config, ) -def test_mysql_generate_table_partitions(mock_conn): +def test_mysql_generate_table_partitions(): """Test generate table partitions on mysql The unit tests, specifically test_add_partition_filters_to_config and test_store_yaml_partitions_local check that yaml configurations are created and saved in local storage. Partitions can only be created with @@ -476,9 +485,9 @@ def test_mysql_generate_table_partitions(mock_conn): @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", - return_value=CONN, + new=mock_get_connection_config, ) -def test_schema_validation_core_types(mock_conn): +def test_schema_validation_core_types(): parser = cli_tools.configure_arg_parser() args = parser.parse_args( [ @@ -501,9 +510,9 @@ def test_schema_validation_core_types(mock_conn): @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", - return_value=CONN, + new=mock_get_connection_config, ) -def test_column_validation_core_types(mock_conn): +def test_column_validation_core_types(): parser = cli_tools.configure_arg_parser() args = parser.parse_args( [ @@ -529,9 +538,40 @@ def test_column_validation_core_types(mock_conn): @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", - return_value=CONN, + new=mock_get_connection_config, +) +def test_column_validation_core_types_to_bigquery(): + parser = cli_tools.configure_arg_parser() + # TODO Change --sum string below to include col_datetime and col_tstz when issue-762 is complete. + # TODO Change --sum, --min and --max options to include col_char_2 when issue-842 is complete. + # We've excluded col_float32 because BigQuery does not have an exact same type and float32/64 are lossy and cannot be compared. + args = parser.parse_args( + [ + "validate", + "column", + "-sc=mysql-conn", + "-tc=bq-conn", + "-tbls=pso_data_validator.dvt_core_types", + "--filter-status=fail", + "--sum=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date", + "--min=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime,col_tstz", + "--max=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime,col_tstz", + ] + ) + config_managers = main.build_config_managers_from_args(args) + assert len(config_managers) == 1 + config_manager = config_managers[0] + validator = data_validation.DataValidation(config_manager.config, verbose=False) + df = validator.execute() + # With filter on failures the data frame should be empty + assert len(df) == 0 + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, ) -def test_row_validation_core_types(mock_conn): +def test_row_validation_core_types(): parser = cli_tools.configure_arg_parser() args = parser.parse_args( [ @@ -556,9 +596,37 @@ def test_row_validation_core_types(mock_conn): @mock.patch( "data_validation.state_manager.StateManager.get_connection_config", - return_value=CONN, + new=mock_get_connection_config, +) +def test_row_validation_core_types_to_bigquery(): + # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete. + parser = cli_tools.configure_arg_parser() + args = parser.parse_args( + [ + "validate", + "row", + "-sc=mysql-conn", + "-tc=bq-conn", + "-tbls=pso_data_validator.dvt_core_types", + "--primary-keys=id", + "--filter-status=fail", + "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz", + ] + ) + config_managers = main.build_config_managers_from_args(args) + assert len(config_managers) == 1 + config_manager = config_managers[0] + validator = data_validation.DataValidation(config_manager.config, verbose=False) + df = validator.execute() + # With filter on failures the data frame should be empty + assert len(df) == 0 + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + new=mock_get_connection_config, ) -def test_custom_query_validation_core_types(mock_conn): +def test_custom_query_validation_core_types(): """MySQL to MySQL dvt_core_types custom-query validation""" parser = cli_tools.configure_arg_parser() args = parser.parse_args( diff --git a/tests/system/data_sources/test_oracle.py b/tests/system/data_sources/test_oracle.py index 321d8198e..377c2fb2e 100644 --- a/tests/system/data_sources/test_oracle.py +++ b/tests/system/data_sources/test_oracle.py @@ -218,7 +218,6 @@ def test_column_validation_core_types(): def test_column_validation_core_types_to_bigquery(): parser = cli_tools.configure_arg_parser() # TODO Change --sum string below to include col_datetime and col_tstz when issue-762 is complete. - # TODO Change --min/max strings below to include col_tstz when issue-917 is complete. # We've excluded col_float32 because BigQuery does not have an exact same type and float32/64 are lossy and cannot be compared. args = parser.parse_args( [ @@ -229,8 +228,8 @@ def test_column_validation_core_types_to_bigquery(): "-tbls=pso_data_validator.dvt_core_types", "--filter-status=fail", "--sum=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_char_2,col_string,col_date", - "--min=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_datetime", - "--max=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_datetime", + "--min=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz", + "--max=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz", ] ) config_managers = main.build_config_managers_from_args(args) @@ -275,7 +274,6 @@ def test_row_validation_core_types(): new=mock_get_connection_config, ) def test_row_validation_core_types_to_bigquery(): - # TODO Change --hash string below to include col_tstz when issue-917 is complete. # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete. parser = cli_tools.configure_arg_parser() args = parser.parse_args( @@ -287,7 +285,7 @@ def test_row_validation_core_types_to_bigquery(): "-tbls=pso_data_validator.dvt_core_types", "--primary-keys=id", "--filter-status=fail", - "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime", + "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz", ] ) config_managers = main.build_config_managers_from_args(args) diff --git a/tests/system/data_sources/test_postgres.py b/tests/system/data_sources/test_postgres.py index 7ec8a9d49..be40813d7 100644 --- a/tests/system/data_sources/test_postgres.py +++ b/tests/system/data_sources/test_postgres.py @@ -616,7 +616,6 @@ def test_column_validation_core_types(): ) def test_column_validation_core_types_to_bigquery(): parser = cli_tools.configure_arg_parser() - # TODO Change --min/max strings below to include col_tstz when issue-917 is complete. # We've excluded col_float32 because BigQuery does not have an exact same type and float32/64 are lossy and cannot be compared. # TODO Change --sum and --max options to include col_char_2 when issue-842 is complete. args = parser.parse_args( @@ -627,9 +626,9 @@ def test_column_validation_core_types_to_bigquery(): "-tc=bq-conn", "-tbls=pso_data_validator.dvt_core_types", "--filter-status=fail", - "--sum=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime", - "--min=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime", - "--max=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime", + "--sum=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime,col_tstz", + "--min=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime,col_tstz", + "--max=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float64,col_varchar_30,col_string,col_date,col_datetime,col_tstz", ] ) config_managers = main.build_config_managers_from_args(args) diff --git a/tests/system/data_sources/test_sql_server.py b/tests/system/data_sources/test_sql_server.py index 6771e7d69..bedba7925 100644 --- a/tests/system/data_sources/test_sql_server.py +++ b/tests/system/data_sources/test_sql_server.py @@ -344,7 +344,6 @@ def test_column_validation_core_types(): ) def test_column_validation_core_types_to_bigquery(): parser = cli_tools.configure_arg_parser() - # TODO Change --sum/min/max strings below to include col_tstz when issue-917 is complete. # We've excluded col_float32 because BigQuery does not have an exact same type and float32/64 are lossy and cannot be compared. # We've excluded col_char_2 since the data stored in MSSQL has a trailing space which is counted in the LEN() args = parser.parse_args( @@ -356,8 +355,8 @@ def test_column_validation_core_types_to_bigquery(): "-tbls=pso_data_validator.dvt_core_types", "--filter-status=fail", "--sum=col_int8,col_int16,col_int32,col_int64,col_float64,col_date,col_datetime,col_dec_10_2,col_dec_20,col_dec_38,col_varchar_30,col_char_2,col_string", - "--min=col_int8,col_int16,col_int32,col_int64,col_float64,col_date,col_datetime,col_dec_10_2,col_dec_20,col_dec_38,col_varchar_30,col_char_2,col_string", - "--max=col_int8,col_int16,col_int32,col_int64,col_float64,col_date,col_datetime,col_dec_10_2,col_dec_20,col_dec_38,col_varchar_30,col_char_2,col_string", + "--min=col_int8,col_int16,col_int32,col_int64,col_float64,col_date,col_datetime,col_tstz,col_dec_10_2,col_dec_20,col_dec_38,col_varchar_30,col_char_2,col_string", + "--max=col_int8,col_int16,col_int32,col_int64,col_float64,col_date,col_datetime,col_tstz,col_dec_10_2,col_dec_20,col_dec_38,col_varchar_30,col_char_2,col_string", ] ) config_managers = main.build_config_managers_from_args(args) @@ -404,7 +403,7 @@ def test_row_validation_core_types(): def test_row_validation_core_types_to_bigquery(): parser = cli_tools.configure_arg_parser() # TODO When issue-834 is complete add col_string to --hash string below. - # TODO Change --hash string below to include col_tstz when issue-917 is complete. + # TODO Change --hash string below to include col_tstz when issue-929 is complete. # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete. args = parser.parse_args( [ diff --git a/tests/system/data_sources/test_teradata.py b/tests/system/data_sources/test_teradata.py index 7128859ea..8d1949e10 100644 --- a/tests/system/data_sources/test_teradata.py +++ b/tests/system/data_sources/test_teradata.py @@ -304,8 +304,8 @@ def test_column_validation_core_types_to_bigquery(): "-tbls=udf.dvt_core_types=pso_data_validator.dvt_core_types", "--filter-status=fail", "--sum=col_int8,col_int16,col_int32,col_int64,col_float32,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_dec_20,col_dec_38,col_dec_10_2", - "--min=col_int8,col_int16,col_int32,col_int64,col_float32,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_dec_20,col_dec_38,col_dec_10_2", - "--max=col_int8,col_int16,col_int32,col_int64,col_float32,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_dec_20,col_dec_38,col_dec_10_2", + "--min=col_int8,col_int16,col_int32,col_int64,col_float32,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_tstz,col_dec_20,col_dec_38,col_dec_10_2", + "--max=col_int8,col_int16,col_int32,col_int64,col_float32,col_float64,col_varchar_30,col_char_2,col_string,col_date,col_tstz,col_dec_20,col_dec_38,col_dec_10_2", ] ) config_managers = main.build_config_managers_from_args(args) @@ -402,7 +402,7 @@ def test_row_validation_core_types_to_bigquery(): parser = cli_tools.configure_arg_parser() # Excluded col_string because LONG VARCHAR column causes exception regardless of column contents: # [Error 3798] A column or character expression is larger than the max size. - # TODO Change --hash option to include col_tstz when issue-917 is complete. + # TODO Change --hash option to include col_tstz when issue-929 is complete. # TODO Change --hash option to include col_float32,col_float64 when issue-841 is complete. args = parser.parse_args( [ diff --git a/third_party/ibis/ibis_oracle/registry.py b/third_party/ibis/ibis_oracle/registry.py index 6325c4b4e..aaa410ab0 100644 --- a/third_party/ibis/ibis_oracle/registry.py +++ b/third_party/ibis/ibis_oracle/registry.py @@ -329,6 +329,12 @@ def _table_column(t, op): out_expr = get_col(sa_table, op) out_expr.quote = t._quote_column_names + if op.output_dtype.is_timestamp(): + timezone = op.output_dtype.timezone + if timezone is not None: + # Using literal_column on Oracle because the time zone string cannot be a bind. + out_expr = sa.literal_column(f"{out_expr.name} AT TIME ZONE '{timezone}'").label(op.name) + # If the column does not originate from the table set in the current SELECT # context, we should format as a subquery if t.permit_subquery and ctx.is_foreign_expr(table):