From 15bfc4c85e43894f03288cf10aa7888a95342a05 Mon Sep 17 00:00:00 2001 From: Neil Johnson Date: Mon, 22 Jan 2024 16:26:34 +0000 Subject: [PATCH] fix: Better casts to string for binary floats/doubles (#1078) * fix: Better casts to string for binary floats/doubles --- tests/system/data_sources/test_hive.py | 4 +-- tests/system/data_sources/test_oracle.py | 4 +-- tests/system/data_sources/test_sql_server.py | 3 +- tests/system/data_sources/test_teradata.py | 3 +- third_party/ibis/ibis_addon/operations.py | 34 +++++++++++--------- third_party/ibis/ibis_oracle/registry.py | 27 ++++++---------- third_party/ibis/ibis_teradata/registry.py | 8 +++++ 7 files changed, 42 insertions(+), 41 deletions(-) diff --git a/tests/system/data_sources/test_hive.py b/tests/system/data_sources/test_hive.py index efa5e99f1..f5a1343cc 100644 --- a/tests/system/data_sources/test_hive.py +++ b/tests/system/data_sources/test_hive.py @@ -312,7 +312,7 @@ def disabled_test_row_validation_core_types(): ) def test_row_validation_core_types_to_bigquery(): parser = cli_tools.configure_arg_parser() - # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete. + # col_float64 is excluded below because there is no way to control the format when casting to string. args = parser.parse_args( [ "validate", @@ -322,7 +322,7 @@ def test_row_validation_core_types_to_bigquery(): "-tbls=pso_data_validator.dvt_core_types", "--primary-keys=id", "--filter-status=fail", - "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz", + "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz", ] ) config_managers = main.build_config_managers_from_args(args) diff --git a/tests/system/data_sources/test_oracle.py b/tests/system/data_sources/test_oracle.py index ca92ed1c9..7410fd916 100644 --- a/tests/system/data_sources/test_oracle.py +++ b/tests/system/data_sources/test_oracle.py @@ -394,7 +394,7 @@ def test_row_validation_core_types(): ) def test_row_validation_core_types_to_bigquery(): """Oracle to BigQuery dvt_core_types row validation""" - # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete. + # Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE. parser = cli_tools.configure_arg_parser() args = parser.parse_args( [ @@ -423,9 +423,9 @@ def test_row_validation_core_types_to_bigquery(): ) def test_row_validation_oracle_to_postgres(): # TODO Change hash_cols below to include col_tstz when issue-706 is complete. - # TODO Change hash_cols below to include col_float32,col_float64 when issue-841 is complete. # TODO col_raw is blocked by issue-773 (is it even reasonable to expect binary columns to work here?) # TODO Change hash_cols below to include col_nvarchar_30,col_nchar_2 when issue-772 is complete. + # Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE. # Excluded CLOB/NCLOB/BLOB columns because lob values cannot be concatenated hash_cols = ",".join( [ diff --git a/tests/system/data_sources/test_sql_server.py b/tests/system/data_sources/test_sql_server.py index 05254ac2c..9b1f813f3 100644 --- a/tests/system/data_sources/test_sql_server.py +++ b/tests/system/data_sources/test_sql_server.py @@ -425,7 +425,6 @@ def test_row_validation_core_types_to_bigquery(): parser = cli_tools.configure_arg_parser() # TODO When issue-834 is complete add col_string to --hash string below. # TODO Change --hash string below to include col_tstz when issue-929 is complete. - # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete. args = parser.parse_args( [ "validate", @@ -435,7 +434,7 @@ def test_row_validation_core_types_to_bigquery(): "-tbls=pso_data_validator.dvt_core_types", "--primary-keys=id", "--filter-status=fail", - "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime", + "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime", ] ) config_managers = main.build_config_managers_from_args(args) diff --git a/tests/system/data_sources/test_teradata.py b/tests/system/data_sources/test_teradata.py index 0c88405bf..b6cdf4f77 100644 --- a/tests/system/data_sources/test_teradata.py +++ b/tests/system/data_sources/test_teradata.py @@ -424,7 +424,6 @@ def test_row_validation_core_types_to_bigquery(): # Excluded col_string because LONG VARCHAR column causes exception regardless of column contents: # [Error 3798] A column or character expression is larger than the max size. # TODO Change --hash option to include col_tstz when issue-929 is complete. - # TODO Change --hash option to include col_float32,col_float64 when issue-841 is complete. args = parser.parse_args( [ "validate", @@ -434,7 +433,7 @@ def test_row_validation_core_types_to_bigquery(): "-tbls=udf.dvt_core_types=pso_data_validator.dvt_core_types", "--primary-keys=id", "--filter-status=fail", - "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime", + "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime", ] ) config_managers = main.build_config_managers_from_args(args) diff --git a/third_party/ibis/ibis_addon/operations.py b/third_party/ibis/ibis_addon/operations.py index 826888e1e..354c7cdc0 100644 --- a/third_party/ibis/ibis_addon/operations.py +++ b/third_party/ibis/ibis_addon/operations.py @@ -28,7 +28,10 @@ import ibis.expr.operations as ops import ibis.expr.rules as rlz import sqlalchemy as sa -from ibis.backends.base.sql.alchemy.registry import fixed_arity as sa_fixed_arity +from ibis.backends.base.sql.alchemy.registry import ( + fixed_arity as sa_fixed_arity, + _cast as sa_fixed_cast, +) from ibis.backends.base.sql.alchemy.translator import AlchemyExprTranslator from ibis.backends.base.sql.compiler.translator import ExprTranslator from ibis.backends.base.sql.registry import fixed_arity @@ -324,8 +327,6 @@ def sa_cast_postgres(t, op): typ = op.to arg_dtype = arg.output_dtype - sa_arg = t.translate(arg) - # Specialize going from numeric(p,s>0) to string if ( arg_dtype.is_decimal() @@ -333,6 +334,7 @@ def sa_cast_postgres(t, op): and arg_dtype.scale > 0 and typ.is_string() ): + sa_arg = t.translate(arg) # When casting a number to string PostgreSQL includes the full scale, e.g.: # SELECT CAST(CAST(100 AS DECIMAL(5,2)) AS VARCHAR(10)); # 100.00 @@ -347,22 +349,23 @@ def sa_cast_postgres(t, op): ) return sa.func.rtrim(sa.func.to_char(sa_arg, fmt), ".") - # specialize going from an integer type to a timestamp - if arg_dtype.is_integer() and typ.is_timestamp(): - return t.integer_to_timestamp(sa_arg, tz=typ.timezone) + # Follow the original Ibis code path. + return sa_fixed_cast(t, op) - if arg_dtype.is_binary() and typ.is_string(): - return sa.func.encode(sa_arg, "escape") - if typ.is_binary(): - # decode yields a column of memoryview which is annoying to deal with - # in pandas. CAST(expr AS BYTEA) is correct and returns byte strings. - return sa.cast(sa_arg, sa.LargeBinary()) +def sa_cast_mssql(t, op): + arg = op.arg + typ = op.to + arg_dtype = arg.output_dtype - if typ.is_json() and not t.native_json_type: - return sa_arg + # Specialize going from a binary float type to a string. + if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string(): + sa_arg = t.translate(arg) + # This prevents output in scientific notation, at least for my tests it did. + return sa.func.format(sa_arg, "G") - return sa.cast(sa_arg, t.get_sqla_type(typ)) + # Follow the original Ibis code path. + return sa_fixed_cast(t, op) def _sa_string_join(t, op): @@ -458,6 +461,7 @@ def _bigquery_field_to_ibis_dtype(field): MsSqlExprTranslator._registry[StringJoin] = _sa_string_join MsSqlExprTranslator._registry[RandomScalar] = sa_format_new_id MsSqlExprTranslator._registry[Strftime] = strftime_mssql +MsSqlExprTranslator._registry[Cast] = sa_cast_mssql MsSqlExprTranslator._registry[BinaryLength] = sa_format_binary_length_mssql MySQLExprTranslator._registry[RawSQL] = sa_format_raw_sql diff --git a/third_party/ibis/ibis_oracle/registry.py b/third_party/ibis/ibis_oracle/registry.py index f31f4aa7f..6ff4b61ff 100644 --- a/third_party/ibis/ibis_oracle/registry.py +++ b/third_party/ibis/ibis_oracle/registry.py @@ -23,6 +23,7 @@ import sqlalchemy as sa +from ibis.backends.base.sql.alchemy.registry import _cast as sa_fixed_cast import ibis.common.exceptions as com import ibis.expr.operations as ops @@ -103,24 +104,15 @@ def _cast(t, op): typ = op.to arg_dtype = arg.output_dtype - sa_arg = t.translate(arg) - - # specialize going from an integer type to a timestamp - if arg_dtype.is_integer() and typ.is_timestamp(): - return t.integer_to_timestamp(sa_arg, tz=typ.timezone) - - if arg_dtype.is_binary() and typ.is_string(): - return sa.func.encode(sa_arg, "escape") + # Specialize going from a binary float type to a string. + if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string(): + sa_arg = t.translate(arg) + # This prevents output in scientific notation but we still have + # problems with the lossy nature of BINARY_FLOAT/DOUBLE. + return sa.func.to_char(sa_arg, "TM9") - if typ.is_binary(): - # decode yields a column of memoryview which is annoying to deal with - # in pandas. CAST(expr AS BYTEA) is correct and returns byte strings. - return sa.cast(sa_arg, sa.LargeBinary()) - - if typ.is_json() and not t.native_json_type: - return sa_arg - - return sa.cast(sa_arg, t.get_sqla_type(typ)) + # Follow the original Ibis code path. + return sa_fixed_cast(t, op) def _typeof(t, op): @@ -157,7 +149,6 @@ def _string_agg(t, op): } try: - _strftime_to_oracle_rules.update( { "%c": locale.nl_langinfo(locale.D_T_FMT), # locale date and time diff --git a/third_party/ibis/ibis_teradata/registry.py b/third_party/ibis/ibis_teradata/registry.py index b77d25c43..0eaa4780e 100644 --- a/third_party/ibis/ibis_teradata/registry.py +++ b/third_party/ibis/ibis_teradata/registry.py @@ -54,6 +54,14 @@ def _cast(t, op): arg, target_type = op.args arg_formatted = t.translate(arg) input_dtype = arg.output_dtype + + # Specialize going from a binary float type to a string. + # Trying to avoid scientific notation. + if (input_dtype.is_float32() or input_dtype.is_float64()) and op.to.is_string(): + # Cannot return sa.func.to_char because t (TeradataExprTranslator) returns + # a string containing the column and not a SQLAlchemy Column object. + return f"TO_CHAR({arg_formatted},'TM9')" + return teradata_cast(arg_formatted, input_dtype, target_type)