Skip to content

Commit

Permalink
fix: Better casts to string for binary floats/doubles (#1078)
Browse files Browse the repository at this point in the history
* fix: Better casts to string for binary floats/doubles
  • Loading branch information
nj1973 committed Jan 22, 2024
1 parent 58f8fcb commit 15bfc4c
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 41 deletions.
4 changes: 2 additions & 2 deletions tests/system/data_sources/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def disabled_test_row_validation_core_types():
)
def test_row_validation_core_types_to_bigquery():
parser = cli_tools.configure_arg_parser()
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
# col_float64 is excluded below because there is no way to control the format when casting to string.
args = parser.parse_args(
[
"validate",
Expand All @@ -322,7 +322,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
4 changes: 2 additions & 2 deletions tests/system/data_sources/test_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ def test_row_validation_core_types():
)
def test_row_validation_core_types_to_bigquery():
"""Oracle to BigQuery dvt_core_types row validation"""
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
# Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
parser = cli_tools.configure_arg_parser()
args = parser.parse_args(
[
Expand Down Expand Up @@ -423,9 +423,9 @@ def test_row_validation_core_types_to_bigquery():
)
def test_row_validation_oracle_to_postgres():
# TODO Change hash_cols below to include col_tstz when issue-706 is complete.
# TODO Change hash_cols below to include col_float32,col_float64 when issue-841 is complete.
# TODO col_raw is blocked by issue-773 (is it even reasonable to expect binary columns to work here?)
# TODO Change hash_cols below to include col_nvarchar_30,col_nchar_2 when issue-772 is complete.
# Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
    # Excluded CLOB/NCLOB/BLOB columns because LOB values cannot be concatenated
hash_cols = ",".join(
[
Expand Down
3 changes: 1 addition & 2 deletions tests/system/data_sources/test_sql_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,6 @@ def test_row_validation_core_types_to_bigquery():
parser = cli_tools.configure_arg_parser()
# TODO When issue-834 is complete add col_string to --hash string below.
# TODO Change --hash string below to include col_tstz when issue-929 is complete.
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
args = parser.parse_args(
[
"validate",
Expand All @@ -435,7 +434,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
3 changes: 1 addition & 2 deletions tests/system/data_sources/test_teradata.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ def test_row_validation_core_types_to_bigquery():
# Excluded col_string because LONG VARCHAR column causes exception regardless of column contents:
# [Error 3798] A column or character expression is larger than the max size.
# TODO Change --hash option to include col_tstz when issue-929 is complete.
# TODO Change --hash option to include col_float32,col_float64 when issue-841 is complete.
args = parser.parse_args(
[
"validate",
Expand All @@ -434,7 +433,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=udf.dvt_core_types=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
34 changes: 19 additions & 15 deletions third_party/ibis/ibis_addon/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
import ibis.expr.operations as ops
import ibis.expr.rules as rlz
import sqlalchemy as sa
from ibis.backends.base.sql.alchemy.registry import fixed_arity as sa_fixed_arity
from ibis.backends.base.sql.alchemy.registry import (
fixed_arity as sa_fixed_arity,
_cast as sa_fixed_cast,
)
from ibis.backends.base.sql.alchemy.translator import AlchemyExprTranslator
from ibis.backends.base.sql.compiler.translator import ExprTranslator
from ibis.backends.base.sql.registry import fixed_arity
Expand Down Expand Up @@ -324,15 +327,14 @@ def sa_cast_postgres(t, op):
typ = op.to
arg_dtype = arg.output_dtype

sa_arg = t.translate(arg)

# Specialize going from numeric(p,s>0) to string
if (
arg_dtype.is_decimal()
and arg_dtype.scale
and arg_dtype.scale > 0
and typ.is_string()
):
sa_arg = t.translate(arg)
# When casting a number to string PostgreSQL includes the full scale, e.g.:
# SELECT CAST(CAST(100 AS DECIMAL(5,2)) AS VARCHAR(10));
# 100.00
Expand All @@ -347,22 +349,23 @@ def sa_cast_postgres(t, op):
)
return sa.func.rtrim(sa.func.to_char(sa_arg, fmt), ".")

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg, tz=typ.timezone)
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)

if arg_dtype.is_binary() and typ.is_string():
return sa.func.encode(sa_arg, "escape")

if typ.is_binary():
# decode yields a column of memoryview which is annoying to deal with
# in pandas. CAST(expr AS BYTEA) is correct and returns byte strings.
return sa.cast(sa_arg, sa.LargeBinary())
def sa_cast_mssql(t, op):
arg = op.arg
typ = op.to
arg_dtype = arg.output_dtype

if typ.is_json() and not t.native_json_type:
return sa_arg
# Specialize going from a binary float type to a string.
if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string():
sa_arg = t.translate(arg)
        # This prevents output in scientific notation, at least it did in my tests.
return sa.func.format(sa_arg, "G")

return sa.cast(sa_arg, t.get_sqla_type(typ))
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)


def _sa_string_join(t, op):
Expand Down Expand Up @@ -458,6 +461,7 @@ def _bigquery_field_to_ibis_dtype(field):
MsSqlExprTranslator._registry[StringJoin] = _sa_string_join
MsSqlExprTranslator._registry[RandomScalar] = sa_format_new_id
MsSqlExprTranslator._registry[Strftime] = strftime_mssql
MsSqlExprTranslator._registry[Cast] = sa_cast_mssql
MsSqlExprTranslator._registry[BinaryLength] = sa_format_binary_length_mssql

MySQLExprTranslator._registry[RawSQL] = sa_format_raw_sql
Expand Down
27 changes: 9 additions & 18 deletions third_party/ibis/ibis_oracle/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import sqlalchemy as sa

from ibis.backends.base.sql.alchemy.registry import _cast as sa_fixed_cast
import ibis.common.exceptions as com
import ibis.expr.operations as ops

Expand Down Expand Up @@ -103,24 +104,15 @@ def _cast(t, op):
typ = op.to
arg_dtype = arg.output_dtype

sa_arg = t.translate(arg)

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg, tz=typ.timezone)

if arg_dtype.is_binary() and typ.is_string():
return sa.func.encode(sa_arg, "escape")
# Specialize going from a binary float type to a string.
if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string():
sa_arg = t.translate(arg)
# This prevents output in scientific notation but we still have
# problems with the lossy nature of BINARY_FLOAT/DOUBLE.
return sa.func.to_char(sa_arg, "TM9")

if typ.is_binary():
# decode yields a column of memoryview which is annoying to deal with
# in pandas. CAST(expr AS BYTEA) is correct and returns byte strings.
return sa.cast(sa_arg, sa.LargeBinary())

if typ.is_json() and not t.native_json_type:
return sa_arg

return sa.cast(sa_arg, t.get_sqla_type(typ))
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)


def _typeof(t, op):
Expand Down Expand Up @@ -157,7 +149,6 @@ def _string_agg(t, op):
}

try:

_strftime_to_oracle_rules.update(
{
"%c": locale.nl_langinfo(locale.D_T_FMT), # locale date and time
Expand Down
8 changes: 8 additions & 0 deletions third_party/ibis/ibis_teradata/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ def _cast(t, op):
arg, target_type = op.args
arg_formatted = t.translate(arg)
input_dtype = arg.output_dtype

# Specialize going from a binary float type to a string.
# Trying to avoid scientific notation.
if (input_dtype.is_float32() or input_dtype.is_float64()) and op.to.is_string():
# Cannot return sa.func.to_char because t (TeradataExprTranslator) returns
# a string containing the column and not a SQLAlchemy Column object.
return f"TO_CHAR({arg_formatted},'TM9')"

return teradata_cast(arg_formatted, input_dtype, target_type)


Expand Down

0 comments on commit 15bfc4c

Please sign in to comment.