From 15bfc4c85e43894f03288cf10aa7888a95342a05 Mon Sep 17 00:00:00 2001
From: Neil Johnson <neiljohnson@google.com>
Date: Mon, 22 Jan 2024 16:26:34 +0000
Subject: [PATCH] fix: Better casts to string for binary floats/doubles (#1078)

* fix: Better casts to string for binary floats/doubles
---
 tests/system/data_sources/test_hive.py       |  4 +--
 tests/system/data_sources/test_oracle.py     |  4 +--
 tests/system/data_sources/test_sql_server.py |  3 +-
 tests/system/data_sources/test_teradata.py   |  3 +-
 third_party/ibis/ibis_addon/operations.py    | 34 +++++++++++---------
 third_party/ibis/ibis_oracle/registry.py     | 27 ++++++----------
 third_party/ibis/ibis_teradata/registry.py   |  8 +++++
 7 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/tests/system/data_sources/test_hive.py b/tests/system/data_sources/test_hive.py
index efa5e99f1..f5a1343cc 100644
--- a/tests/system/data_sources/test_hive.py
+++ b/tests/system/data_sources/test_hive.py
@@ -312,7 +312,7 @@ def disabled_test_row_validation_core_types():
 )
 def test_row_validation_core_types_to_bigquery():
     parser = cli_tools.configure_arg_parser()
-    # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
+    # col_float64 is excluded below because there is no way to control the format when casting to string.
     args = parser.parse_args(
         [
             "validate",
@@ -322,7 +322,7 @@ def test_row_validation_core_types_to_bigquery():
             "-tbls=pso_data_validator.dvt_core_types",
             "--primary-keys=id",
             "--filter-status=fail",
-            "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz",
+            "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz",
         ]
     )
     config_managers = main.build_config_managers_from_args(args)
diff --git a/tests/system/data_sources/test_oracle.py b/tests/system/data_sources/test_oracle.py
index ca92ed1c9..7410fd916 100644
--- a/tests/system/data_sources/test_oracle.py
+++ b/tests/system/data_sources/test_oracle.py
@@ -394,7 +394,7 @@ def test_row_validation_core_types():
 )
 def test_row_validation_core_types_to_bigquery():
     """Oracle to BigQuery dvt_core_types row validation"""
-    # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
+    # Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
     parser = cli_tools.configure_arg_parser()
     args = parser.parse_args(
         [
@@ -423,9 +423,9 @@ def test_row_validation_core_types_to_bigquery():
 )
 def test_row_validation_oracle_to_postgres():
     # TODO Change hash_cols below to include col_tstz when issue-706 is complete.
-    # TODO Change hash_cols below to include col_float32,col_float64 when issue-841 is complete.
     # TODO col_raw is blocked by issue-773 (is it even reasonable to expect binary columns to work here?)
     # TODO Change hash_cols below to include col_nvarchar_30,col_nchar_2 when issue-772 is complete.
+    # Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
     # Excluded CLOB/NCLOB/BLOB columns because lob values cannot be concatenated
     hash_cols = ",".join(
         [
diff --git a/tests/system/data_sources/test_sql_server.py b/tests/system/data_sources/test_sql_server.py
index 05254ac2c..9b1f813f3 100644
--- a/tests/system/data_sources/test_sql_server.py
+++ b/tests/system/data_sources/test_sql_server.py
@@ -425,7 +425,6 @@ def test_row_validation_core_types_to_bigquery():
     parser = cli_tools.configure_arg_parser()
     # TODO When issue-834 is complete add col_string to --hash string below.
     # TODO Change --hash string below to include col_tstz when issue-929 is complete.
-    # TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
     args = parser.parse_args(
         [
             "validate",
@@ -435,7 +434,7 @@ def test_row_validation_core_types_to_bigquery():
             "-tbls=pso_data_validator.dvt_core_types",
             "--primary-keys=id",
             "--filter-status=fail",
-            "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime",
+            "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime",
         ]
     )
     config_managers = main.build_config_managers_from_args(args)
diff --git a/tests/system/data_sources/test_teradata.py b/tests/system/data_sources/test_teradata.py
index 0c88405bf..b6cdf4f77 100644
--- a/tests/system/data_sources/test_teradata.py
+++ b/tests/system/data_sources/test_teradata.py
@@ -424,7 +424,6 @@ def test_row_validation_core_types_to_bigquery():
     # Excluded col_string because LONG VARCHAR column causes exception regardless of column contents:
     # [Error 3798] A column or character expression is larger than the max size.
     # TODO Change --hash option to include col_tstz when issue-929 is complete.
-    # TODO Change --hash option to include col_float32,col_float64 when issue-841 is complete.
     args = parser.parse_args(
         [
             "validate",
@@ -434,7 +433,7 @@ def test_row_validation_core_types_to_bigquery():
             "-tbls=udf.dvt_core_types=pso_data_validator.dvt_core_types",
             "--primary-keys=id",
             "--filter-status=fail",
-            "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime",
+            "--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime",
         ]
     )
     config_managers = main.build_config_managers_from_args(args)
diff --git a/third_party/ibis/ibis_addon/operations.py b/third_party/ibis/ibis_addon/operations.py
index 826888e1e..354c7cdc0 100644
--- a/third_party/ibis/ibis_addon/operations.py
+++ b/third_party/ibis/ibis_addon/operations.py
@@ -28,7 +28,10 @@
 import ibis.expr.operations as ops
 import ibis.expr.rules as rlz
 import sqlalchemy as sa
-from ibis.backends.base.sql.alchemy.registry import fixed_arity as sa_fixed_arity
+from ibis.backends.base.sql.alchemy.registry import (
+    fixed_arity as sa_fixed_arity,
+    _cast as sa_fixed_cast,
+)
 from ibis.backends.base.sql.alchemy.translator import AlchemyExprTranslator
 from ibis.backends.base.sql.compiler.translator import ExprTranslator
 from ibis.backends.base.sql.registry import fixed_arity
@@ -324,8 +327,6 @@ def sa_cast_postgres(t, op):
     typ = op.to
     arg_dtype = arg.output_dtype
 
-    sa_arg = t.translate(arg)
-
     # Specialize going from numeric(p,s>0) to string
     if (
         arg_dtype.is_decimal()
@@ -333,6 +334,7 @@ def sa_cast_postgres(t, op):
         and arg_dtype.scale > 0
         and typ.is_string()
     ):
+        sa_arg = t.translate(arg)
         # When casting a number to string PostgreSQL includes the full scale, e.g.:
         #   SELECT CAST(CAST(100 AS DECIMAL(5,2)) AS VARCHAR(10));
         #     100.00
@@ -347,22 +349,23 @@ def sa_cast_postgres(t, op):
         )
         return sa.func.rtrim(sa.func.to_char(sa_arg, fmt), ".")
 
-    # specialize going from an integer type to a timestamp
-    if arg_dtype.is_integer() and typ.is_timestamp():
-        return t.integer_to_timestamp(sa_arg, tz=typ.timezone)
+    # Follow the original Ibis code path.
+    return sa_fixed_cast(t, op)
 
-    if arg_dtype.is_binary() and typ.is_string():
-        return sa.func.encode(sa_arg, "escape")
 
-    if typ.is_binary():
-        #  decode yields a column of memoryview which is annoying to deal with
-        # in pandas. CAST(expr AS BYTEA) is correct and returns byte strings.
-        return sa.cast(sa_arg, sa.LargeBinary())
+def sa_cast_mssql(t, op):
+    arg = op.arg
+    typ = op.to
+    arg_dtype = arg.output_dtype
 
-    if typ.is_json() and not t.native_json_type:
-        return sa_arg
+    # Specialize going from a binary float type to a string.
+    if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string():
+        sa_arg = t.translate(arg)
+        # This prevents output in scientific notation, at least for my tests it did.
+        return sa.func.format(sa_arg, "G")
 
-    return sa.cast(sa_arg, t.get_sqla_type(typ))
+    # Follow the original Ibis code path.
+    return sa_fixed_cast(t, op)
 
 
 def _sa_string_join(t, op):
@@ -458,6 +461,7 @@ def _bigquery_field_to_ibis_dtype(field):
 MsSqlExprTranslator._registry[StringJoin] = _sa_string_join
 MsSqlExprTranslator._registry[RandomScalar] = sa_format_new_id
 MsSqlExprTranslator._registry[Strftime] = strftime_mssql
+MsSqlExprTranslator._registry[Cast] = sa_cast_mssql
 MsSqlExprTranslator._registry[BinaryLength] = sa_format_binary_length_mssql
 
 MySQLExprTranslator._registry[RawSQL] = sa_format_raw_sql
diff --git a/third_party/ibis/ibis_oracle/registry.py b/third_party/ibis/ibis_oracle/registry.py
index f31f4aa7f..6ff4b61ff 100644
--- a/third_party/ibis/ibis_oracle/registry.py
+++ b/third_party/ibis/ibis_oracle/registry.py
@@ -23,6 +23,7 @@
 
 import sqlalchemy as sa
 
+from ibis.backends.base.sql.alchemy.registry import _cast as sa_fixed_cast
 import ibis.common.exceptions as com
 import ibis.expr.operations as ops
 
@@ -103,24 +104,15 @@ def _cast(t, op):
     typ = op.to
     arg_dtype = arg.output_dtype
 
-    sa_arg = t.translate(arg)
-
-    # specialize going from an integer type to a timestamp
-    if arg_dtype.is_integer() and typ.is_timestamp():
-        return t.integer_to_timestamp(sa_arg, tz=typ.timezone)
-
-    if arg_dtype.is_binary() and typ.is_string():
-        return sa.func.encode(sa_arg, "escape")
+    # Specialize going from a binary float type to a string.
+    if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string():
+        sa_arg = t.translate(arg)
+        # This prevents output in scientific notation but we still have
+        # problems with the lossy nature of BINARY_FLOAT/DOUBLE.
+        return sa.func.to_char(sa_arg, "TM9")
 
-    if typ.is_binary():
-        #  decode yields a column of memoryview which is annoying to deal with
-        # in pandas. CAST(expr AS BYTEA) is correct and returns byte strings.
-        return sa.cast(sa_arg, sa.LargeBinary())
-
-    if typ.is_json() and not t.native_json_type:
-        return sa_arg
-
-    return sa.cast(sa_arg, t.get_sqla_type(typ))
+    # Follow the original Ibis code path.
+    return sa_fixed_cast(t, op)
 
 
 def _typeof(t, op):
@@ -157,7 +149,6 @@ def _string_agg(t, op):
 }
 
 try:
-
     _strftime_to_oracle_rules.update(
         {
             "%c": locale.nl_langinfo(locale.D_T_FMT),  # locale date and time
diff --git a/third_party/ibis/ibis_teradata/registry.py b/third_party/ibis/ibis_teradata/registry.py
index b77d25c43..0eaa4780e 100644
--- a/third_party/ibis/ibis_teradata/registry.py
+++ b/third_party/ibis/ibis_teradata/registry.py
@@ -54,6 +54,14 @@ def _cast(t, op):
     arg, target_type = op.args
     arg_formatted = t.translate(arg)
     input_dtype = arg.output_dtype
+
+    # Specialize going from a binary float type to a string.
+    # Trying to avoid scientific notation.
+    if (input_dtype.is_float32() or input_dtype.is_float64()) and op.to.is_string():
+        # Cannot return sa.func.to_char because t (TeradataExprTranslator) returns
+        # a string containing the column and not a SQLAlchemy Column object.
+        return f"TO_CHAR({arg_formatted},'TM9')"
+
     return teradata_cast(arg_formatted, input_dtype, target_type)