Skip to content

Commit

Permalink
fix: Better casts to string for binary floats/doubles (#1078)
Browse files Browse the repository at this point in the history
* fix: Better casts to string for binary floats/doubles
  • Loading branch information
nj1973 committed Jan 22, 2024
1 parent 58f8fcb commit 15bfc4c
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 41 deletions.
4 changes: 2 additions & 2 deletions tests/system/data_sources/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def disabled_test_row_validation_core_types():
)
def test_row_validation_core_types_to_bigquery():
parser = cli_tools.configure_arg_parser()
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
# col_float64 is excluded below because there is no way to control the format when casting to string.
args = parser.parse_args(
[
"validate",
Expand All @@ -322,7 +322,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
4 changes: 2 additions & 2 deletions tests/system/data_sources/test_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ def test_row_validation_core_types():
)
def test_row_validation_core_types_to_bigquery():
"""Oracle to BigQuery dvt_core_types row validation"""
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
# Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
parser = cli_tools.configure_arg_parser()
args = parser.parse_args(
[
Expand Down Expand Up @@ -423,9 +423,9 @@ def test_row_validation_core_types_to_bigquery():
)
def test_row_validation_oracle_to_postgres():
# TODO Change hash_cols below to include col_tstz when issue-706 is complete.
# TODO Change hash_cols below to include col_float32,col_float64 when issue-841 is complete.
# TODO col_raw is blocked by issue-773 (is it even reasonable to expect binary columns to work here?)
# TODO Change hash_cols below to include col_nvarchar_30,col_nchar_2 when issue-772 is complete.
# Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
    # Excluded CLOB/NCLOB/BLOB columns because LOB values cannot be concatenated
hash_cols = ",".join(
[
Expand Down
3 changes: 1 addition & 2 deletions tests/system/data_sources/test_sql_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,6 @@ def test_row_validation_core_types_to_bigquery():
parser = cli_tools.configure_arg_parser()
# TODO When issue-834 is complete add col_string to --hash string below.
# TODO Change --hash string below to include col_tstz when issue-929 is complete.
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
args = parser.parse_args(
[
"validate",
Expand All @@ -435,7 +434,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
3 changes: 1 addition & 2 deletions tests/system/data_sources/test_teradata.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ def test_row_validation_core_types_to_bigquery():
# Excluded col_string because LONG VARCHAR column causes exception regardless of column contents:
# [Error 3798] A column or character expression is larger than the max size.
# TODO Change --hash option to include col_tstz when issue-929 is complete.
# TODO Change --hash option to include col_float32,col_float64 when issue-841 is complete.
args = parser.parse_args(
[
"validate",
Expand All @@ -434,7 +433,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=udf.dvt_core_types=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
34 changes: 19 additions & 15 deletions third_party/ibis/ibis_addon/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
import ibis.expr.operations as ops
import ibis.expr.rules as rlz
import sqlalchemy as sa
from ibis.backends.base.sql.alchemy.registry import fixed_arity as sa_fixed_arity
from ibis.backends.base.sql.alchemy.registry import (
fixed_arity as sa_fixed_arity,
_cast as sa_fixed_cast,
)
from ibis.backends.base.sql.alchemy.translator import AlchemyExprTranslator
from ibis.backends.base.sql.compiler.translator import ExprTranslator
from ibis.backends.base.sql.registry import fixed_arity
Expand Down Expand Up @@ -324,15 +327,14 @@ def sa_cast_postgres(t, op):
typ = op.to
arg_dtype = arg.output_dtype

sa_arg = t.translate(arg)

# Specialize going from numeric(p,s>0) to string
if (
arg_dtype.is_decimal()
and arg_dtype.scale
and arg_dtype.scale > 0
and typ.is_string()
):
sa_arg = t.translate(arg)
# When casting a number to string PostgreSQL includes the full scale, e.g.:
# SELECT CAST(CAST(100 AS DECIMAL(5,2)) AS VARCHAR(10));
# 100.00
Expand All @@ -347,22 +349,23 @@ def sa_cast_postgres(t, op):
)
return sa.func.rtrim(sa.func.to_char(sa_arg, fmt), ".")

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg, tz=typ.timezone)
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)

if arg_dtype.is_binary() and typ.is_string():
return sa.func.encode(sa_arg, "escape")

if typ.is_binary():
# decode yields a column of memoryview which is annoying to deal with
# in pandas. CAST(expr AS BYTEA) is correct and returns byte strings.
return sa.cast(sa_arg, sa.LargeBinary())
def sa_cast_mssql(t, op):
arg = op.arg
typ = op.to
arg_dtype = arg.output_dtype

if typ.is_json() and not t.native_json_type:
return sa_arg
# Specialize going from a binary float type to a string.
if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string():
sa_arg = t.translate(arg)
        # This prevents output in scientific notation, at least it did in my tests.
return sa.func.format(sa_arg, "G")

return sa.cast(sa_arg, t.get_sqla_type(typ))
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)


def _sa_string_join(t, op):
Expand Down Expand Up @@ -458,6 +461,7 @@ def _bigquery_field_to_ibis_dtype(field):
MsSqlExprTranslator._registry[StringJoin] = _sa_string_join
MsSqlExprTranslator._registry[RandomScalar] = sa_format_new_id
MsSqlExprTranslator._registry[Strftime] = strftime_mssql
MsSqlExprTranslator._registry[Cast] = sa_cast_mssql
MsSqlExprTranslator._registry[BinaryLength] = sa_format_binary_length_mssql

MySQLExprTranslator._registry[RawSQL] = sa_format_raw_sql
Expand Down
27 changes: 9 additions & 18 deletions third_party/ibis/ibis_oracle/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import sqlalchemy as sa

from ibis.backends.base.sql.alchemy.registry import _cast as sa_fixed_cast
import ibis.common.exceptions as com
import ibis.expr.operations as ops

Expand Down Expand Up @@ -103,24 +104,15 @@ def _cast(t, op):
typ = op.to
arg_dtype = arg.output_dtype

sa_arg = t.translate(arg)

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg, tz=typ.timezone)

if arg_dtype.is_binary() and typ.is_string():
return sa.func.encode(sa_arg, "escape")
# Specialize going from a binary float type to a string.
if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string():
sa_arg = t.translate(arg)
# This prevents output in scientific notation but we still have
# problems with the lossy nature of BINARY_FLOAT/DOUBLE.
return sa.func.to_char(sa_arg, "TM9")

if typ.is_binary():
# decode yields a column of memoryview which is annoying to deal with
# in pandas. CAST(expr AS BYTEA) is correct and returns byte strings.
return sa.cast(sa_arg, sa.LargeBinary())

if typ.is_json() and not t.native_json_type:
return sa_arg

return sa.cast(sa_arg, t.get_sqla_type(typ))
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)


def _typeof(t, op):
Expand Down Expand Up @@ -157,7 +149,6 @@ def _string_agg(t, op):
}

try:

_strftime_to_oracle_rules.update(
{
"%c": locale.nl_langinfo(locale.D_T_FMT), # locale date and time
Expand Down
8 changes: 8 additions & 0 deletions third_party/ibis/ibis_teradata/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ def _cast(t, op):
arg, target_type = op.args
arg_formatted = t.translate(arg)
input_dtype = arg.output_dtype

# Specialize going from a binary float type to a string.
# Trying to avoid scientific notation.
if (input_dtype.is_float32() or input_dtype.is_float64()) and op.to.is_string():
# Cannot return sa.func.to_char because t (TeradataExprTranslator) returns
# a string containing the column and not a SQLAlchemy Column object.
return f"TO_CHAR({arg_formatted},'TM9')"

return teradata_cast(arg_formatted, input_dtype, target_type)


Expand Down

0 comments on commit 15bfc4c

Please sign in to comment.