Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Better casts to string for binary floats/doubles #1078

Merged
merged 2 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tests/system/data_sources/test_hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ def disabled_test_row_validation_core_types():
)
def test_row_validation_core_types_to_bigquery():
parser = cli_tools.configure_arg_parser()
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
# col_float64 is excluded below because there is no way to control the format when casting to string.
args = parser.parse_args(
[
"validate",
Expand All @@ -322,7 +322,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_varchar_30,col_char_2,col_string,col_date,col_datetime,col_tstz",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
4 changes: 2 additions & 2 deletions tests/system/data_sources/test_oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ def test_row_validation_core_types():
)
def test_row_validation_core_types_to_bigquery():
"""Oracle to BigQuery dvt_core_types row validation"""
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
# Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
parser = cli_tools.configure_arg_parser()
args = parser.parse_args(
[
Expand Down Expand Up @@ -423,9 +423,9 @@ def test_row_validation_core_types_to_bigquery():
)
def test_row_validation_oracle_to_postgres():
# TODO Change hash_cols below to include col_tstz when issue-706 is complete.
# TODO Change hash_cols below to include col_float32,col_float64 when issue-841 is complete.
# TODO col_raw is blocked by issue-773 (is it even reasonable to expect binary columns to work here?)
# TODO Change hash_cols below to include col_nvarchar_30,col_nchar_2 when issue-772 is complete.
# Excluded col_float32,col_float64 due to the lossy nature of BINARY_FLOAT/DOUBLE.
# Excluded CLOB/NCLOB/BLOB columns because LOB values cannot be concatenated.
hash_cols = ",".join(
[
Expand Down
3 changes: 1 addition & 2 deletions tests/system/data_sources/test_sql_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,6 @@ def test_row_validation_core_types_to_bigquery():
parser = cli_tools.configure_arg_parser()
# TODO When issue-834 is complete add col_string to --hash string below.
# TODO Change --hash string below to include col_tstz when issue-929 is complete.
# TODO Change --hash string below to include col_float32,col_float64 when issue-841 is complete.
args = parser.parse_args(
[
"validate",
Expand All @@ -435,7 +434,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
3 changes: 1 addition & 2 deletions tests/system/data_sources/test_teradata.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ def test_row_validation_core_types_to_bigquery():
# Excluded col_string because LONG VARCHAR column causes exception regardless of column contents:
# [Error 3798] A column or character expression is larger than the max size.
# TODO Change --hash option to include col_tstz when issue-929 is complete.
# TODO Change --hash option to include col_float32,col_float64 when issue-841 is complete.
args = parser.parse_args(
[
"validate",
Expand All @@ -434,7 +433,7 @@ def test_row_validation_core_types_to_bigquery():
"-tbls=udf.dvt_core_types=pso_data_validator.dvt_core_types",
"--primary-keys=id",
"--filter-status=fail",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_varchar_30,col_char_2,col_date,col_datetime",
"--hash=col_int8,col_int16,col_int32,col_int64,col_dec_20,col_dec_38,col_dec_10_2,col_float32,col_float64,col_varchar_30,col_char_2,col_date,col_datetime",
]
)
config_managers = main.build_config_managers_from_args(args)
Expand Down
34 changes: 19 additions & 15 deletions third_party/ibis/ibis_addon/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@
import ibis.expr.operations as ops
import ibis.expr.rules as rlz
import sqlalchemy as sa
from ibis.backends.base.sql.alchemy.registry import fixed_arity as sa_fixed_arity
from ibis.backends.base.sql.alchemy.registry import (
fixed_arity as sa_fixed_arity,
_cast as sa_fixed_cast,
)
from ibis.backends.base.sql.alchemy.translator import AlchemyExprTranslator
from ibis.backends.base.sql.compiler.translator import ExprTranslator
from ibis.backends.base.sql.registry import fixed_arity
Expand Down Expand Up @@ -324,15 +327,14 @@ def sa_cast_postgres(t, op):
typ = op.to
arg_dtype = arg.output_dtype

sa_arg = t.translate(arg)

# Specialize going from numeric(p,s>0) to string
if (
arg_dtype.is_decimal()
and arg_dtype.scale
and arg_dtype.scale > 0
and typ.is_string()
):
sa_arg = t.translate(arg)
# When casting a number to string PostgreSQL includes the full scale, e.g.:
# SELECT CAST(CAST(100 AS DECIMAL(5,2)) AS VARCHAR(10));
# 100.00
Expand All @@ -347,22 +349,23 @@ def sa_cast_postgres(t, op):
)
return sa.func.rtrim(sa.func.to_char(sa_arg, fmt), ".")

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg, tz=typ.timezone)
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)

if arg_dtype.is_binary() and typ.is_string():
return sa.func.encode(sa_arg, "escape")

if typ.is_binary():
# decode yields a column of memoryview which is annoying to deal with
# in pandas. CAST(expr AS BYTEA) is correct and returns byte strings.
return sa.cast(sa_arg, sa.LargeBinary())
def sa_cast_mssql(t, op):
arg = op.arg
typ = op.to
arg_dtype = arg.output_dtype

if typ.is_json() and not t.native_json_type:
return sa_arg
# Specialize going from a binary float type to a string.
if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string():
sa_arg = t.translate(arg)
# This prevents output in scientific notation, at least it did in my tests.
return sa.func.format(sa_arg, "G")

return sa.cast(sa_arg, t.get_sqla_type(typ))
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)


def _sa_string_join(t, op):
Expand Down Expand Up @@ -458,6 +461,7 @@ def _bigquery_field_to_ibis_dtype(field):
MsSqlExprTranslator._registry[StringJoin] = _sa_string_join
MsSqlExprTranslator._registry[RandomScalar] = sa_format_new_id
MsSqlExprTranslator._registry[Strftime] = strftime_mssql
MsSqlExprTranslator._registry[Cast] = sa_cast_mssql
MsSqlExprTranslator._registry[BinaryLength] = sa_format_binary_length_mssql

MySQLExprTranslator._registry[RawSQL] = sa_format_raw_sql
Expand Down
27 changes: 9 additions & 18 deletions third_party/ibis/ibis_oracle/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import sqlalchemy as sa

from ibis.backends.base.sql.alchemy.registry import _cast as sa_fixed_cast
import ibis.common.exceptions as com
import ibis.expr.operations as ops

Expand Down Expand Up @@ -103,24 +104,15 @@ def _cast(t, op):
typ = op.to
arg_dtype = arg.output_dtype

sa_arg = t.translate(arg)

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg, tz=typ.timezone)

if arg_dtype.is_binary() and typ.is_string():
return sa.func.encode(sa_arg, "escape")
# Specialize going from a binary float type to a string.
if (arg_dtype.is_float32() or arg_dtype.is_float64()) and typ.is_string():
sa_arg = t.translate(arg)
# This prevents output in scientific notation but we still have
# problems with the lossy nature of BINARY_FLOAT/DOUBLE.
return sa.func.to_char(sa_arg, "TM9")

if typ.is_binary():
# decode yields a column of memoryview which is annoying to deal with
# in pandas. CAST(expr AS BYTEA) is correct and returns byte strings.
return sa.cast(sa_arg, sa.LargeBinary())

if typ.is_json() and not t.native_json_type:
return sa_arg

return sa.cast(sa_arg, t.get_sqla_type(typ))
# Follow the original Ibis code path.
return sa_fixed_cast(t, op)


def _typeof(t, op):
Expand Down Expand Up @@ -157,7 +149,6 @@ def _string_agg(t, op):
}

try:

_strftime_to_oracle_rules.update(
{
"%c": locale.nl_langinfo(locale.D_T_FMT), # locale date and time
Expand Down
8 changes: 8 additions & 0 deletions third_party/ibis/ibis_teradata/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ def _cast(t, op):
arg, target_type = op.args
arg_formatted = t.translate(arg)
input_dtype = arg.output_dtype

# Specialize going from a binary float type to a string.
# Trying to avoid scientific notation.
if (input_dtype.is_float32() or input_dtype.is_float64()) and op.to.is_string():
# Cannot return sa.func.to_char because t (TeradataExprTranslator) returns
# a string containing the column and not a SQLAlchemy Column object.
return f"TO_CHAR({arg_formatted},'TM9')"

return teradata_cast(arg_formatted, input_dtype, target_type)


Expand Down