Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: issue 740 teradata strftime function #747

Merged
merged 2 commits into from
Feb 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 123 additions & 2 deletions tests/system/data_sources/test_teradata.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
}


TERADATA_CONFIG = {
TERADATA_COLUMN_CONFIG = {
# Specific Connection Config
consts.CONFIG_SOURCE_CONN: conn,
consts.CONFIG_TARGET_CONN: conn,
Expand Down Expand Up @@ -60,9 +60,130 @@
],
}

# Row-validation config: Sys_Calendar.CALENDAR is compared against itself
# (source and target share the same connection, schema and table).
# The calculated fields form a chain ordered by CONFIG_DEPTH (0 -> 4):
#   strftime / cast -> ifnull -> rstrip -> upper -> concat
# and the final "concat__all" alias is the single field that gets compared.
TERADATA_ROW_CONFIG = {
# Specific Connection Config
consts.CONFIG_SOURCE_CONN: conn,
consts.CONFIG_TARGET_CONN: conn,
# Validation Type
consts.CONFIG_TYPE: "Row",
# Configuration Required Depending on Validator Type
consts.CONFIG_SCHEMA_NAME: "Sys_Calendar",
consts.CONFIG_TABLE_NAME: "CALENDAR",
consts.CONFIG_TARGET_SCHEMA_NAME: "Sys_Calendar",
consts.CONFIG_TARGET_TABLE_NAME: "CALENDAR",
consts.CONFIG_THRESHOLD: 0.0,
consts.CONFIG_FORMAT: "table",
consts.CONFIG_FILTER_STATUS: None,
consts.CONFIG_FILTERS: [],
consts.CONFIG_USE_RANDOM_ROWS: False,
# Only the concatenated string built by the calculated fields is compared.
consts.CONFIG_COMPARISON_FIELDS: [
{
consts.CONFIG_SOURCE_COLUMN: "concat__all",
consts.CONFIG_TARGET_COLUMN: "concat__all",
consts.CONFIG_FIELD_ALIAS: "concat__all",
consts.CONFIG_CAST: None,
}
],
consts.CONFIG_CALCULATED_FIELDS: [
# Depth 0: calendar_date goes through a custom ibis expression
# (TimestampValue.strftime with format "%Y-%m-%d"); day_of_week is cast.
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["calendar_date"],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["calendar_date"],
consts.CONFIG_FIELD_ALIAS: "cast__calendar_date",
consts.CONFIG_TYPE: "custom",
consts.CONFIG_DEPTH: 0,
consts.CONFIG_CUSTOM_IBIS_EXPR: "ibis.expr.api.TimestampValue.strftime",
consts.CONFIG_CUSTOM_PARAMS: [{"format_str": "%Y-%m-%d"}],
},
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["day_of_week"],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["day_of_week"],
consts.CONFIG_FIELD_ALIAS: "cast__day_of_week",
consts.CONFIG_TYPE: "cast",
consts.CONFIG_DEPTH: 0,
},
# Depth 1: "ifnull" applied to each depth-0 alias.
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["cast__calendar_date"],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["cast__calendar_date"],
consts.CONFIG_FIELD_ALIAS: "ifnull__cast__calendar_date",
consts.CONFIG_TYPE: "ifnull",
consts.CONFIG_DEPTH: 1,
},
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["cast__day_of_week"],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["cast__day_of_week"],
consts.CONFIG_FIELD_ALIAS: "ifnull__cast__day_of_week",
consts.CONFIG_TYPE: "ifnull",
consts.CONFIG_DEPTH: 1,
},
# Depth 2: "rstrip" applied to each depth-1 alias.
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["ifnull__cast__calendar_date"],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["ifnull__cast__calendar_date"],
consts.CONFIG_FIELD_ALIAS: "rstrip__ifnull__cast__calendar_date",
consts.CONFIG_TYPE: "rstrip",
consts.CONFIG_DEPTH: 2,
},
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["ifnull__cast__day_of_week"],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["ifnull__cast__day_of_week"],
consts.CONFIG_FIELD_ALIAS: "rstrip__ifnull__cast__day_of_week",
consts.CONFIG_TYPE: "rstrip",
consts.CONFIG_DEPTH: 2,
},
# Depth 3: "upper" applied to each depth-2 alias.
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: [
"rstrip__ifnull__cast__calendar_date"
],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: [
"rstrip__ifnull__cast__calendar_date"
],
consts.CONFIG_FIELD_ALIAS: "upper__rstrip__ifnull__cast__calendar_date",
consts.CONFIG_TYPE: "upper",
consts.CONFIG_DEPTH: 3,
},
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: [
"rstrip__ifnull__cast__day_of_week"
],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: [
"rstrip__ifnull__cast__day_of_week"
],
consts.CONFIG_FIELD_ALIAS: "upper__rstrip__ifnull__cast__day_of_week",
consts.CONFIG_TYPE: "upper",
consts.CONFIG_DEPTH: 3,
},
# Depth 4: both depth-3 aliases are concatenated into "concat__all".
{
consts.CONFIG_CALCULATED_SOURCE_COLUMNS: [
"upper__rstrip__ifnull__cast__calendar_date",
"upper__rstrip__ifnull__cast__day_of_week",
],
consts.CONFIG_CALCULATED_TARGET_COLUMNS: [
"upper__rstrip__ifnull__cast__calendar_date",
"upper__rstrip__ifnull__cast__day_of_week",
],
consts.CONFIG_FIELD_ALIAS: "concat__all",
consts.CONFIG_TYPE: "concat",
consts.CONFIG_DEPTH: 4,
},
],
# Rows are keyed on calendar_date.
consts.CONFIG_PRIMARY_KEYS: [
{
consts.CONFIG_SOURCE_COLUMN: "calendar_date",
consts.CONFIG_TARGET_COLUMN: "calendar_date",
consts.CONFIG_FIELD_ALIAS: "calendar_date",
consts.CONFIG_CAST: None,
}
],
}


def test_count_validator():
    """Execute the column validation config and compare the first aggregate.

    The first result row must carry a positive source aggregate that is equal
    to the target aggregate.
    """
    # Build the validator once, from the renamed column config; the earlier
    # construction from TERADATA_CONFIG was immediately overwritten.
    validator = data_validation.DataValidation(TERADATA_COLUMN_CONFIG, verbose=True)
    df = validator.execute()
    assert int(df["source_agg_value"][0]) > 0
    assert df["source_agg_value"][0] == df["target_agg_value"][0]


def test_row_validator():
    """Execute the row validation config and expect the first row to succeed."""
    result = data_validation.DataValidation(
        TERADATA_ROW_CONFIG, verbose=True
    ).execute()
    first_status = result["validation_status"][0]
    assert first_status == "success"
162 changes: 135 additions & 27 deletions third_party/ibis/ibis_teradata/compiler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import datetime
from functools import partial
from io import StringIO
import itertools
import locale
import platform
import re
import string
import warnings

import numpy as np
import toolz
Expand All @@ -20,7 +26,6 @@
from .datatypes import ibis_type_to_teradata_type
from ibis.backends.base_sql import fixed_arity, literal, reduction, unary
from ibis.backends.base_sql.compiler import (
BaseExprTranslator,
BaseSelect,
BaseTableSetFormatter,
)
Expand Down Expand Up @@ -483,6 +488,134 @@ def _arbitrary(translator, expr):
return "ANY_VALUE({})".format(translator.translate(arg))


# Maps a Python strftime directive to the format element placed in the
# second argument of the generated TO_CHAR call.
# Teradata rules copied from ibis PostgreSQL compiler
_strftime_to_teradata_rules = {
'%a': 'Dy',
'%A': 'Day',
'%w': 'D', # 1-based day of week; listed in _strftime_blacklist, so it is rejected rather than emitted
'%d': 'DD', # day of month
'%-d': 'FMDD', # "-" means no leading zero in Python; FM is the counterpart here
'%b': 'Mon', # Sep
'%B': 'Month', # September
'%m': 'MM', # 01
'%-m': 'FMMM', # 1
'%y': 'YY', # 15
'%Y': 'YYYY', # 2015
'%H': 'HH24', # 09
'%-H': 'FMHH24', # 9
'%I': 'HH12', # 09
'%-I': 'FMHH12', # 9
'%p': 'AM', # AM or PM
'%M': 'MI', # zero padded minute
'%-M': 'FMMI', # Minute
'%S': 'SS', # zero padded second
'%-S': 'FMSS', # Second
'%f': 'FF6', # zero padded microsecond
'%z': 'TZR', # UTC offset -- NOTE(review): same element as %Z below; confirm this is intended
'%Z': 'TZR', # uppercase timezone name
'%j': 'DDD', # zero padded day of year
'%-j': 'FMDDD', # day of year
'%U': 'WW', # 1-based week of year; also listed in _strftime_blacklist
# No mapping is defined for '%W'.
}

# Locale-dependent directives take their expansion from the C locale tables.
# The whole list is built before update() runs, so a failure part-way through
# leaves the rules table untouched. An AttributeError from the ``locale``
# lookups is reported as a warning instead of failing the import.
try:
    _strftime_to_teradata_rules.update(
        [
            ('%c', locale.nl_langinfo(locale.D_T_FMT)),  # locale date and time
            ('%x', locale.nl_langinfo(locale.D_FMT)),  # locale date
            ('%X', locale.nl_langinfo(locale.T_FMT)),  # locale time
        ]
    )
except AttributeError:
    warnings.warn(
        (
            'locale specific date formats (%%c, %%x, %%X) are not yet implemented '
            'for %s'
        )
        % platform.system()
    )

# Tokenizer for strftime specs: splits a format string into directives,
# whitespace runs, single punctuation characters and plain words so that
# each piece can be translated into its (mostly equivalent) Teradata form.
_scanner = re.Scanner(
    [
        # double quotes need to be escaped
        ('"', lambda scanner, token: r'\"'),
        # every other token is handed back unchanged
        (
            '|'.join(
                '(?:{})'.format(alternative)
                for alternative in list(_strftime_to_teradata_rules)
                + [
                    # "%e" is in the C standard and Python actually
                    # generates this if your spec contains "%c" but we
                    # don't officially support it as a specifier so we
                    # need to special case it in the scanner
                    '%e',
                    r'\s+',
                    r'[{}]'.format(re.escape(string.punctuation)),
                    r'[^{}\s]+'.format(re.escape(string.punctuation)),
                ]
            ),
            lambda scanner, token: token,
        ),
    ]
)

# All values of the rules table. _reduce_tokens wraps a token in double
# quotes when it equals one of these, so literal text is not read as a
# format element.
_lexicon_values = frozenset(_strftime_to_teradata_rules.values())

# strftime directives for which _reduce_tokens raises
# UnsupportedOperationError instead of emitting a translation.
_strftime_blacklist = frozenset(['%w', '%U', '%c', '%x', '%X', '%e'])


def _reduce_tokens(tokens):
    """
    Reduce strftime format elements to an equivalent string of Teradata elements.

    Ideally this would be a full reduction like for Oracle and PostgreSQL but
    the "arg" parameter received by _strftime is not equivalent and ends up as
    a string rather than a column name. This follows the pattern set by other
    functions in this module and builds a string function spec rather than
    using sa.func.to_char.

    ``tokens`` is an iterable of string tokens (as produced by
    ``_scanner.scan``). The return value is the concatenation of the
    translated tokens.

    Raises UnsupportedOperationError when a token is one of the directives in
    ``_strftime_blacklist``; the message names the offending directive.
    """
    # TODO It would be nice to bring this into line with Oracle/PostgreSQL strftime function processing.

    # Directives that have a rule and are not explicitly rejected.
    supported_directives = (
        frozenset(_strftime_to_teradata_rules) - _strftime_blacklist
    )

    translated = []
    for token in tokens:
        if token in supported_directives:
            # A regular directive such as %A or %d: emit its Teradata element.
            translated.append(_strftime_to_teradata_rules[token])
        elif token in _lexicon_values:
            # Literal text that looks like a format element (e.g. DD):
            # surround it with double quotes so it is kept as text.
            translated.append('"{}"'.format(token))
        elif token in _strftime_blacklist:
            # Tell the caller which directive cannot be translated.
            raise UnsupportedOperationError(
                'strftime directive {!r} is not supported by the Teradata '
                'backend'.format(token)
            )
        else:
            # Uninteresting text (punctuation, whitespace, other words).
            translated.append(token)
    return ''.join(translated)


def _strftime(translator, expr):
    """Compile a strftime operation into a ``TO_CHAR(<arg>, <format>)`` string.

    Both operands of the operation are run through ``translator.translate``.
    The translated format is then tokenized with ``_scanner`` and rewritten
    by ``_reduce_tokens``; whatever the scanner could not consume is ignored.
    """
    operation = expr.op()
    arg, format_string = operation.args
    # Translate the format first, then the argument.
    sql_format = translator.translate(format_string)
    sql_arg = translator.translate(arg)
    tokens, _unscanned = _scanner.scan(sql_format)
    return "TO_CHAR({}, {})".format(sql_arg, _reduce_tokens(tokens))


_date_units = {
"Y": "YEAR",
"Q": "QUARTER",
Expand Down Expand Up @@ -537,13 +670,6 @@ def _formatter(translator, expr):
return _formatter


# Maps an ibis temporal datatype class to the suffix that compiles_strftime
# appends to "FORMAT_" when building the SQL function name.
STRFTIME_FORMAT_FUNCTIONS = {
dt.Date: "DATE",
dt.Time: "TIME",
dt.Timestamp: "TIMESTAMP",
}


_operation_registry.update(
{
ops.ExtractYear: _extract_field("year"),
Expand All @@ -559,6 +685,7 @@ def _formatter(translator, expr):
ops.StringJoin: _string_join,
ops.StringAscii: _string_ascii,
ops.StringFind: _string_find,
ops.Strftime: _strftime,
ops.StrRight: _string_right,
ops.Repeat: fixed_arity("REPEAT", 2),
ops.RegexSearch: _regex_search,
Expand Down Expand Up @@ -628,25 +755,6 @@ def teradata_compiles_divide(t, e):
return "IEEE_DIVIDE({}, {})".format(*map(t.translate, e.op().args))


@compiles(ops.Strftime)
def compiles_strftime(translator, expr):
    """Compile a strftime operation into a ``FORMAT_<TYPE>(...)`` call string.

    The function-name suffix is looked up in ``STRFTIME_FORMAT_FUNCTIONS`` by
    the class of the argument's type. Timestamp arguments get a third
    argument: the repr of the type's timezone, or of "UTC" when it is None.
    """
    # NOTE(review): ops.Strftime is also mapped to _strftime in
    # _operation_registry -- confirm which registration is intended.
    arg, format_string = expr.op().args
    arg_type = arg.type()
    func_suffix = STRFTIME_FORMAT_FUNCTIONS[type(arg_type)]
    sql_format = translator.translate(format_string)
    sql_arg = translator.translate(arg)

    # Non-timestamp types take only the format and the value.
    if not isinstance(arg_type, dt.Timestamp):
        return "FORMAT_{}({}, {})".format(func_suffix, sql_format, sql_arg)

    timezone = arg_type.timezone
    if timezone is None:
        timezone = "UTC"
    return "FORMAT_{}({}, {}, {!r})".format(
        func_suffix, sql_format, sql_arg, timezone
    )


@compiles(ops.StringToTimestamp)
def compiles_string_to_timestamp(translator, expr):
arg, format_string, timezone_arg = expr.op().args
Expand Down