diff --git a/tests/system/data_sources/test_teradata.py b/tests/system/data_sources/test_teradata.py index 1de23f432..674995081 100644 --- a/tests/system/data_sources/test_teradata.py +++ b/tests/system/data_sources/test_teradata.py @@ -30,7 +30,7 @@ } -TERADATA_CONFIG = { +TERADATA_COLUMN_CONFIG = { # Specific Connection Config consts.CONFIG_SOURCE_CONN: conn, consts.CONFIG_TARGET_CONN: conn, @@ -60,9 +60,130 @@ ], } +TERADATA_ROW_CONFIG = { + # Specific Connection Config + consts.CONFIG_SOURCE_CONN: conn, + consts.CONFIG_TARGET_CONN: conn, + # Validation Type + consts.CONFIG_TYPE: "Row", + # Configuration Required Depending on Validator Type + consts.CONFIG_SCHEMA_NAME: "Sys_Calendar", + consts.CONFIG_TABLE_NAME: "CALENDAR", + consts.CONFIG_TARGET_SCHEMA_NAME: "Sys_Calendar", + consts.CONFIG_TARGET_TABLE_NAME: "CALENDAR", + consts.CONFIG_THRESHOLD: 0.0, + consts.CONFIG_FORMAT: "table", + consts.CONFIG_FILTER_STATUS: None, + consts.CONFIG_FILTERS: [], + consts.CONFIG_USE_RANDOM_ROWS: False, + consts.CONFIG_COMPARISON_FIELDS: [ + { + consts.CONFIG_SOURCE_COLUMN: "concat__all", + consts.CONFIG_TARGET_COLUMN: "concat__all", + consts.CONFIG_FIELD_ALIAS: "concat__all", + consts.CONFIG_CAST: None, + } + ], + consts.CONFIG_CALCULATED_FIELDS: [ + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["calendar_date"], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["calendar_date"], + consts.CONFIG_FIELD_ALIAS: "cast__calendar_date", + consts.CONFIG_TYPE: "custom", + consts.CONFIG_DEPTH: 0, + consts.CONFIG_CUSTOM_IBIS_EXPR: "ibis.expr.api.TimestampValue.strftime", + consts.CONFIG_CUSTOM_PARAMS: [{"format_str": "%Y-%m-%d"}], + }, + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["day_of_week"], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["day_of_week"], + consts.CONFIG_FIELD_ALIAS: "cast__day_of_week", + consts.CONFIG_TYPE: "cast", + consts.CONFIG_DEPTH: 0, + }, + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["cast__calendar_date"], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["cast__calendar_date"], + consts.CONFIG_FIELD_ALIAS: "ifnull__cast__calendar_date", + consts.CONFIG_TYPE: "ifnull", + consts.CONFIG_DEPTH: 1, + }, + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["cast__day_of_week"], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["cast__day_of_week"], + consts.CONFIG_FIELD_ALIAS: "ifnull__cast__day_of_week", + consts.CONFIG_TYPE: "ifnull", + consts.CONFIG_DEPTH: 1, + }, + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["ifnull__cast__calendar_date"], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["ifnull__cast__calendar_date"], + consts.CONFIG_FIELD_ALIAS: "rstrip__ifnull__cast__calendar_date", + consts.CONFIG_TYPE: "rstrip", + consts.CONFIG_DEPTH: 2, + }, + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: ["ifnull__cast__day_of_week"], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: ["ifnull__cast__day_of_week"], + consts.CONFIG_FIELD_ALIAS: "rstrip__ifnull__cast__day_of_week", + consts.CONFIG_TYPE: "rstrip", + consts.CONFIG_DEPTH: 2, + }, + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: [ + "rstrip__ifnull__cast__calendar_date" + ], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: [ + "rstrip__ifnull__cast__calendar_date" + ], + consts.CONFIG_FIELD_ALIAS: "upper__rstrip__ifnull__cast__calendar_date", + consts.CONFIG_TYPE: "upper", + consts.CONFIG_DEPTH: 3, + }, + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: [ + "rstrip__ifnull__cast__day_of_week" + ], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: [ + "rstrip__ifnull__cast__day_of_week" + ], + consts.CONFIG_FIELD_ALIAS: "upper__rstrip__ifnull__cast__day_of_week", + consts.CONFIG_TYPE: "upper", + consts.CONFIG_DEPTH: 3, + }, + { + consts.CONFIG_CALCULATED_SOURCE_COLUMNS: [ + "upper__rstrip__ifnull__cast__calendar_date", + "upper__rstrip__ifnull__cast__day_of_week", + ], + consts.CONFIG_CALCULATED_TARGET_COLUMNS: [ + "upper__rstrip__ifnull__cast__calendar_date", + "upper__rstrip__ifnull__cast__day_of_week", + ], + consts.CONFIG_FIELD_ALIAS: "concat__all", + consts.CONFIG_TYPE: "concat", + consts.CONFIG_DEPTH: 4, + }, + ], + consts.CONFIG_PRIMARY_KEYS: [ + { + consts.CONFIG_SOURCE_COLUMN: "calendar_date", + consts.CONFIG_TARGET_COLUMN: "calendar_date", + consts.CONFIG_FIELD_ALIAS: "calendar_date", + consts.CONFIG_CAST: None, + } + ], +} + def test_count_validator(): - validator = data_validation.DataValidation(TERADATA_CONFIG, verbose=True) + validator = data_validation.DataValidation(TERADATA_COLUMN_CONFIG, verbose=True) df = validator.execute() assert int(df["source_agg_value"][0]) > 0 assert df["source_agg_value"][0] == df["target_agg_value"][0] + + +def test_row_validator(): + validator = data_validation.DataValidation(TERADATA_ROW_CONFIG, verbose=True) + df = validator.execute() + assert df["validation_status"][0] == "success" diff --git a/third_party/ibis/ibis_teradata/compiler.py b/third_party/ibis/ibis_teradata/compiler.py index 0ab44f151..7c71d8e5d 100644 --- a/third_party/ibis/ibis_teradata/compiler.py +++ b/third_party/ibis/ibis_teradata/compiler.py @@ -1,6 +1,12 @@ import datetime from functools import partial from io import StringIO +import itertools +import locale +import platform +import re +import string +import warnings import numpy as np import toolz @@ -20,7 +26,6 @@ from .datatypes import ibis_type_to_teradata_type from ibis.backends.base_sql import fixed_arity, literal, reduction, unary from ibis.backends.base_sql.compiler import ( - BaseExprTranslator, BaseSelect, BaseTableSetFormatter, ) @@ -483,6 +488,134 @@ def _arbitrary(translator, expr): return "ANY_VALUE({})".format(translator.translate(arg)) +# Teradata rules copied from ibis PostgreSQL compiler +_strftime_to_teradata_rules = { + '%a': 'Dy', + '%A': 'Day', + '%w': 'D', # 1-based day of week, see below for how we make this 0-based + '%d': 'DD', # day of month + '%-d': 'FMDD', # - is no leading zero for Python same for FM in Teradata + '%b': 'Mon', # Sep + '%B': 'Month', # September + '%m': 'MM', # 01 + '%-m': 'FMMM', # 1 + '%y': 'YY', # 15 + '%Y': 'YYYY', # 2015 + '%H': 'HH24', # 09 + '%-H': 'FMHH24', # 9 + '%I': 'HH12', # 09 + '%-I': 'FMHH12', # 9 + '%p': 'AM', # AM or PM + '%M': 'MI', # zero padded minute + '%-M': 'FMMI', # Minute + '%S': 'SS', # zero padded second + '%-S': 'FMSS', # Second + '%f': 'FF6', # zero padded microsecond + '%z': 'TZR', # utf offset + '%Z': 'TZR', # uppercase timezone name + '%j': 'DDD', # zero padded day of year + '%-j': 'FMDDD', # day of year + '%U': 'WW', # 1-based week of year + # 'W': ?, # meh +} + +try: + _strftime_to_teradata_rules.update( + { + '%c': locale.nl_langinfo(locale.D_T_FMT), # locale date and time + '%x': locale.nl_langinfo(locale.D_FMT), # locale date + '%X': locale.nl_langinfo(locale.T_FMT), # locale time + } + ) +except AttributeError: + warnings.warn( + 'locale specific date formats (%%c, %%x, %%X) are not yet implemented ' + 'for %s' % platform.system() + ) + +# Translate strftime spec into mostly equivalent Teradata spec +_scanner = re.Scanner( + # double quotes need to be escaped + [('"', lambda scanner, token: r'\"')] + + [ + ( + '|'.join( + map( + '(?:{})'.format, + itertools.chain( + _strftime_to_teradata_rules.keys(), + [ + # "%e" is in the C standard and Python actually + # generates this if your spec contains "%c" but we + # don't officially support it as a specifier so we + # need to special case it in the scanner + '%e', + r'\s+', + r'[{}]'.format(re.escape(string.punctuation)), + r'[^{}\s]+'.format(re.escape(string.punctuation)), + ], + ), + ) + ), + lambda scanner, token: token, + ) + ] +) + +_lexicon_values = frozenset(_strftime_to_teradata_rules.values()) + +_strftime_blacklist = frozenset(['%w', '%U', '%c', '%x', '%X', '%e']) + + +def _reduce_tokens(tokens): + """ + Reduce strftime for,mat elements to an equivalent string of Teradata elements. + Ideally this would be a full reduction like for Oracle and PostgreSQL but the "arg" parameter received + by _strftime is not equivalent and ends up as a string rather than a column name. + I've followed pattern set by other functions in this module and build a string function spec rather than + using sa.func.to_char. + """ + # TODO It would be nice to bring this into line with Oracle/PostgreSQL strftime function processing. + + # current list of tokens + curtokens = [] + + non_special_tokens = ( + frozenset(_strftime_to_teradata_rules) - _strftime_blacklist + ) + + # TODO: how much of a hack is this? + for token in tokens: + # we are a non-special token %A, %d, etc. + if token in non_special_tokens: + curtokens.append(_strftime_to_teradata_rules[token]) + + # we have a string like DD, to escape this we + # surround it with double quotes + elif token in _lexicon_values: + curtokens.append('"{}"'.format(token)) + + # we have a token that needs special treatment + elif token in _strftime_blacklist: + raise UnsupportedOperationError('Not right now') + + # uninteresting text + else: + curtokens.append(token) + return ''.join(curtokens) + + +def _strftime(translator, expr): + arg, format_string = expr.op().args + fmt_string = translator.translate(format_string) + arg_formatted = translator.translate(arg) + tokens, _ = _scanner.scan(fmt_string) + translated_format = _reduce_tokens(tokens) + return "TO_CHAR({}, {})".format( + arg_formatted, translated_format + ) + + _date_units = { "Y": "YEAR", "Q": "QUARTER", @@ -537,13 +670,6 @@ def _formatter(translator, expr): return _formatter -STRFTIME_FORMAT_FUNCTIONS = { - dt.Date: "DATE", - dt.Time: "TIME", - dt.Timestamp: "TIMESTAMP", -} - - _operation_registry.update( { ops.ExtractYear: _extract_field("year"), @@ -559,6 +685,7 @@ def _formatter(translator, expr): ops.StringJoin: _string_join, ops.StringAscii: _string_ascii, ops.StringFind: _string_find, + ops.Strftime: _strftime, ops.StrRight: _string_right, ops.Repeat: fixed_arity("REPEAT", 2), ops.RegexSearch: _regex_search, @@ -628,25 +755,6 @@ def teradata_compiles_divide(t, e): return "IEEE_DIVIDE({}, {})".format(*map(t.translate, e.op().args)) -@compiles(ops.Strftime) -def compiles_strftime(translator, expr): - arg, format_string = expr.op().args - arg_type = arg.type() - strftime_format_func_name = STRFTIME_FORMAT_FUNCTIONS[type(arg_type)] - fmt_string = translator.translate(format_string) - arg_formatted = translator.translate(arg) - if isinstance(arg_type, dt.Timestamp): - return "FORMAT_{}({}, {}, {!r})".format( - strftime_format_func_name, - fmt_string, - arg_formatted, - arg_type.timezone if arg_type.timezone is not None else "UTC", - ) - return "FORMAT_{}({}, {})".format( - strftime_format_func_name, fmt_string, arg_formatted - ) - - @compiles(ops.StringToTimestamp) def compiles_string_to_timestamp(translator, expr): arg, format_string, timezone_arg = expr.op().args