From b1d49428d8f1990f5eee61b9e6487dbc2f561369 Mon Sep 17 00:00:00 2001 From: Neha Nene Date: Tue, 29 Aug 2023 16:05:39 -0400 Subject: [PATCH] feat: Support BQ decimal precision and scale for schema validation (#960) * feat: support BQ decimal precision and scale for schema validation, limit Teradata query to one row for custom query schema * feat: limit Hive function to one row for get_schema_using_query function --- tests/system/data_sources/test_hive.py | 4 ---- tests/system/data_sources/test_oracle.py | 4 ---- tests/system/data_sources/test_postgres.py | 4 ---- tests/system/data_sources/test_snowflake.py | 4 ---- tests/system/data_sources/test_sql_server.py | 4 ---- tests/system/data_sources/test_teradata.py | 6 +----- third_party/ibis/ibis_addon/operations.py | 16 ++++++++++++++++ third_party/ibis/ibis_impala/api.py | 2 +- third_party/ibis/ibis_teradata/__init__.py | 2 +- 9 files changed, 19 insertions(+), 27 deletions(-) diff --git a/tests/system/data_sources/test_hive.py b/tests/system/data_sources/test_hive.py index 603024ffc..dddccb0cf 100644 --- a/tests/system/data_sources/test_hive.py +++ b/tests/system/data_sources/test_hive.py @@ -166,10 +166,6 @@ def test_schema_validation_core_types_to_bigquery(): ( # All Hive integrals go to BigQuery INT64. "--allow-list=int8:int64,int16:int64,int32:int64," - # Hive decimals that map to BigQuery NUMERIC. - "decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9)," - # Hive decimals that map to BigQuery BIGNUMERIC. - "decimal(38,0):decimal(76,38)," # Hive does not have a time zoned "timestamp:timestamp('UTC')," # BigQuery does not have a float32 type. diff --git a/tests/system/data_sources/test_oracle.py b/tests/system/data_sources/test_oracle.py index 646f5e5e6..36fe618ce 100644 --- a/tests/system/data_sources/test_oracle.py +++ b/tests/system/data_sources/test_oracle.py @@ -165,10 +165,6 @@ def test_schema_validation_core_types_to_bigquery(): ( # Integral Oracle NUMBERS go to BigQuery INT64. "--allow-list=decimal(2,0):int64,decimal(4,0):int64,decimal(9,0):int64,decimal(18,0):int64," - # Oracle NUMBERS that map to BigQuery NUMERIC. - "decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9)," - # Oracle NUMBERS that map to BigQuery BIGNUMERIC. - "decimal(38,0):decimal(76,38)," # BigQuery does not have a float32 type. "float32:float64" ), diff --git a/tests/system/data_sources/test_postgres.py b/tests/system/data_sources/test_postgres.py index 87705af78..c0926cb5d 100644 --- a/tests/system/data_sources/test_postgres.py +++ b/tests/system/data_sources/test_postgres.py @@ -566,10 +566,6 @@ def test_schema_validation_core_types_to_bigquery(): ( # PostgreSQL integrals go to BigQuery INT64. "--allow-list=int16:int64,int32:int64," - # Oracle NUMBERS that map to BigQuery NUMERIC. - "decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9)," - # Oracle NUMBERS that map to BigQuery BIGNUMERIC. - "decimal(38,0):decimal(76,38)," # BigQuery does not have a float32 type. "float32:float64" ), diff --git a/tests/system/data_sources/test_snowflake.py b/tests/system/data_sources/test_snowflake.py index cf86076e8..4eb8aa469 100644 --- a/tests/system/data_sources/test_snowflake.py +++ b/tests/system/data_sources/test_snowflake.py @@ -190,10 +190,6 @@ def test_schema_validation_core_types_to_bigquery(): ( # Integral Snowflake NUMBERs to to BigQuery INT64. "--allow-list=decimal(38,0):int64," - # Snowflake NUMBERS that map to BigQuery NUMERIC. - "decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9)," - # Snowflake NUMBERS that map to BigQuery BIGNUMERIC - "decimal(38,0):decimal(76,38)," # TODO When issue-706 is complete remove the timestamp line below "timestamp('UTC'):timestamp" ), diff --git a/tests/system/data_sources/test_sql_server.py b/tests/system/data_sources/test_sql_server.py index 83e857710..5932ca649 100644 --- a/tests/system/data_sources/test_sql_server.py +++ b/tests/system/data_sources/test_sql_server.py @@ -292,10 +292,6 @@ def test_schema_validation_core_types_to_bigquery(): ( # All SQL Server integrals go to BigQuery INT64. "--allow-list=int8:int64,int16:int64,int32:int64," - # SQL Server decimals that map to BigQuery NUMERIC. - "decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9)," - # SQL Server decimals that map to BigQuery BIGNUMERIC. - "decimal(38,0):decimal(76,38)," # BigQuery does not have a float32 type. "float32:float64" ), diff --git a/tests/system/data_sources/test_teradata.py b/tests/system/data_sources/test_teradata.py index f3771ee62..a70881085 100644 --- a/tests/system/data_sources/test_teradata.py +++ b/tests/system/data_sources/test_teradata.py @@ -244,11 +244,7 @@ def test_schema_validation_core_types_to_bigquery(): "--exclusion-columns=id", ( # Teradata integrals go to BigQuery INT64. - "--allow-list=int8:int64,int16:int64,int32:int64," - # Teradata NUMBERS that map to BigQuery NUMERIC. - "decimal(20,0):decimal(38,9),decimal(10,2):decimal(38,9)," - # Teradata NUMBERS that map to BigQuery BIGNUMERIC. - "decimal(38,0):decimal(76,38)" + "--allow-list=int8:int64,int16:int64,int32:int64" ), ] ) diff --git a/third_party/ibis/ibis_addon/operations.py b/third_party/ibis/ibis_addon/operations.py index 9e160593c..6f14f0d5e 100644 --- a/third_party/ibis/ibis_addon/operations.py +++ b/third_party/ibis/ibis_addon/operations.py @@ -363,6 +363,22 @@ def _bigquery_field_to_ibis_dtype(field): names = [el.name for el in fields] ibis_types = list(map(dt.dtype, fields)) ibis_type = dt.Struct(dict(zip(names, ibis_types))) + elif typ == "NUMERIC": + if not field.precision and not field.scale: + return dt.Decimal(precision=38, scale=9, nullable=field.is_nullable) + return dt.Decimal( + precision=field.precision, + scale=field.scale or 0, + nullable=field.is_nullable, + ) + elif typ == "BIGNUMERIC": + if not field.precision and not field.scale: + return dt.Decimal(precision=76, scale=38, nullable=field.is_nullable) + return dt.Decimal( + precision=field.precision, + scale=field.scale or 0, + nullable=field.is_nullable, + ) else: ibis_type = _BQ_LEGACY_TO_STANDARD.get(typ, typ) if ibis_type in _BQ_DTYPE_TO_IBIS_TYPE: diff --git a/third_party/ibis/ibis_impala/api.py b/third_party/ibis/ibis_impala/api.py index 2d9a0ab2a..053714fe1 100644 --- a/third_party/ibis/ibis_impala/api.py +++ b/third_party/ibis/ibis_impala/api.py @@ -194,7 +194,7 @@ def _if_null(op): def _get_schema_using_query(self, query): # Removing LIMIT 0 around query since it returns no results in Hive - cur = self.raw_sql(query) + cur = self.raw_sql(f"SELECT * FROM ({query}) t0 LIMIT 1") cur.fetchall() ibis_fields = self._adapt_types(cur.description) cur.release() diff --git a/third_party/ibis/ibis_teradata/__init__.py b/third_party/ibis/ibis_teradata/__init__.py index 548fbf5d8..d0656e536 100644 --- a/third_party/ibis/ibis_teradata/__init__.py +++ b/third_party/ibis/ibis_teradata/__init__.py @@ -141,7 +141,7 @@ def get_schema(self, table_name: str, database: str = None) -> sch.Schema: return sch.Schema(schema) def _get_schema_using_query(self, query): - cur = self.raw_sql(query) + cur = self.raw_sql(f"SELECT TOP 1 * FROM ({query}) AS t0") # resets the state of the cursor and closes operation cur.fetchall() ibis_fields = self._adapt_types(cur.description)