Skip to content

Commit

Permalink
fix: Fixes bug in get_max_in_list_size (#1158)
Browse files Browse the repository at this point in the history
* feat: Cater for Oracle IN list limit of 1000

* fix: Fix bug in get_max_in_list_size

---------

Co-authored-by: Helen Cristina <[email protected]>
  • Loading branch information
nj1973 and helensilva14 committed Jun 4, 2024
1 parent e3fe3d1 commit 973e6b6
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 15 deletions.
17 changes: 17 additions & 0 deletions data_validation/clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,23 @@ def get_max_column_length(client):
return 128


def get_max_in_list_size(client, in_list_over_expressions=False):
if client.name == "snowflake":
if in_list_over_expressions:
# This is a workaround for Snowflake limitation:
# SQL compilation error: In-list contains more than 50 non-constant values
# getattr(..., "cast") expression above is looking for lists where the contents are casts and not simple literals.
return 50
else:
return 16000
elif is_oracle_client(client):
# This is a workaround for Oracle limitation:
# ORA-01795: maximum number of expressions in a list is 1000
return 1000
else:
return None


CLIENT_LOOKUP = {
"BigQuery": get_bigquery_client,
"Impala": impala_connect,
Expand Down
27 changes: 12 additions & 15 deletions data_validation/validation_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from copy import deepcopy

from data_validation import consts, metadata
from data_validation.clients import get_max_in_list_size
from data_validation.query_builder.query_builder import (
AggregateField,
CalculatedField,
Expand Down Expand Up @@ -65,23 +66,17 @@ def __init__(self, config_manager):
self.add_primary_keys()
self.add_query_limit()

def _construct_isin_filter(self, column_name: str, in_list: list) -> FilterField:
def _construct_isin_filter(
self, client, column_name: str, in_list: list
) -> FilterField:
"""Return a FilterField object that is either isin(...) or (isin(...) OR isin(...) OR...)."""

def get_max_in_list_size():
if (
self.source_client.name == "snowflake"
and in_list
and getattr(in_list[0], "cast", None)
):
# This is a workaround for Snowflake limitation:
# SQL compilation error: In-list contains more than 50 non-constant values
# getattr(..., "cast") expression above is looking for lists where the contents are casts and not simple literals.
return 50
else:
return None

max_in_list_size = get_max_in_list_size()
max_in_list_size = get_max_in_list_size(
client,
in_list_over_expressions=bool(
in_list and getattr(in_list[0], "cast", None)
),
)
if max_in_list_size and len(in_list) > max_in_list_size:
source_filters = [
FilterField.isin(column_name, _)
Expand Down Expand Up @@ -304,10 +299,12 @@ def add_filter(self, filter_field):
)
elif filter_field[consts.CONFIG_TYPE] == consts.FILTER_TYPE_ISIN:
source_filter = self._construct_isin_filter(
self.source_client,
filter_field[consts.CONFIG_FILTER_SOURCE_COLUMN],
filter_field[consts.CONFIG_FILTER_SOURCE_VALUE],
)
target_filter = self._construct_isin_filter(
self.target_client,
filter_field[consts.CONFIG_FILTER_TARGET_COLUMN],
filter_field[consts.CONFIG_FILTER_TARGET_VALUE],
)
Expand Down
8 changes: 8 additions & 0 deletions tests/unit/test_validation_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,14 @@ def test_validation_add_filters(module_under_test):
pandas.Timestamp("2020-01-01 10:11:13"),
],
},
# Test with many IDs.
{
consts.CONFIG_TYPE: consts.FILTER_TYPE_ISIN,
consts.CONFIG_FILTER_SOURCE_COLUMN: "id",
consts.CONFIG_FILTER_SOURCE_VALUE: range(0, 10000),
consts.CONFIG_FILTER_TARGET_COLUMN: "id",
consts.CONFIG_FILTER_TARGET_VALUE: range(0, 10000),
},
],
)
def test_validation_add_filter(module_under_test, filter_field: dict):
Expand Down

0 comments on commit 973e6b6

Please sign in to comment.