Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Fixes bug in get_max_in_list_size #1158

Merged
merged 4 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions data_validation/clients.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,23 @@ def get_max_column_length(client):
return 128


def get_max_in_list_size(client, in_list_over_expressions=False):
if client.name == "snowflake":
if in_list_over_expressions:
# This is a workaround for Snowflake limitation:
# SQL compilation error: In-list contains more than 50 non-constant values
# getattr(..., "cast") expression above is looking for lists where the contents are casts and not simple literals.
return 50
else:
return 16000
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious to know: how did you find this other number for the list size? It would be helpful to put some sort of product documentation as a comment for future reference, but I could only find community posts mentioning it like this one: https://community.snowflake.com/s/question/0D5Do00000O3sJ0KAJ/how-o-get-around-th-16k-limit-on-number-of-expressions-in-in-clause-in-sql-generated-by-a-businessobjects-report

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was not the same post but I saw another 2 Snowflake community posts.

I don't think we need to document it because this code is ensuring that we don't hit the 16k limit, at 16,000 we'll switch to:

WHERE id IN (1, ..., 16000) OR id IN (16001, ..., n)

We will find other limits for other engines I'm sure. I started to read about SQL Server but that has a memory limit based on the execution plan and not a hard/fixed limit, so I decided to just include the limits I knew about for now.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, it makes sense, it's clearer for me now, thank you!

And for sure, totally agree that we might face it for other engines but at the moment we can stick with the ones we're aware of the values and usage

elif is_oracle_client(client):
# This is a workaround for Oracle limitation:
# ORA-01795: maximum number of expressions in a list is 1000
return 1000
else:
return None


CLIENT_LOOKUP = {
"BigQuery": get_bigquery_client,
"Impala": impala_connect,
Expand Down
27 changes: 12 additions & 15 deletions data_validation/validation_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from copy import deepcopy

from data_validation import consts, metadata
from data_validation.clients import get_max_in_list_size
from data_validation.query_builder.query_builder import (
AggregateField,
CalculatedField,
Expand Down Expand Up @@ -65,23 +66,17 @@ def __init__(self, config_manager):
self.add_primary_keys()
self.add_query_limit()

def _construct_isin_filter(self, column_name: str, in_list: list) -> FilterField:
def _construct_isin_filter(
self, client, column_name: str, in_list: list
) -> FilterField:
"""Return a FilterField object that is either isin(...) or (isin(...) OR isin(...) OR...)."""

def get_max_in_list_size():
if (
self.source_client.name == "snowflake"
and in_list
and getattr(in_list[0], "cast", None)
):
# This is a workaround for Snowflake limitation:
# SQL compilation error: In-list contains more than 50 non-constant values
# getattr(..., "cast") expression above is looking for lists where the contents are casts and not simple literals.
return 50
else:
return None

max_in_list_size = get_max_in_list_size()
max_in_list_size = get_max_in_list_size(
client,
in_list_over_expressions=bool(
in_list and getattr(in_list[0], "cast", None)
),
)
if max_in_list_size and len(in_list) > max_in_list_size:
source_filters = [
FilterField.isin(column_name, _)
Expand Down Expand Up @@ -304,10 +299,12 @@ def add_filter(self, filter_field):
)
elif filter_field[consts.CONFIG_TYPE] == consts.FILTER_TYPE_ISIN:
source_filter = self._construct_isin_filter(
self.source_client,
filter_field[consts.CONFIG_FILTER_SOURCE_COLUMN],
filter_field[consts.CONFIG_FILTER_SOURCE_VALUE],
)
target_filter = self._construct_isin_filter(
self.target_client,
filter_field[consts.CONFIG_FILTER_TARGET_COLUMN],
filter_field[consts.CONFIG_FILTER_TARGET_VALUE],
)
Expand Down
8 changes: 8 additions & 0 deletions tests/unit/test_validation_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,14 @@ def test_validation_add_filters(module_under_test):
pandas.Timestamp("2020-01-01 10:11:13"),
],
},
# Test with many IDs.
{
consts.CONFIG_TYPE: consts.FILTER_TYPE_ISIN,
consts.CONFIG_FILTER_SOURCE_COLUMN: "id",
consts.CONFIG_FILTER_SOURCE_VALUE: range(0, 10000),
consts.CONFIG_FILTER_TARGET_COLUMN: "id",
consts.CONFIG_FILTER_TARGET_VALUE: range(0, 10000),
},
],
)
def test_validation_add_filter(module_under_test, filter_field: dict):
Expand Down