diff --git a/data_validation/clients.py b/data_validation/clients.py index 26219761..50a6cf80 100644 --- a/data_validation/clients.py +++ b/data_validation/clients.py @@ -274,6 +274,23 @@ def get_max_column_length(client): return 128 +def get_max_in_list_size(client, in_list_over_expressions=False): + if client.name == "snowflake": + if in_list_over_expressions: + # This is a workaround for Snowflake limitation: + # SQL compilation error: In-list contains more than 50 non-constant values + # getattr(..., "cast") expression above is looking for lists where the contents are casts and not simple literals. + return 50 + else: + return 16000 + elif is_oracle_client(client): + # This is a workaround for Oracle limitation: + # ORA-01795: maximum number of expressions in a list is 1000 + return 1000 + else: + return None + + CLIENT_LOOKUP = { "BigQuery": get_bigquery_client, "Impala": impala_connect, diff --git a/data_validation/validation_builder.py b/data_validation/validation_builder.py index c43b7ec1..55e39d6f 100644 --- a/data_validation/validation_builder.py +++ b/data_validation/validation_builder.py @@ -15,6 +15,7 @@ from copy import deepcopy from data_validation import consts, metadata +from data_validation.clients import get_max_in_list_size from data_validation.query_builder.query_builder import ( AggregateField, CalculatedField, @@ -65,23 +66,17 @@ def __init__(self, config_manager): self.add_primary_keys() self.add_query_limit() - def _construct_isin_filter(self, column_name: str, in_list: list) -> FilterField: + def _construct_isin_filter( + self, client, column_name: str, in_list: list + ) -> FilterField: """Return a FilterField object that is either isin(...) or (isin(...) OR isin(...) OR...).""" - def get_max_in_list_size(): - if ( - self.source_client.name == "snowflake" - and in_list - and getattr(in_list[0], "cast", None) - ): - # This is a workaround for Snowflake limitation: - # SQL compilation error: In-list contains more than 50 non-constant values - # getattr(..., "cast") expression above is looking for lists where the contents are casts and not simple literals. - return 50 - else: - return None - - max_in_list_size = get_max_in_list_size() + max_in_list_size = get_max_in_list_size( + client, + in_list_over_expressions=bool( + in_list and getattr(in_list[0], "cast", None) + ), + ) if max_in_list_size and len(in_list) > max_in_list_size: source_filters = [ FilterField.isin(column_name, _) @@ -304,10 +299,12 @@ def add_filter(self, filter_field): ) elif filter_field[consts.CONFIG_TYPE] == consts.FILTER_TYPE_ISIN: source_filter = self._construct_isin_filter( + self.source_client, filter_field[consts.CONFIG_FILTER_SOURCE_COLUMN], filter_field[consts.CONFIG_FILTER_SOURCE_VALUE], ) target_filter = self._construct_isin_filter( + self.target_client, filter_field[consts.CONFIG_FILTER_TARGET_COLUMN], filter_field[consts.CONFIG_FILTER_TARGET_VALUE], ) diff --git a/tests/unit/test_validation_builder.py b/tests/unit/test_validation_builder.py index b39ac5ba..efe6814e 100644 --- a/tests/unit/test_validation_builder.py +++ b/tests/unit/test_validation_builder.py @@ -266,6 +266,14 @@ def test_validation_add_filters(module_under_test): pandas.Timestamp("2020-01-01 10:11:13"), ], }, + # Test with many IDs. + { + consts.CONFIG_TYPE: consts.FILTER_TYPE_ISIN, + consts.CONFIG_FILTER_SOURCE_COLUMN: "id", + consts.CONFIG_FILTER_SOURCE_VALUE: range(0, 10000), + consts.CONFIG_FILTER_TARGET_COLUMN: "id", + consts.CONFIG_FILTER_TARGET_VALUE: range(0, 10000), + }, ], ) def test_validation_add_filter(module_under_test, filter_field: dict):