From fc42c6181a73f79d55dc74ddd383d462a0ca4e7a Mon Sep 17 00:00:00 2001 From: "Piyush:)" <47020544+piyushsarraf@users.noreply.github.com> Date: Mon, 26 Jun 2023 13:54:07 +0530 Subject: [PATCH] feat: Adding Random-Row support for Custom Query (#891) --- data_validation/cli_tools.py | 12 +++++++++- data_validation/data_validation.py | 22 ++++++++++++++----- .../query_builder/random_row_builder.py | 13 +++++++++++ 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py index 36d023a95..13f7419cc 100644 --- a/data_validation/cli_tools.py +++ b/data_validation/cli_tools.py @@ -666,6 +666,17 @@ def _configure_custom_query_row_parser(custom_query_row_parser): type=threshold_float, help="Float max threshold for percent difference", ) + optional_arguments.add_argument( + "--use-random-row", + "-rr", + action="store_true", + help="Finds a set of random rows of the first primary key supplied.", + ) + optional_arguments.add_argument( + "--random-row-batch-size", + "-rbs", + help="Row batch size used for random row filters (default 10,000).", + ) # Group required arguments required_arguments = custom_query_row_parser.add_argument_group( @@ -1268,7 +1279,6 @@ def get_pre_build_configs(args: Namespace, validate_cmd: str) -> List[Dict]: if ( args.command != "generate-table-partitions" and config_type != consts.SCHEMA_VALIDATION - and config_type != consts.CUSTOM_QUERY ): use_random_rows = args.use_random_row random_row_batch_size = args.random_row_batch_size diff --git a/data_validation/data_validation.py b/data_validation/data_validation.py index 23b948275..a26e85c19 100644 --- a/data_validation/data_validation.py +++ b/data_validation/data_validation.py @@ -110,16 +110,26 @@ def _add_random_row_filter(self): # Filter for only first primary key (multi-pk filter not supported) primary_key_info = self.config_manager.primary_keys[0] - query = RandomRowBuilder( + randomRowBuilder = RandomRowBuilder( [primary_key_info[consts.CONFIG_SOURCE_COLUMN]], self.config_manager.random_row_batch_size(), - ).compile( - self.config_manager.source_client, - self.config_manager.source_schema, - self.config_manager.source_table, - self.validation_builder.source_builder, ) + if (self.config_manager.validation_type == consts.CUSTOM_QUERY) and ( + self.config_manager.custom_query_type == consts.ROW_VALIDATION.lower() + ): + query = randomRowBuilder.compile_custom_query( + self.config_manager.source_client, + self.config_manager.source_query, + ) + else: + query = randomRowBuilder.compile( + self.config_manager.source_client, + self.config_manager.source_schema, + self.config_manager.source_table, + self.validation_builder.source_builder, + ) + random_rows = self.config_manager.source_client.execute(query) if len(random_rows) == 0: return diff --git a/data_validation/query_builder/random_row_builder.py b/data_validation/query_builder/random_row_builder.py index 6e8809e5d..b03ff3e52 100644 --- a/data_validation/query_builder/random_row_builder.py +++ b/data_validation/query_builder/random_row_builder.py @@ -116,6 +116,19 @@ def compile( return query + def compile_custom_query(self, data_client: ibis.client, query: str) -> ibis.Expr: + """Return an Ibis query object for a given query. + + Args: + data_client (IbisClient): The client used to query random rows. + query (String): Custom query provided by user. + """ + table = clients.get_ibis_query(data_client, query) + randomly_sorted_table = self.maybe_add_random_sort(data_client, table) + query = randomly_sorted_table.limit(self.batch_size)[self.primary_keys] + + return query + def maybe_add_random_sort( self, data_client: ibis.client, table: ibis.Expr ) -> ibis.Expr: