Skip to content

Commit

Permalink
feat: Adding Random-Row support for Custom Query (#891)
Browse files Browse the repository at this point in the history
  • Loading branch information
piyushsarraf committed Jun 26, 2023
1 parent ba641e0 commit fc42c61
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 7 deletions.
12 changes: 11 additions & 1 deletion data_validation/cli_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,6 +666,17 @@ def _configure_custom_query_row_parser(custom_query_row_parser):
type=threshold_float,
help="Float max threshold for percent difference",
)
optional_arguments.add_argument(
"--use-random-row",
"-rr",
action="store_true",
help="Finds a set of random rows of the first primary key supplied.",
)
optional_arguments.add_argument(
"--random-row-batch-size",
"-rbs",
help="Row batch size used for random row filters (default 10,000).",
)

# Group required arguments
required_arguments = custom_query_row_parser.add_argument_group(
Expand Down Expand Up @@ -1268,7 +1279,6 @@ def get_pre_build_configs(args: Namespace, validate_cmd: str) -> List[Dict]:
if (
args.command != "generate-table-partitions"
and config_type != consts.SCHEMA_VALIDATION
and config_type != consts.CUSTOM_QUERY
):
use_random_rows = args.use_random_row
random_row_batch_size = args.random_row_batch_size
Expand Down
22 changes: 16 additions & 6 deletions data_validation/data_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,16 +110,26 @@ def _add_random_row_filter(self):
# Filter for only first primary key (multi-pk filter not supported)
primary_key_info = self.config_manager.primary_keys[0]

query = RandomRowBuilder(
randomRowBuilder = RandomRowBuilder(
[primary_key_info[consts.CONFIG_SOURCE_COLUMN]],
self.config_manager.random_row_batch_size(),
).compile(
self.config_manager.source_client,
self.config_manager.source_schema,
self.config_manager.source_table,
self.validation_builder.source_builder,
)

if (self.config_manager.validation_type == consts.CUSTOM_QUERY) and (
self.config_manager.custom_query_type == consts.ROW_VALIDATION.lower()
):
query = randomRowBuilder.compile_custom_query(
self.config_manager.source_client,
self.config_manager.source_query,
)
else:
query = randomRowBuilder.compile(
self.config_manager.source_client,
self.config_manager.source_schema,
self.config_manager.source_table,
self.validation_builder.source_builder,
)

random_rows = self.config_manager.source_client.execute(query)
if len(random_rows) == 0:
return
Expand Down
13 changes: 13 additions & 0 deletions data_validation/query_builder/random_row_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,19 @@ def compile(

return query

def compile_custom_query(self, data_client: ibis.client, query: str) -> ibis.Expr:
    """Build an Ibis expression selecting a batch of primary keys from a custom query.

    Args:
        data_client (IbisClient): The client used to query random rows.
        query (String): Custom query provided by user.

    Returns:
        ibis.Expr: The custom query's table expression after being passed
            through ``maybe_add_random_sort``, limited to ``self.batch_size``
            rows and projected onto ``self.primary_keys``.
    """
    # Wrap the user's query text as an Ibis table expression on this client.
    custom_query_table = clients.get_ibis_query(data_client, query)

    # Apply the (possibly no-op) random ordering before truncating, so the
    # limit is taken from the reordered rows.
    shuffled_table = self.maybe_add_random_sort(data_client, custom_query_table)
    limited_table = shuffled_table.limit(self.batch_size)

    # Only the primary key column(s) are kept in the result.
    return limited_table[self.primary_keys]

def maybe_add_random_sort(
self, data_client: ibis.client, table: ibis.Expr
) -> ibis.Expr:
Expand Down

0 comments on commit fc42c61

Please sign in to comment.