From 5a747c23ba5e140619feeb54ffcc7a00a6266877 Mon Sep 17 00:00:00 2001 From: Neha Nene Date: Wed, 12 Jun 2024 16:13:07 -0400 Subject: [PATCH] docs: Update generate-partitions flags (#1168) * docs: update generate-partitions flags * docs: fix bug when removing threshold from args --- README.md | 13 +++ data_validation/cli_tools.py | 210 ++++++++++------------------------- 2 files changed, 71 insertions(+), 152 deletions(-) diff --git a/README.md b/README.md index 5ba2e527..ca7e4fc3 100644 --- a/README.md +++ b/README.md @@ -246,10 +246,23 @@ data-validation (--verbose or -v) (--log-level or -ll) generate-table-partitions --partition-num [1-1000], -pn [1-1000] Number of partitions/config files to generate In case this value exceeds the row count of the source/target table, it will be decreased to max(source_row_count, target_row_count) + [--bq-result-handler or -bqrh PROJECT_ID.DATASET.TABLE] + BigQuery destination for validation results. Defaults to stdout. + See: *Validation Reports* section + [--service-account or -sa PATH_TO_SA_KEY] + Service account to use for BigQuery result handler output. [--filters SOURCE_FILTER:TARGET_FILTER] Colon separated string values of source and target filters. If target filter is not provided, the source filter will run on source and target tables. See: *Filters* section + [--labels or -l KEY1=VALUE1,KEY2=VALUE2] + Comma-separated key value pair labels for the run. + [--format or -fmt FORMAT] + Format for stdout output. Supported formats are (text, csv, json, table). Defaults to table. + [--filter-status or -fs STATUSES_LIST] + Comma separated list of statuses to filter the validation results. Supported statuses are (success, fail). If no list is provided, all statuses are returned. + [--trim-string-pks, -tsp] + Trims string based primary key values, intended for use when one engine uses padded string semantics (e.g. CHAR(n)) and the other does not (e.g. VARCHAR(n)). ``` #### Schema Validations diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py index b2134f8d..c6495df9 100644 --- a/data_validation/cli_tools.py +++ b/data_validation/cli_tools.py @@ -202,76 +202,36 @@ def _configure_partition_parser(subparsers): """Configure arguments to generate partitioned config files.""" partition_parser = subparsers.add_parser( "generate-table-partitions", - help=( - "Generate partitions for validation and store the Config files in " - "a directory" - ), + help=("Generate table partitions and store validation config files"), ) - - # Group all optional arguments together optional_arguments = partition_parser.add_argument_group("optional arguments") - optional_arguments.add_argument( - "--threshold", - "-th", - type=threshold_float, - help="Float max threshold for percent difference", - ) - optional_arguments.add_argument( - "--filters", - "-filters", - help="Filters in the format source_filter:target_filter", - ) - - # Group all required arguments together required_arguments = partition_parser.add_argument_group("required arguments") + + _configure_row_parser( + partition_parser, + optional_arguments, + required_arguments, + is_generate_partitions=True, + ) required_arguments.add_argument( - "--primary-keys", - "-pk", + "--config-dir", + "-cdir", required=True, - help="Comma separated list of primary key columns 'col_a,col_b'", + help="Directory path to store YAML config files. " + "GCS: Provide a full gs:// path of the target directory. " + "Eg: `gs:///partitons_dir`. " + "Local: Provide a relative path of the target directory. " + "Eg: `partitions_dir`", ) required_arguments.add_argument( - "--tables-list", - "-tbls", + "--partition-num", + "-pn", required=True, - help=( - "Comma separated tables list in the form " - "'schema.table=target_schema.target_table'" - ), - ) - - # Group for mutually exclusive required arguments. Either must be supplied - mutually_exclusive_arguments = required_arguments.add_mutually_exclusive_group( - required=True - ) - mutually_exclusive_arguments.add_argument( - "--hash", - "-hash", - help=( - "Comma separated list of columns for hash 'col_a,col_b' or * for " - "all columns" - ), - ) - mutually_exclusive_arguments.add_argument( - "--concat", - "-concat", - help=( - "Comma separated list of columns for concat 'col_a,col_b' or * " - "for all columns" - ), - ) - - mutually_exclusive_arguments.add_argument( - "--comparison-fields", - "-comp-fields", - help=( - "Individual columns to compare. If comparing a calculated field use " - "the column alias." - ), + help="Number of partitions/config files to generate, a number from 2 to 10,000", + type=_check_no_partitions, + metavar="[2-10000]", ) - _add_common_partition_arguments(optional_arguments, required_arguments) - def _configure_beta_parser(subparsers): """Configure beta commands for the parser.""" @@ -444,7 +404,9 @@ def _configure_validate_parser(subparsers): _configure_column_parser(column_parser) row_parser = validate_subparsers.add_parser("row", help="Run a row validation") - _configure_row_parser(row_parser) + optional_arguments = row_parser.add_argument_group("optional arguments") + required_arguments = row_parser.add_argument_group("required arguments") + _configure_row_parser(row_parser, optional_arguments, required_arguments) schema_parser = validate_subparsers.add_parser( "schema", help="Run a schema validation" @@ -457,11 +419,11 @@ def _configure_validate_parser(subparsers): _configure_custom_query_parser(custom_query_parser) -def _configure_row_parser(row_parser): +def _configure_row_parser( + parser, optional_arguments, required_arguments, is_generate_partitions=False +): """Configure arguments to run row level validations.""" - # Group optional arguments - optional_arguments = row_parser.add_argument_group("optional arguments") optional_arguments.add_argument( "--threshold", "-th", @@ -473,17 +435,6 @@ def _configure_row_parser(row_parser): "-filters", help="Filters in the format source_filter:target_filter", ) - optional_arguments.add_argument( - "--use-random-row", - "-rr", - action="store_true", - help="Finds a set of random rows of the first primary key supplied.", - ) - optional_arguments.add_argument( - "--random-row-batch-size", - "-rbs", - help="Row batch size used for random row filters (default 10,000).", - ) optional_arguments.add_argument( "--trim-string-pks", "-tsp", @@ -493,9 +444,21 @@ def _configure_row_parser(row_parser): "padded string semantics (e.g. CHAR(n)) and the other does not (e.g. VARCHAR(n))." ), ) + # Generate-table-partitions does not support random row + if not is_generate_partitions: + optional_arguments.add_argument( + "--use-random-row", + "-rr", + action="store_true", + help="Finds a set of random rows of the first primary key supplied.", + ) + optional_arguments.add_argument( + "--random-row-batch-size", + "-rbs", + help="Row batch size used for random row filters (default 10,000).", + ) # Group required arguments - required_arguments = row_parser.add_argument_group("required arguments") required_arguments.add_argument( "--tables-list", "-tbls", @@ -539,7 +502,11 @@ def _configure_row_parser(row_parser): "the column alias." ), ) - _add_common_arguments(optional_arguments, required_arguments) + _add_common_arguments( + optional_arguments, + required_arguments, + is_generate_partitions=is_generate_partitions, + ) def _configure_column_parser(column_parser): @@ -917,7 +884,9 @@ def _configure_custom_query_column_parser(custom_query_column_parser): _add_common_arguments(optional_arguments, required_arguments) -def _add_common_arguments(optional_arguments, required_arguments): +def _add_common_arguments( + optional_arguments, required_arguments, is_generate_partitions=False +): # Group all Required Arguments together required_arguments.add_argument( "--source-conn", "-sc", required=True, help="Source connection name" @@ -938,16 +907,17 @@ def _add_common_arguments(optional_arguments, required_arguments): "-sa", help="Path to SA key file for result handler output", ) - optional_arguments.add_argument( - "--config-file", - "-c", - help="Store the validation config in the YAML File Path specified", - ) - optional_arguments.add_argument( - "--config-file-json", - "-cj", - help="Store the validation config in the JSON File Path specified to be used for application use cases", - ) + if not is_generate_partitions: + optional_arguments.add_argument( + "--config-file", + "-c", + help="Store the validation config in the YAML File Path specified", + ) + optional_arguments.add_argument( + "--config-file-json", + "-cj", + help="Store the validation config in the JSON File Path specified to be used for application use cases", + ) optional_arguments.add_argument( "--format", "-fmt", @@ -975,70 +945,6 @@ def _check_no_partitions(value: str) -> int: ) -def _add_common_partition_arguments(optional_arguments, required_arguments): - """Add all arguments common to get-partition command""" - - # Group all Required Arguments together - required_arguments.add_argument( - "--source-conn", "-sc", required=True, help="Source connection name" - ) - required_arguments.add_argument( - "--target-conn", "-tc", required=True, help="Target connection name" - ) - required_arguments.add_argument( - "--config-dir", - "-cdir", - required=True, - help="Directory path to store YAML config files. " - "GCS: Provide a full gs:// path of the target directory. " - "Eg: `gs:///partiitons_dir`. " - "Local: Provide a relative path of the target directory. " - "Eg: `partitions_dir`", - ) - required_arguments.add_argument( - "--partition-num", - "-pn", - required=True, - help="Number of partitions/config files to generate, a number from 2 to 10,000", - type=_check_no_partitions, - metavar="[2-10000]", - ) - - # Optional arguments - optional_arguments.add_argument( - "--bq-result-handler", - "-bqrh", - help="BigQuery result handler config details", - ) - optional_arguments.add_argument( - "--labels", "-l", help="Key value pair labels for validation run" - ) - optional_arguments.add_argument( - "--service-account", - "-sa", - help="Path to SA key file for result handler output", - ) - optional_arguments.add_argument( - "--format", - "-fmt", - default="table", - help=( - "Set the format for printing command output, Supported formats are " - "(text, csv, json, table). Defaults to table" - ), - ) - optional_arguments.add_argument( - "--filter-status", - "-fs", - # TODO: update if we start to support other statuses - help=( - "Comma separated list of statuses to filter the validation results. " - "Supported statuses are (success, fail). If no list is provided, " - "all statuses are returned" - ), - ) - - def get_connection_config_from_args(args): """Return dict with connection config supplied.""" config = {