diff --git a/.gitignore b/.gitignore index 7f8d6daf4..f60db0632 100644 --- a/.gitignore +++ b/.gitignore @@ -84,6 +84,7 @@ terraform.rc # Custom *.yaml +*.json partitions_dir setup.sh diff --git a/README.md b/README.md index 85f90e55f..0e6a4e934 100644 --- a/README.md +++ b/README.md @@ -53,8 +53,8 @@ setup steps needed to install and use the Data Validation Tool. Before using this tool, you will need to create connections to the source and target tables. Once the connections are created, you can run validations on those tables. Validation results can be printed to stdout (default) or outputted -to BigQuery (recommended). DVT also allows you to save or edit validation -configurations in a YAML file. This is useful for running common validations or +to BigQuery (recommended). DVT also allows you to save and edit validation +configurations in a YAML or JSON file. This is useful for running common validations or updating the configuration. ### Managing Connections @@ -123,7 +123,10 @@ data-validation (--verbose or -v) (--log-level or -ll) validate column If target filter is not provided, the source filter will run on source and target tables. See: *Filters* section [--config-file or -c CONFIG_FILE] - YAML Config File Path to be used for storing validations. + YAML Config File Path to be used for storing validations and other features. + See: *Running DVT with YAML Configuration Files* section + [--config-file-json or -cj CONFIG_FILE_JSON] + JSON Config File Path to be used for storing validations only for application purposes. [--threshold or -th THRESHOLD] Float value. Maximum pct_difference allowed for validation to be considered a success. Defaults to 0.0 [--labels or -l KEY1=VALUE1,KEY2=VALUE2] @@ -159,7 +162,7 @@ and finally hashing the row. Under the hood, row validation uses [Calculated Fields](https://github.com/GoogleCloudPlatform/professional-services-data-validator#calculated-fields) to -apply functions such as IFNULL() or RTRIM(). 
These can be edited in the YAML config to customize your row validation. +apply functions such as IFNULL() or RTRIM(). These can be edited in the YAML or JSON config file to customize your row validation. ``` data-validation (--verbose or -v) (--log-level or -ll) validate row @@ -190,7 +193,10 @@ data-validation (--verbose or -v) (--log-level or -ll) validate row If target filter is not provided, the source filter will run on source and target tables. See: *Filters* section [--config-file or -c CONFIG_FILE] - YAML Config File Path to be used for storing validations. + YAML Config File Path to be used for storing validations and other features. + See: *Running DVT with YAML Configuration Files* section + [--config-file-json or -cj CONFIG_FILE_JSON] + JSON Config File Path to be used for storing validations only for application purposes. [--labels or -l KEY1=VALUE1,KEY2=VALUE2] Comma-separated key value pair labels for the run. [--format or -fmt] Format for stdout output. Supported formats are (text, csv, json, table). @@ -267,7 +273,10 @@ data-validation (--verbose or -v) (--log-level or -ll) validate schema [--service-account or -sa PATH_TO_SA_KEY] Service account to use for BigQuery result handler output. [--config-file or -c CONFIG_FILE] - YAML Config File Path to be used for storing validations. + YAML Config File Path to be used for storing validations and other features. + See: *Running DVT with YAML Configuration Files* section + [--config-file-json or -cj CONFIG_FILE_JSON] + JSON Config File Path to be used for storing validations only for application purposes. [--format or -fmt] Format for stdout output. Supported formats are (text, csv, json, table). Defaults to table. [--filter-status or -fs STATUSES_LIST] @@ -318,7 +327,10 @@ data-validation (--verbose or -v) (--log-level or -ll) validate custom-query col [--service-account or -sa PATH_TO_SA_KEY] Service account to use for BigQuery result handler output. 
[--config-file or -c CONFIG_FILE] - YAML Config File Path to be used for storing validations. + YAML Config File Path to be used for storing validations and other features. + See: *Running DVT with YAML Configuration Files* section + [--config-file-json or -cj CONFIG_FILE_JSON] + JSON Config File Path to be used for storing validations only for application purposes. [--labels or -l KEY1=VALUE1,KEY2=VALUE2] Comma-separated key value pair labels for the run. [--format or -fmt] Format for stdout output. Supported formats are (text, csv, json, table). @@ -377,7 +389,10 @@ data-validation (--verbose or -v) (--log-level or -ll) validate custom-query row [--service-account or -sa PATH_TO_SA_KEY] Service account to use for BigQuery result handler output. [--config-file or -c CONFIG_FILE] - YAML Config File Path to be used for storing validations. + YAML Config File Path to be used for storing validations and other features. + See: *Running DVT with YAML Configuration Files* section + [--config-file-json or -cj CONFIG_FILE_JSON] + JSON Config File Path to be used for storing validations only for application purposes. [--labels or -l KEY1=VALUE1,KEY2=VALUE2] Comma-separated key value pair labels for the run. [--format or -fmt] Format for stdout output. Supported formats are (text, csv, json, table). @@ -426,7 +441,7 @@ The following command creates a YAML file for the validation of the my_bq_conn -tbls bigquery-public-data.new_york_citibike.citibike_trips -c citibike.yaml`. -The vaildation config file is saved to the GCS path specified by the `PSO_DV_CONFIG_HOME` +The validation config file is saved to the GCS path specified by the `PSO_DV_CONFIG_HOME` env variable if that has been set; otherwise, it is saved to wherever the tool is run. You can now edit the YAML file if, for example, the `new_york_citibike` table is @@ -627,7 +642,7 @@ significant figure. Once a calculated field is defined, it can be referenced by other calculated fields at any "depth" or higher. 
Depth controls how many subqueries are executed -in the resulting query. For example, with the following YAML config... +in the resulting query. For example, with the following YAML config: ```yaml - calculated_fields: @@ -648,7 +663,7 @@ in the resulting query. For example, with the following YAML config... depth: 1 # calculated one query above ``` -is equivalent to the following SQL query... +is equivalent to the following SQL query: ```sql SELECT diff --git a/data_validation/__main__.py b/data_validation/__main__.py index 6f17f30e4..8cba4cf1a 100644 --- a/data_validation/__main__.py +++ b/data_validation/__main__.py @@ -45,13 +45,21 @@ def _get_arg_config_file(args): - """Return String yaml config file path.""" + """Return String YAML config file path.""" if not args.config_file: raise ValueError("YAML Config File was not supplied.") return args.config_file +def _get_arg_config_file_json(args): + """Return String JSON config file path.""" + if not args.config_file_json: + raise ValueError("JSON Config File was not supplied.") + + return args.config_file_json + + def get_aggregate_config(args, config_manager: ConfigManager): """Return list of formated aggregation objects. @@ -489,6 +497,26 @@ def convert_config_to_yaml(args, config_managers): return yaml_config +def convert_config_to_json(config_managers) -> dict: + """Return dict objects formatted for json validations. + JSON configs correspond to ConfigManager objects and therefore can only correspond to + one table validation. + + Args: + config_managers (list[ConfigManager]): List of config manager instances. + """ + + if len(config_managers) > 1: + raise ValueError( + "JSON configs can only be created for single table validations." 
+ ) + config_manager = config_managers[0] + json_config = config_manager.config + json_config[consts.CONFIG_SOURCE_CONN] = config_manager.get_source_connection() + json_config[consts.CONFIG_TARGET_CONN] = config_manager.get_target_connection() + return json_config + + def run_validation(config_manager, dry_run=False, verbose=False): """Run a single validation. @@ -552,7 +580,7 @@ def run_validations(args, config_managers): for config_manager in config_managers: if config_manager.config and consts.CONFIG_FILE in config_manager.config: logging.info( - "Currently running the validation for yml file: %s", + "Currently running the validation for YAML file: %s", config_manager.config[consts.CONFIG_FILE], ) try: @@ -580,6 +608,17 @@ def store_yaml_config_file(args, config_managers): cli_tools.store_validation(config_file_path, yaml_configs) +def store_json_config_file(args, config_managers): + """Build a JSON config file from the supplied configs. + + Args: + config_managers (list[ConfigManager]): List of config manager instances. + """ + json_config = convert_config_to_json(config_managers) + config_file_path = _get_arg_config_file_json(args) + cli_tools.store_validation_json(config_file_path, json_config) + + def partition_and_store_config_files(args: Namespace) -> None: """Build multiple YAML Config files using user specified partition logic @@ -597,7 +636,7 @@ def partition_and_store_config_files(args: Namespace) -> None: def run(args) -> None: """Splits execution into: - 1. Build and save single Yaml Config file + 1. Build and save single Config file (YAML or JSON) 2. 
Run Validations Args: @@ -609,6 +648,8 @@ def run(args) -> None: config_managers = build_config_managers_from_args(args) if args.config_file: store_yaml_config_file(args, config_managers) + elif args.config_file_json: + store_json_config_file(args, config_managers) else: run_validations(args, config_managers) diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py index 3fb695ca2..3fc0a1f14 100644 --- a/data_validation/cli_tools.py +++ b/data_validation/cli_tools.py @@ -321,7 +321,7 @@ def _configure_raw_query(subparsers): def _configure_validation_config_parser(subparsers): - """Configure arguments to run a data validation YAML config.""" + """Configure arguments to run a data validation YAML config file.""" validation_config_parser = subparsers.add_parser( "configs", help="Run validations stored in a YAML config file" ) @@ -922,7 +922,12 @@ def _add_common_arguments(optional_arguments, required_arguments): optional_arguments.add_argument( "--config-file", "-c", - help="Store the validation in the YAML Config File Path specified", + help="Store the validation config in the YAML File Path specified", + ) + optional_arguments.add_argument( + "--config-file-json", + "-cj", + help="Store the validation config in the JSON File Path specified to be used for application use cases", ) optional_arguments.add_argument( "--format", @@ -1074,6 +1079,12 @@ def store_validation(validation_file_name, yaml_config): mgr.create_validation_yaml(validation_file_name, yaml_config) +def store_validation_json(validation_file_name, json_config): + """Store the validation JSON config under the given name.""" + mgr = state_manager.StateManager() + mgr.create_validation_json(validation_file_name, json_config) + + def store_partition(target_file_path, yaml_config, target_folder_path=None): """Store the partition YAML config under the given name.""" mgr = state_manager.StateManager(target_folder_path) diff --git a/data_validation/consts.py b/data_validation/consts.py index 
1ecc61870..dcd256089 100644 --- a/data_validation/consts.py +++ b/data_validation/consts.py @@ -19,6 +19,7 @@ SECRET_MANAGER_PROJECT_ID = "secret_manager_project_id" CONFIG = "config" CONFIG_FILE = "config_file" +CONFIG_FILE_JSON = "config_file_json" CONFIG_SOURCE_CONN_NAME = "source_conn_name" CONFIG_TARGET_CONN_NAME = "target_conn_name" CONFIG_SOURCE_CONN = "source_conn" diff --git a/data_validation/state_manager.py b/data_validation/state_manager.py index a2676b4bd..f788e6e6b 100644 --- a/data_validation/state_manager.py +++ b/data_validation/state_manager.py @@ -116,6 +116,17 @@ def create_validation_yaml(self, name: str, yaml_config: Dict[str, str]): yaml_config_str = dump(yaml_config, Dumper=Dumper) self._write_file(validation_path, yaml_config_str) + def create_validation_json(self, name: str, json_config: Dict[str, str]): + """Create a validation file and store the given config as JSON. + + Args: + name (String): The name of the validation. + json_config (Dict): A dictionary with the validation details. + """ + validation_path = self._get_validation_path(name) + json_config_str = json.dumps(json_config) + self._write_file(validation_path, json_config_str) + def create_partition_yaml(self, target_file_path: str, yaml_config: Dict[str, str]): """Create a validation file and store the given config as YAML. @@ -131,7 +142,7 @@ def get_validation_config(self, name: str, config_dir=None) -> Dict[str, str]: """Get a validation configuration from the expected file. Args: - name: The name of the validation. + name: The name of the validation file. Returns: A dict of the validation values from the file. """ diff --git a/samples/airflow/README.md b/samples/airflow/README.md index acdc8e07b..a0a019f5a 100644 --- a/samples/airflow/README.md +++ b/samples/airflow/README.md @@ -15,19 +15,14 @@ By default, the DAG will output the results to BigQuery as a result handler. ### Instructions 1. Download the DAG file in this directory -2. 
Update the JSON configuration for your use case (update table names, etc.) +2. Get the JSON configuration for your use case, explained in the next section 3. Upload it to the DAGs folder in your Airflow environment -### Limitations +## JSON Configuration -The Airflow DAG expects the raw config JSON which is not the same as a YAML config converted to JSON. +The Airflow DAG expects JSON config content, which is not the same as a YAML config converted to JSON format. The parameters in a typical YAML config file for DVT are slightly different from the JSON config file version, which is generated after DVT parses the YAML. -Parameters in a typical YAML config file are slightly different from the raw JSON config, -which is generated after DVT parses the YAML. The [build_config_manager()](https://github.com/GoogleCloudPlatform/professional-services-data-validator/blob/develop/data_validation/config_manager.py#L429) -method generates the JSON config and should be used as a reference. - -Our Cloud Run sample also expects a raw JSON config in the `'data'` variable shown -[here](https://github.com/GoogleCloudPlatform/professional-services-data-validator/tree/develop/samples/run#test-cloud-run-endpoint). +You can get the JSON content specific for your scenario by using our CLI and providing the argument to generate the JSON config file [`--config-file-json` or `-cj <filename>.json`]. IMPORTANT: remember to convert JSON literals to their Python equivalents (for example `null` to `None` and `true`/`false` to `True`/`False`) when pasting the content into Python code; see [this link as a reference](https://python-course.eu/applications-python/json-and-python.php). For example, the following YAML config is equivalent to the JSON config below, where the source param is written as `source_conn`. @@ -58,6 +53,3 @@ validations: ] } ``` - -For more implementation details, [this](https://github.com/GoogleCloudPlatform/professional-services-data-validator/blob/develop/data_validation/config_manager.py#L444) -is where the raw JSON config is generated in the DVT code.
diff --git a/samples/airflow/dvt_airflow_dag.py b/samples/airflow/dvt_airflow_dag.py index e4e79f2e9..7475d786f 100644 --- a/samples/airflow/dvt_airflow_dag.py +++ b/samples/airflow/dvt_airflow_dag.py @@ -56,6 +56,8 @@ def validation_function(project): BQ_CONN = {"source_type": "BigQuery", "project_id": project} + # You can get the JSON content specific for your scenario by using our CLI and providing the argument to generate the JSON config file [`--config-file-json` or `-cj <filename>.json`]. + # IMPORTANT: remember to convert JSON literals to their Python equivalents (for example `null` to `None` and `true`/`false` to `True`/`False`); see this link as a reference: https://python-course.eu/applications-python/json-and-python.php. GROUPED_CONFIG_COUNT_VALID = { # BigQuery Specific Connection Config "source_conn": BQ_CONN, diff --git a/samples/functions/README.md b/samples/functions/README.md index 35313deff..afa2ed023 100644 --- a/samples/functions/README.md +++ b/samples/functions/README.md @@ -32,7 +32,9 @@ cd ../../ ``` ### JSON Configuration -Below is an example of the JSON configuration that can be passed to the Cloud Function. + +Below is an example of the JSON configuration that can be passed to the Cloud Function. You can get the JSON content specific for your scenario by using our CLI and providing the argument to generate the JSON config file [`--config-file-json` or `-cj <filename>.json`]. IMPORTANT: remember to convert JSON literals to their Python equivalents (for example `null` to `None` and `true`/`false` to `True`/`False`) when pasting the content into Python code; see [this link as a reference](https://python-course.eu/applications-python/json-and-python.php). + ```json { "config":{ diff --git a/samples/run/README.md b/samples/run/README.md index 9a3d3ee94..9e1de75e4 100644 --- a/samples/run/README.md +++ b/samples/run/README.md @@ -34,7 +34,7 @@ gcloud run deploy --image gcr.io/${PROJECT_ID}/data-validation \ You can easily run a request via Python. For a quick test, we have provided this logic in `test.py` to run a validation against a public BigQuery table.
The example is similar and also shows how you can forward results to BigQuery from the Cloud Run job: -``` +```python # Copyright 2020 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -75,7 +75,8 @@ def get_cloud_run_url(service_name, project_id): return re.findall("URL:.*\n", description)[0].split()[1].strip() - +# You can get the JSON content specific for your scenario by using our CLI and providing the argument to generate the JSON config file [`--config-file-json` or `-cj <filename>.json`]. +# IMPORTANT: remember to convert JSON literals to their Python equivalents (for example `null` to `None` and `true`/`false` to `True`/`False`); see this link as a reference: https://python-course.eu/applications-python/json-and-python.php. data = { "source_conn": { "source_type": "BigQuery", diff --git a/samples/run/test.py b/samples/run/test.py index 30f0cb046..8c263cfde 100644 --- a/samples/run/test.py +++ b/samples/run/test.py @@ -39,7 +39,8 @@ def get_cloud_run_url(service_name, project_id): return re.findall("URL:.*\n", description)[0].split()[1].strip() - +# You can get the JSON content specific for your scenario by using our CLI and providing the argument to generate the JSON config file [`--config-file-json` or `-cj <filename>.json`]. +# IMPORTANT: remember to convert JSON literals to their Python equivalents (for example `null` to `None` and `true`/`false` to `True`/`False`); see this link as a reference: https://python-course.eu/applications-python/json-and-python.php. data = { "source_conn": { "source_type": "BigQuery", diff --git a/samples/tests/__init__.py b/samples/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/samples/tests/airflow_test_utils.py b/samples/tests/airflow_test_utils.py deleted file mode 100644 index 6d88aaf6a..000000000 --- a/samples/tests/airflow_test_utils.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2018 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Utilities for unit testing DAGs. - -Copied from -https://github.com/GoogleCloudPlatform/python-docs-samples/blob/master/composer/workflows/unit_testing.py -""" - -# [START composer_dag_unit_testing] -from airflow import models - - -def assert_has_valid_dag(module): - """Assert that a module contains a valid DAG.""" - - no_dag_found = True - - for dag in vars(module).values(): - if isinstance(dag, models.DAG): - no_dag_found = False - dag.test_cycle() # Throws if a task cycle is found. - - if no_dag_found: - raise AssertionError("module does not contain a valid DAG") - - -# [END composer_dag_unit_testing] diff --git a/samples/tests/test_airflow_dag.py b/samples/tests/test_airflow_dag.py deleted file mode 100644 index fbb82cec3..000000000 --- a/samples/tests/test_airflow_dag.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright 2022 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import airflow_test_utils - - -def test_dag_import(): - """Test that the DAG file can be successfully imported. 
- This tests that the DAG can be parsed, but does not run it in an Airflow - environment. This is a recommended sanity check by the official Airflow - docs: https://airflow.incubator.apache.org/tutorial.html#testing - """ - from ..airflow import dvt_airflow_dag as module - - airflow_test_utils.assert_has_valid_dag(module) diff --git a/tests/system/data_sources/test_bigquery.py b/tests/system/data_sources/test_bigquery.py index ef7e3689b..873180620 100644 --- a/tests/system/data_sources/test_bigquery.py +++ b/tests/system/data_sources/test_bigquery.py @@ -403,6 +403,34 @@ LIMIT 10 """.strip() +TEST_JSON_VALIDATION_CONFIG = { + consts.CONFIG_TYPE: "Column", + consts.CONFIG_SOURCE_CONN_NAME: "mock-conn", + consts.CONFIG_TARGET_CONN_NAME: "mock-conn", + consts.CONFIG_TABLE_NAME: "dvt_core_types", + consts.CONFIG_SCHEMA_NAME: "pso_data_validator", + consts.CONFIG_TARGET_SCHEMA_NAME: "pso_data_validator", + consts.CONFIG_TARGET_TABLE_NAME: "dvt_core_types", + consts.CONFIG_LABELS: [], + consts.CONFIG_THRESHOLD: 0.0, + consts.CONFIG_FORMAT: "table", + consts.CONFIG_RESULT_HANDLER: None, + consts.CONFIG_FILTERS: [], + consts.CONFIG_USE_RANDOM_ROWS: False, + consts.CONFIG_RANDOM_ROW_BATCH_SIZE: None, + consts.CONFIG_FILTER_STATUS: ["fail"], + consts.CONFIG_AGGREGATES: [ + { + consts.CONFIG_SOURCE_COLUMN: None, + consts.CONFIG_TARGET_COLUMN: None, + consts.CONFIG_FIELD_ALIAS: "count", + consts.CONFIG_TYPE: "count", + }, + ], + consts.CONFIG_SOURCE_CONN: BQ_CONN, + consts.CONFIG_TARGET_CONN: BQ_CONN, +} + def test_count_validator(): validator = data_validation.DataValidation(CONFIG_COUNT_VALID, verbose=True) @@ -1271,3 +1299,28 @@ def test_custom_query_validation_core_types(mock_conn): df = validator.execute() # With filter on failures the data frame should be empty assert len(df) == 0 + + +@mock.patch( + "data_validation.state_manager.StateManager.get_connection_config", + return_value=BQ_CONN, +) +def test_column_validation_convert_config_to_json(mock_conn): + parser = 
cli_tools.configure_arg_parser() + args = parser.parse_args( + [ + "validate", + "column", + "-sc=mock-conn", + "-tc=mock-conn", + "-tbls=pso_data_validator.dvt_core_types", + "--filter-status=fail", + "--config-file-json=bq-column-validation.json", + ] + ) + config_managers = main.build_config_managers_from_args(args) + assert len(config_managers) == 1 + + json_config = main.convert_config_to_json(config_managers) + # assert structure + assert json_config == TEST_JSON_VALIDATION_CONFIG diff --git a/tests/unit/test__main.py b/tests/unit/test__main.py index 9d3e132ce..b9450efee 100644 --- a/tests/unit/test__main.py +++ b/tests/unit/test__main.py @@ -77,7 +77,6 @@ "kube_completions": True, "config_dir": "gs://pso-kokoro-resources/resources/test/unit/test__main/3validations", } - CONFIG_RUNNER_ARGS_3 = { "verbose": False, "log_level": "INFO",