From 177977207aa75318a231889e958e78b43510a4a5 Mon Sep 17 00:00:00 2001 From: Neha Nene Date: Fri, 19 May 2023 12:49:09 -0400 Subject: [PATCH] fix: dry-run bug when running configs, added CODEOWNERS, and docs (#865) --- CODEOWNERS | 13 ++ data_validation/__main__.py | 2 +- data_validation/cli_tools.py | 6 + docs/connections.md | 426 +++++++++++++---------------------- 4 files changed, 180 insertions(+), 267 deletions(-) create mode 100644 CODEOWNERS diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000..d7096563f --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,13 @@ +# Order is important; the last matching pattern takes the most +# precedence. + +# The main branch in this repo is protected and requires reviews +# from code owners. This means that any pull request +# which affects code with a code owner must be approved +# by that code owner before the pull request can be merged +# into the protected branch. + +# Pull requests which modify any of the following +# file types must be approved by one of the listed owner(s): + +*.py @GoogleCloudPlatform/pso-code-review-committee \ No newline at end of file diff --git a/data_validation/__main__.py b/data_validation/__main__.py index 9abc6d7e4..29f1b21fc 100644 --- a/data_validation/__main__.py +++ b/data_validation/__main__.py @@ -466,7 +466,7 @@ def run_validations(args, config_managers): ) except Exception as e: logging.error( - "Error %s occured while running config file %s. Skipping it for now.", + "Error %s occurred while running config file %s. 
Skipping it for now.", str(e), config_manager.config[consts.CONFIG_FILE], ) diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py index 59d247b07..cc754310a 100644 --- a/data_validation/cli_tools.py +++ b/data_validation/cli_tools.py @@ -327,6 +327,12 @@ def _configure_validation_config_parser(subparsers): run_parser = configs_subparsers.add_parser( "run", help="Run your validation configs" ) + run_parser.add_argument( + "--dry-run", + "-dr", + action="store_true", + help="Prints source and target SQL to stdout in lieu of performing a validation.", + ) run_parser.add_argument( "--config-file", "-c", diff --git a/docs/connections.md b/docs/connections.md index 9b18baf63..27e8fcb3f 100644 --- a/docs/connections.md +++ b/docs/connections.md @@ -13,26 +13,19 @@ To do so simply add the GCS path to the environment. Note that if this path is s eg. `export PSO_DV_CONFIG_HOME=gs://my-bucket/my/connections/path/` -The following commands can be used to create connections: +## Using GCP Secret Manager +DVT supports [Google Cloud Secret Manager](https://cloud.google.com/secret-manager) for storing and referencing secrets in your connection configuration. -## Command template to create a connection: -Secret manager flags are optional +If the secret-manager flags are present, the remaining connection flags should reference secret names instead of the secret itself. For example, +the following BigQuery connection references a secret with name 'dvt-project-id' stored in project MY-PROJECT. 
---secret-manager-type ---secret-manager-project-id - -``` -data-validation connections add --secret-manager-type --secret-manager-project-id --connection-name CONN_NAME source-type -``` - -## Create a sample BigQuery connection: -``` -data-validation connections add --connection-name MY_BQ_CONN BigQuery --project-id MY_GCP_PROJECT ``` +data-validation connections add \ + --secret-manager-type GCP \ + --secret-manager-project-id MY-PROJECT \ + --connection-name bq BigQuery \ + --project-id 'dvt-project-id' -## Create a sample Teradata connection: -``` -data-validation connections add --connection-name MY_TD_CONN Teradata --host HOST_IP --port PORT --user-name USER-NAME --password PASSWORD ``` ## List existing connections @@ -60,44 +53,36 @@ The data validation tool supports the following connection types. * [DB2](#DB2) * [AlloyDB](#AlloyDB) -As you see above, Teradata and BigQuery have different sets of custom arguments (for example project-id for BQ versus host for Teradata). -Every connection type requires its own configuration for connectivity. To find out the parameters for each connection type, use the following command. +Every connection type requires its own configuration for connectivity. To find out the parameters for each connection type, use the following command: ``` data-validation connections add -c CONN_NAME -h ``` -Below is the expected configuration for each type. +Below are the connection parameters for each database. 
## Raw ``` -{ - # Raw JSON config for a connection - "json": '{ "secret_manager_type": null, "secret_manager_project_id": null, "source-type": "BigQuery", "project-id": "pso-kokoro-resources", "google-service-account-key-path": null}' -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Raw Connection name + --json JSON Raw JSON for connection ``` +The raw JSON can also be found in the connection config file. For example, +`'{"source_type": "BigQuery", "project_id": "my-project-id"}'` ## Google BigQuery ``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "BigQuery", - - # BigQuery Specific Connection Config - "project-id": "my-project-name", - - # (Optional) BigQuery JSON Config File for On-Prem usecases - "google-service-account-key-path": "/path/to/key.json" -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME BigQuery Connection name + --project-id MY_PROJECT Project ID where BQ data resides + [--google-service-account-key-path PATH_TO_SA_KEY] Path to SA key ``` -### User/Service account needs following BigQuery permissions to run this validator tool: +### User/Service account needs following BigQuery permissions to run DVT: * bigquery.jobs.create (BigQuery JobUser role) * bigquery.readsessions.create (BigQuery Read Session User) * bigquery.tables.get (BigQuery Data Viewer) @@ -107,85 +92,55 @@ Below is the expected configuration for each type. 
* bigquery.tables.update (BigQuery Data Editor) * bigquery.tables.updateData (BigQuery Data Editor) + ## Google Spanner ``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager type - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "Spanner", - - # GCP Project to use for Spanner - "project-id": "my-project-name", - - # ID of Spanner instance to connect to - "instance-id": "my-instance-id", - - # ID of Spanner database (schema) to connect to - "database-id": "my-database-id", - - # (Optional) Spanner JSON Config File for On-Prem usecases - "google-service-account-key-path": "/path/to/key.json" -} -``` - -### User/Service account needs following Spanner role to run this validator tool: +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Spanner Connection name + --project-id MY_PROJECT Project ID where Spanner data resides + --instance-id MY_INSTANCE Spanner instance to connect to + --database-id MY-DB Spanner database (schema) to connect to + [--google-service-account-key-path PATH_TO_SA_KEY] Path to SA key +``` + +### User/Service account needs following Spanner role to run DVT: * roles/spanner.databaseReader ## Teradata Please note that Teradata is not-native to this package and must be installed via `pip install teradatasql` if you have a license. 
+ ``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - - # Configuration Required for All Data Sources - "source-type": "Teradata", - - # Connection Details - "host": "127.0.0.1", - "port":1025, - # (Optional) - "logmech":"TD2", - "user-name":"my-user", - "password":"my-password" -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Teradata Connection name + --host HOST Teradata host + --port PORT Teradata port, defaults to 1025 + --user-name USER Teradata user + --password PASSWORD Teradata password + [--logmech LOGMECH] Teradata logmech, defaults to "TD2" ``` ## Oracle Please note the Oracle package is not installed by default. You will need to follow [cx_Oracle](https://cx-oracle.readthedocs.io/en/latest/user_guide/installation.html) installation steps. Then `pip install cx_Oracle`. 
``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "Oracle", - - # Connection Details - "host": "127.0.0.1", - "port":1521, - "user":"my-user", - "password":"my-password", - "database": "XE", - -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Oracle Connection name + --host HOST Oracle host + --port PORT Oracle port, defaults to 1521 + --user USER Oracle user + --password PASSWORD Oracle password + --database DATABASE Oracle database ``` -### Oracle User permissions to run the validator tool: + +### Oracle User permissions to run DVT: * CREATE SESSION * READ or SELECT on any tables to be validated * Optional - Read on SYS.V_$TRANSACTION (required to get isolation level, if privilege is not given then will default to Read Commited, [more_details](https://docs.sqlalchemy.org/en/14/dialects/oracle.html#transaction-isolation-level-autocommit)) @@ -193,158 +148,102 @@ Then `pip install cx_Oracle`. ## MSSQL Server Please note the MSSQL Server package is not installed by default. You will need to follow [SQL Server](https://docs.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server) installation steps. Then `pip install pyodbc`. 
-``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "MSSQL", - - # Connection Details - "host": "127.0.0.1", - "port": 1433, - "user": "my-user", - "password": "my-password", - "database": "my-db", -} +``` +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME MSSQL Connection name + --host HOST MSSQL host + --port PORT MSSQL port, defaults to 1433 + --user USER MSSQL user + --password PASSWORD MSSQL password + --database DATABASE MSSQL database ``` ## Postgres ``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "Postgres", - - # Connection Details - "host": "127.0.0.1", - "port":5432, - "user": "my-user", - "password": "my-password", - "database":"my-db" -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Postgres Connection name + --host HOST Postgres host + --port PORT Postgres port, defaults to 5432 + --user USER Postgres user + --password PASSWORD Postgres password + --database DATABASE Postgres database ``` ## AlloyDB Please note AlloyDB supports same connection config as Postgres. 
``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "Postgres", - - # Connection Details - "host": "127.0.0.1", - "port":5432, - "user": "my-user", - "password": "my-password", - "database":"my-db" -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Postgres Connection name + --host HOST Postgres host + --port PORT Postgres port, defaults to 5432 + --user USER Postgres user + --password PASSWORD Postgres password + --database DATABASE Postgres database ``` ## MySQL ``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "MySQL", - - # Connection Details - "host": "127.0.0.1", - "port":3306 - "user": "my-user", - "password": "my-password", - "database":"my-db" -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME MySQL Connection name + --host HOST MySQL host + --port PORT MySQL port, defaults to 3306 + --user USER MySQL user + --password PASSWORD MySQL password + --database DATABASE MySQL database ``` ## Redshift ``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "Redshift", - - # Connection Details - "host": "127.0.0.1", - "port":5439, - "user": "my-user", - "password": "my-password", - "database":"my-db" -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, 
GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Redshift Connection name + --host HOST Redshift host + --port PORT Redshift port, defaults to 5439 + --user USER Redshift user + --password PASSWORD Redshift password + --database DATABASE Redshift database ``` ## FileSystem (CSV or JSON only) ``` -{ - # Configuration Required for All Data Sources - "source-type": "FileSystem", - - # Table name to use as a reference for file data - "table-name": "my-table-name", - - # The local, s3, or GCS file path to the data - "file-path": "gs://path/to/file", - - # The file type. Either 'csv' or 'json' - "file-type":"csv" -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME FileSystem Connection name + --table-name TABLE_NAME Table name to use as reference for file data + --file-path FILE_PATH Local, GCS, or S3 file path + --file-type FILE_TYPE File type (CSV, JSON) ``` ## Impala ``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "Impala", - - # Connection Details - "host": "127.0.0.1", - "port": 10000, - "database": "default", - "auth-mechanism": "PLAIN", - - # (Optional) - "use-ssl": False, - "timeout": 45, - "ca-cert": "path-certificate", - "user": "user", - "password": "password", - "pool-size": 10, - "hdfs-client": "hdfs-client", - "use-http-transport": False, - "http-path": "", -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Impala Connection name + --host HOST Impala host + --port PORT Impala port + --database DATABASE Impala database, defaults to 
"default" + [--auth-mechanism AUTH_MECH] Auth mechanism, defaults to "PLAIN" + [--user USER] Impala user + [--password PASSWORD] Impala password + [--use-ssl USE_SSL] Use SSL (True, False) + [--timeout TIMEOUT] Timeout, defaults to 45 + [--ca-cert CA_CERT] CA Cert + [--pool-size POOL_SIZE] Impala pool size, defaults to 8 + [--hdfs-client CLIENT] HDFS client + [--use-http-transport TRANSPORT] HTTP Transport (True, False) + [--http-path PATH] HTTP Path ``` ## Hive @@ -357,46 +256,41 @@ Please note that for Group By validations, the following property must be set in pip install ibis-framework[impala] ``` -``` -{ - - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Hive is based off Impala connector - "source-type": "Impala", + Hive connections are based on the Ibis Impala connection which uses [impyla](https://github.com/cloudera/impyla). + Only Hive >=0.11 is supported due to impyla's dependency on HiveServer2. 
- # Connection Details - "host": "HIVE-IP-ADDRESS", - "port": 10000, - "database": "default", - "auth-mechanism":"PLAIN" -} + ``` +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME Impala Connection name + --host HOST Hive host + --port PORT Hive port, defaults to 10000 + --database DATABASE Hive database, defaults to "default" + [--auth-mechanism AUTH_MECH] Auth mechanism, defaults to "PLAIN" + [--user USER] Hive user + [--password PASSWORD] Hive password + [--use-ssl USE_SSL] Use SSL (True, False) + [--timeout TIMEOUT] Timeout, defaults to 45 + [--ca-cert CA_CERT] CA Cert + [--pool-size POOL_SIZE] Hive pool size, defaults to 8 + [--hdfs-client CLIENT] HDFS client + [--use-http-transport TRANSPORT] HTTP Transport (True, False) + [--http-path PATH] HTTP Path ``` -Only Hive >=0.11 is supported due to [impyla](https://github.com/cloudera/impyla)'s dependency on HiveServer2. + ## DB2 ``` -{ - # secret manager type - "secret_manager_type": "GCP", - - # secret manager project id - "secret_manager_project_id": "secrets-project-id", - - # Configuration Required for All Data Sources - "source-type": "DB2", - - # Connection Details - "host": "localhost", - "port": 50000, - "driver": "ibm-db-sa", - "user": "my-username", - "password": "my-password", - "database": "my-db", - "url": "my-url", -} +data-validation connections add + [--secret-manager-type ] Secret Manager type (None, GCP) + [--secret-manager-project-id SECRET_PROJECT_ID] Secret Manager project ID + --connection-name CONN_NAME DB2 Connection name + --host HOST DB2 host + --port PORT DB2 port, defaults to 50000 + --user USER DB2 user + --password PASSWORD DB2 password + --database DATABASE DB2 database + [--url URL] URL link in DB2 to connect to + [--driver DRIVER] DB2 driver, defaults to "ibm_db_sa" ``` \ No newline at end of file