From d6c40f1f780f634c9b31a5a6ae9e0fc9312eb5ee Mon Sep 17 00:00:00 2001 From: Moukhtar Ahmed <100784684+mokhahmed@users.noreply.github.com> Date: Mon, 27 Feb 2023 12:05:11 +0100 Subject: [PATCH] feat: gcp secret manger support for DVT (#704) * adding support for secret manger [GCP] * adding support for secret manger [GCP] * adding support for secret manger [GCP] * adding support for secret manger [GCP] * Revert "adding support for secret manger [GCP]" This reverts commit b959cdaf5b4f7953e11cd7a9abef60329dc48399. * adding support for secret manger [GCP] * Merge remote-tracking branch 'origin/develop' into secret-manager # Conflicts: # docs/connections.md * Merge remote-tracking branch 'origin/develop' into secret-manager # Conflicts: # docs/connections.md * code review fixes - fix manger typo to manager - rename argument description - add google-cloud-secret-manager to requirement file * code review fixes - revert the naming of variables connections.md * code review fixes - revert the naming of variables connections.md * code review fixes - revert the naming of variables connections.md * code review fixes - revert the naming of variables connections.md * code review fixes - add test cases for secret manager class * code review fixes - revert the naming of variables connections.md * code review fixes - move secret manager testcases * code review fixes - revert the naming of variables connections.md --- data_validation/cli_tools.py | 22 ++++- data_validation/clients.py | 26 ++++-- data_validation/config_manager.py | 1 - data_validation/consts.py | 2 + data_validation/partition_builder.py | 2 - .../query_builder/query_builder.py | 1 - data_validation/result_handlers/bigquery.py | 1 - data_validation/secret_manager.py | 52 ++++++++++++ docs/connections.md | 85 ++++++++++++++++++- noxfile.py | 12 +++ setup.py | 1 + tests/system/test_secret_manager.py | 33 +++++++ tests/unit/test_cli_tools.py | 2 - 13 files changed, 223 insertions(+), 17 deletions(-) create mode 100644 data_validation/secret_manager.py create mode 100644 tests/system/test_secret_manager.py diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py index 2a6ea521e..a96a7d362 100644 --- a/data_validation/cli_tools.py +++ b/data_validation/cli_tools.py @@ -332,16 +332,28 @@ def _configure_validation_config_parser(subparsers): def _configure_connection_parser(subparsers): """Configure the Parser for Connection Management.""" + connection_parser = subparsers.add_parser( "connections", help="Manage & Store connections to your Databases" ) connect_subparsers = connection_parser.add_subparsers(dest="connect_cmd") _ = connect_subparsers.add_parser("list", help="List your connections") - add_parser = connect_subparsers.add_parser("add", help="Store a new connection") add_parser.add_argument( "--connection-name", "-c", help="Name of connection used as reference" ) + add_parser.add_argument( + "--secret-manager-type", + "-sm", + default=None, + help="Secret manager type to store credentials by default will be None ", + ) + add_parser.add_argument( + "--secret-manager-project-id", + "-sm-prj-id", + default=None, + help="Project ID for the secret manager that stores the credentials", + ) _configure_database_specific_parsers(add_parser) @@ -909,7 +921,13 @@ def _add_common_partition_arguments(optional_arguments, required_arguments): def get_connection_config_from_args(args): """Return dict with connection config supplied.""" - config = {consts.SOURCE_TYPE: args.connect_type} + config = { + consts.SOURCE_TYPE: args.connect_type, + consts.SECRET_MANAGER_TYPE: getattr(args, consts.SECRET_MANAGER_TYPE), + consts.SECRET_MANAGER_PROJECT_ID: getattr( + args, consts.SECRET_MANAGER_PROJECT_ID + ), + } if args.connect_type == "Raw": return json.loads(args.json) diff --git a/data_validation/clients.py b/data_validation/clients.py index f2a36e3e1..177e4494a 100644 --- a/data_validation/clients.py +++ b/data_validation/clients.py @@ -30,8 +30,8 @@ from ibis.backends.postgres.client import PostgreSQLClient from third_party.ibis.ibis_cloud_spanner.api import connect as spanner_connect from third_party.ibis.ibis_impala.api import impala_connect - from data_validation import client_info, consts, exceptions +from data_validation.secret_manager import SecretManagerBuilder ibis.options.sql.default_limit = None @@ -224,12 +224,28 @@ def get_data_client(connection_config): """Return DataClient client from given configuration""" connection_config = copy.deepcopy(connection_config) source_type = connection_config.pop(consts.SOURCE_TYPE) + secret_manager_type = connection_config.pop(consts.SECRET_MANAGER_TYPE, None) + secret_manager_project_id = connection_config.pop( + consts.SECRET_MANAGER_PROJECT_ID, None + ) + + decrypted_connection_config = {} + if secret_manager_type is not None: + sm = SecretManagerBuilder().build(secret_manager_type.lower()) + for config_item in connection_config: + decrypted_connection_config[config_item] = sm.maybe_secret( + secret_manager_project_id, connection_config[config_item] + ) + else: + decrypted_connection_config = connection_config # The ibis_bigquery.connect expects a credentials object, not a string. - if consts.GOOGLE_SERVICE_ACCOUNT_KEY_PATH in connection_config: - key_path = connection_config.pop(consts.GOOGLE_SERVICE_ACCOUNT_KEY_PATH) + if consts.GOOGLE_SERVICE_ACCOUNT_KEY_PATH in decrypted_connection_config: + key_path = decrypted_connection_config.pop( + consts.GOOGLE_SERVICE_ACCOUNT_KEY_PATH + ) if key_path: - connection_config[ + decrypted_connection_config[ "credentials" ] = google.oauth2.service_account.Credentials.from_service_account_file( key_path @@ -242,7 +258,7 @@ def get_data_client(connection_config): raise Exception(msg) try: - data_client = CLIENT_LOOKUP[source_type](**connection_config) + data_client = CLIENT_LOOKUP[source_type](**decrypted_connection_config) data_client._source_type = source_type except Exception as e: msg = 'Connection Type "{source_type}" could not connect: {error}'.format( diff --git a/data_validation/config_manager.py b/data_validation/config_manager.py index 6bbd95286..8545c0f83 100644 --- a/data_validation/config_manager.py +++ b/data_validation/config_manager.py @@ -500,7 +500,6 @@ def build_column_configs(self, columns): casefold_target_columns = {x.casefold(): str(x) for x in target_table.columns} for column in columns: - if column.casefold() not in casefold_source_columns: raise ValueError(f"Grouped Column DNE in source: {column}") if column.casefold() not in casefold_target_columns: diff --git a/data_validation/consts.py b/data_validation/consts.py index d8e356348..ffbfea5d6 100644 --- a/data_validation/consts.py +++ b/data_validation/consts.py @@ -15,6 +15,8 @@ # Configuration Fields SOURCE_TYPE = "source_type" +SECRET_MANAGER_TYPE = "secret_manager_type" +SECRET_MANAGER_PROJECT_ID = "secret_manager_project_id" CONFIG = "config" CONFIG_FILE = "config_file" CONFIG_SOURCE_CONN_NAME = "source_conn_name" diff --git a/data_validation/partition_builder.py b/data_validation/partition_builder.py index 60466bbcb..4b4e9412a 100644 --- a/data_validation/partition_builder.py +++ b/data_validation/partition_builder.py @@ -26,7 +26,6 @@ class PartitionBuilder: def __init__(self, config_managers: List[ConfigManager], args: Namespace) -> None: - self.config_managers = config_managers self.table_count = len(config_managers) self.args = args @@ -97,7 +96,6 @@ def _get_partition_key_filters(self) -> List[List[str]]: master_filter_list = [] for config_manager in self.config_managers: - validation_builder = ValidationBuilder(config_manager) source_partition_row_builder = PartitionRowBuilder( diff --git a/data_validation/query_builder/query_builder.py b/data_validation/query_builder/query_builder.py index 3044f154d..8aaae5f77 100644 --- a/data_validation/query_builder/query_builder.py +++ b/data_validation/query_builder/query_builder.py @@ -261,7 +261,6 @@ def compile(self, ibis_table): class CalculatedField(object): def __init__(self, ibis_expr, config, fields, cast=None, **kwargs): - """A representation of an calculated field to build a query. Args: diff --git a/data_validation/result_handlers/bigquery.py b/data_validation/result_handlers/bigquery.py index 0dc31fa60..46e686cd9 100644 --- a/data_validation/result_handlers/bigquery.py +++ b/data_validation/result_handlers/bigquery.py @@ -64,7 +64,6 @@ def get_handler_for_project( return BigQueryResultHandler(client, status_list=status_list, table_id=table_id) def execute(self, result_df): - if self._status_list is not None: result_df = filter_validation_status(self._status_list, result_df) diff --git a/data_validation/secret_manager.py b/data_validation/secret_manager.py new file mode 100644 index 000000000..a5ec5fbba --- /dev/null +++ b/data_validation/secret_manager.py @@ -0,0 +1,52 @@ +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + + +class SecretManagerBuilder: + def build(self, client_type): + """ + :param client_type: + :return: secret manager instance currently support gcp secret manager + """ + if client_type.lower() == "gcp": + return GCPSecretManager() + else: + raise Exception(f"{client_type} is not supported yet.") + + +class GCPSecretManager: + """ + GCPSecretManager: client to access secrets stored at GCP secret manager + """ + + def __init__(self): + # Import the Secret Manager client library. + from google.cloud import secretmanager + + # Create the Secret Manager client. + self.client = secretmanager.SecretManagerServiceClient() + + def maybe_secret(self, project_id, secret_id, version_id="latest"): + """ + Get information about the given secret. + :return String value with the secret value or the secret id if the secret value if not exists + """ + try: + # Build the resource name of the secret. + name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" + # Access the secret version. + response = self.client.access_secret_version(name=name) + # Return the decoded payload. + payload = response.payload.data.decode("UTF-8") + return payload + except Exception as e: + logging.warning(f"{secret_id} : {e}") + return secret_id diff --git a/docs/connections.md b/docs/connections.md index 334894500..fcb925c21 100644 --- a/docs/connections.md +++ b/docs/connections.md @@ -16,8 +16,13 @@ eg. The following commands can be used to create connections: ## Command template to create a connection: +Secret manager flags are optional + +--secret-manager-type +--secret-manager-project-id + ``` -data-validation connections add --connection-name CONN_NAME source-type +data-validation connections add --connection-name CONN_NAME source-type --secret-manager-type --secret-manager-project-id ``` ## Create a sample BigQuery connection: @@ -68,13 +73,19 @@ Below is the expected configuration for each type. ``` { # Raw JSON config for a connection - "json": '{"source-type": "BigQuery", "project-id": "pso-kokoro-resources", "google-service-account-key-path": null}' - + "json": '{ "secret_manager_type": null, "secret_manager_project_id": null, "source-type": "BigQuery", "project-id": "pso-kokoro-resources", "google-service-account-key-path": null}' +} ``` ## Google BigQuery ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "BigQuery", @@ -99,6 +110,12 @@ Below is the expected configuration for each type. ## Google Spanner ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager type + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "Spanner", @@ -124,6 +141,13 @@ Please note that Teradata is not-native to this package and must be installed via `pip install teradatasql` if you have a license. ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + + # Configuration Required for All Data Sources "source-type": "Teradata", @@ -142,6 +166,12 @@ Please note the Oracle package is not installed by default. You will need to fol Then `pip install cx_Oracle`. ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "Oracle", @@ -160,6 +190,12 @@ Please note the MSSQL Server package is not installed by default. You will need Then `pip install pyodbc`. ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "MSSQL", @@ -176,6 +212,12 @@ Then `pip install pyodbc`. ## Postgres ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "Postgres", @@ -192,6 +234,12 @@ Then `pip install pyodbc`. Please note AlloyDB supports same connection config as Postgres. ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "Postgres", @@ -207,6 +255,12 @@ Please note AlloyDB supports same connection config as Postgres. ## MySQL ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "MySQL", @@ -222,6 +276,12 @@ Please note AlloyDB supports same connection config as Postgres. ## Redshift ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "Redshift", @@ -254,6 +314,12 @@ Please note AlloyDB supports same connection config as Postgres. ## Impala ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "Impala", @@ -277,6 +343,13 @@ Please note that for Group By validations, the following property must be set in ``` { + + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Hive is based off Impala connector "source-type": "Impala", @@ -292,6 +365,12 @@ Only Hive >=0.11 is supported due to [impyla](https://github.com/cloudera/impyla ## DB2 ``` { + # secret manager type + "secret_manager_type": "GCP", + + # secret manager project id + "secret_manager_project_id": "secrets-project-id", + # Configuration Required for All Data Sources "source-type": "DB2", diff --git a/noxfile.py b/noxfile.py index 28c4bc6e0..75a421259 100644 --- a/noxfile.py +++ b/noxfile.py @@ -256,3 +256,15 @@ def integration_hive(session): raise Exception("Expected Env Var: %s" % env_var) session.run("pytest", "tests/system/data_sources/test_hive.py", *session.posargs) + + +@nox.session(python=random.choice(PYTHON_VERSIONS), venv_backend="venv") +def integration_secrets(session): + """ + Run SecretManager integration tests. + Ensure the SecretManager is running as expected. + """ + _setup_session_requirements(session, extra_packages=[]) + + test_path = "tests/system/test_secret_manager.py" + session.run("pytest", test_path, *session.posargs) diff --git a/setup.py b/setup.py index 6b7d4cd63..f51a9cdc8 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,7 @@ "tabulate==0.8.9", "Flask==2.0.2", "parsy==2.0", + "google-cloud-secret-manager<=2.15.0", ] extras_require = { diff --git a/tests/system/test_secret_manager.py b/tests/system/test_secret_manager.py new file mode 100644 index 000000000..d43c705a9 --- /dev/null +++ b/tests/system/test_secret_manager.py @@ -0,0 +1,33 @@ +import os +import pytest +from data_validation import secret_manager + + +project_id = os.getenv("project_id") + + +def test_secret_manager_not_supported(): + """Test build secret manger.""" + with pytest.raises(Exception) as ex: + client_type = "test_manager" + secret_manager.SecretManagerBuilder().build(client_type) + assert f"{client_type} is not supported yet." == str(ex.value) + + +def test_access_gcp_secret_exists(): + """Test maybeSecret with a secret exists.""" + client_type = "gcp" + secret_id = "db_test_user" + expected_secret_value = os.getenv("secret_value") + manager = secret_manager.SecretManagerBuilder().build(client_type) + secret_value = manager.maybe_secret(project_id, secret_id) + assert secret_value == expected_secret_value + + +def test_access_gcp_secret_not_exists(): + """Test maybeSecret with a secret not exists.""" + client_type = "gcp" + secret_id = "db_test_user_not_exists" + manager = secret_manager.SecretManagerBuilder().build(client_type) + secret_value = manager.maybe_secret(project_id, secret_id) + assert secret_value == secret_id diff --git a/tests/unit/test_cli_tools.py b/tests/unit/test_cli_tools.py index a27c30118..cd1ed03e8 100644 --- a/tests/unit/test_cli_tools.py +++ b/tests/unit/test_cli_tools.py @@ -115,7 +115,6 @@ def test_get_connection_config_from_args(): def test_create_and_list_connections(caplog, fs): - caplog.set_level(logging.INFO) # Create Connection parser = cli_tools.configure_arg_parser() @@ -144,7 +143,6 @@ def test_configure_arg_parser_list_and_run_validation_configs(): def test_create_and_list_and_get_validations(caplog, fs): - caplog.set_level(logging.INFO) # Create validation config file cli_tools.store_validation("example_validation.yaml", TEST_VALIDATION_CONFIG)