From 90762862b476d7fba531affa4f35985a51add0e4 Mon Sep 17 00:00:00 2001 From: Dylan Hercher Date: Thu, 22 Jul 2021 12:29:54 -0700 Subject: [PATCH] feat: logic to deploy dvt on Cloud Run (#280) * feat: logic to deploy dvt on Cloud Run * feat: logic to deploy dvt on Cloud Run * Update samples/run/README.md Co-authored-by: Neha Nene * Update samples/run/README.md Co-authored-by: Neha Nene * remove setting project_id * remove kokoro project * Update samples/run/test.py Co-authored-by: Neha Nene * remove pso kokoro project * add more details to docs * feat: logic to deploy dvt on Cloud Run w/ Oracle example commented out. * feat: logic to deploy dvt on Cloud Run w/ Oracle example commented out. Co-authored-by: Neha Nene --- samples/run/Dockerfile | 42 ++++++++++++++ samples/run/README.md | 126 +++++++++++++++++++++++++++++++++++++++++ samples/run/deploy.sh | 6 ++ samples/run/main.py | 75 ++++++++++++++++++++++++ samples/run/test.py | 63 +++++++++++++++++++++ 5 files changed, 312 insertions(+) create mode 100644 samples/run/Dockerfile create mode 100644 samples/run/README.md create mode 100755 samples/run/deploy.sh create mode 100644 samples/run/main.py create mode 100644 samples/run/test.py diff --git a/samples/run/Dockerfile b/samples/run/Dockerfile new file mode 100644 index 000000000..8b5fcb3a2 --- /dev/null +++ b/samples/run/Dockerfile @@ -0,0 +1,42 @@ +FROM python:3.9-slim + +# Allow statements and log messages to immediately appear in the Knative logs +ENV PYTHONUNBUFFERED True + +# Copy local code to the container image. +ENV APP_HOME /app +WORKDIR $APP_HOME +COPY . ./ + +# Install production dependencies. +RUN apt-get update \ + && apt-get install gcc -y \ + && apt-get clean +RUN pip install --upgrade pip +RUN pip install Flask gunicorn google_pso_data_validator + +# Oracle Dependencies +# if you are using Oracle you should add .rpm files +# under your license to a directory called oracle/ +# and then uncomment the setup below. 
+ +# ENV ORACLE_SID oracle +# ENV ORACLE_ODBC_VERSION 12.2 +# ENV ORACLE_HOME /usr/lib/oracle/${ORACLE_ODBC_VERSION}/client64 + +# RUN pip install cx_Oracle +# RUN apt-get -y install --fix-missing --upgrade vim alien unixodbc-dev wget libaio1 libaio-dev + +# COPY oracle/*.rpm ./ +# RUN alien -i *.rpm && rm *.rpm \ +# && echo "/usr/lib/oracle/${ORACLE_ODBC_VERSION}/client64/lib/" > /etc/ld.so.conf.d/oracle.conf \ +# && ln -s /usr/include/oracle/${ORACLE_ODBC_VERSION}/client64 $ORACLE_HOME/include \ +# && ldconfig -v + + +# Run the web service on container startup. Here we use the gunicorn +# webserver, with one worker process and 8 threads. +# For environments with multiple CPU cores, increase the number of workers +# to be equal to the cores available. +# Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. +CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app diff --git a/samples/run/README.md b/samples/run/README.md new file mode 100644 index 000000000..5b216f99f --- /dev/null +++ b/samples/run/README.md @@ -0,0 +1,126 @@ +# Data Validation on Cloud Run + +### Quick Steps + +The dpeloyment logic is discussed in more detail below; however, to quickly +deploy you follow follow this simple script: + +``` +export PROJECT_ID= +./deploy.sh +python3 test.py +``` + +### Build Docker Image + +You will need to build a Docker image to be used by Cloud Run. In order to add +Teradata or Oracle, you will need to customize the Dockerfile and add your +licensed utilities. + +``` +export PROJECT_ID= +gcloud builds submit --tag gcr.io/${PROJECT_ID}/data-validation \ + --project=${PROJECT_ID} +``` + +### Deploy to Cloud Run + +``` +gcloud run deploy --image gcr.io/${PROJECT_ID}/data-validation \ + --project=${PROJECT_ID} +``` + +### Test Cloud Run Endpoint + +You can easily run a request via Python. 
For a quick test, we have provided this logic in `test.py` to run a validation against a public BigQuery table. The example is similar and also shows how you can forward results to BigQuery from the Cloud Run job:
None, + "target_column": None, + "field_alias": "count", + "type": "count" + }], +} + +url = get_cloud_run_url("data-validation", PROJECT_ID) +res = requests.post( + url, headers={"Authorization": "Bearer " + get_token()}, json=data) +print(res.content.decode()) +``` + +### Oracle Setup + +If you would like to use Data Validation against an Oracle DB you will need to +supply your own license files. To do so: + +1) Create an `oracle` directory and add your .rpm files into it. + +- oracle/oracle-instantclient12.2-basiclite-12.2.0.1.0-1.x86_64.rpm +- oracle/oracle-instantclient12.2-devel-12.2.0.1.0-1.x86_64.rpm +- oracle/oracle-instantclient12.2-odbc-12.2.0.1.0-2.x86_64.rpm + +2) Uncomment all logic in the Dockerfile under the Oracle Dependencies comments + diff --git a/samples/run/deploy.sh b/samples/run/deploy.sh new file mode 100755 index 000000000..ce19056af --- /dev/null +++ b/samples/run/deploy.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +gcloud builds submit --tag gcr.io/${PROJECT_ID}/data-validation \ + --project=${PROJECT_ID} +gcloud run deploy data-validation --image gcr.io/${PROJECT_ID}/data-validation \ + --region=us-central1 --project=${PROJECT_ID} diff --git a/samples/run/main.py b/samples/run/main.py new file mode 100644 index 000000000..3a3579a9b --- /dev/null +++ b/samples/run/main.py @@ -0,0 +1,75 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json
import os

from data_validation import data_validation
import flask
import pandas

app = flask.Flask(__name__)


def _clean_dataframe(df):
    """Serialize a validation results DataFrame to a JSON string.

    pandas.Timestamp values are not JSON serializable, so they are
    converted to strings before dumping.
    """
    # "records" is the documented orient value; the abbreviated "record"
    # was deprecated in pandas 1.4 and removed in pandas 2.0.
    rows = df.to_dict(orient="records")
    for row in rows:
        for key in row:
            if isinstance(row[key], pandas.Timestamp):
                row[key] = str(row[key])

    return json.dumps(rows)


def _get_request_content(request):
    """Return the JSON payload of an incoming request as a dict."""
    return request.json


def validate(config):
    """Run Data Validation against the supplied config.

    Args:
        config (dict): Data Validation config to execute.

    Returns:
        str: JSON string with the validation results.
    """
    validator = data_validation.DataValidation(config)
    df = validator.execute()

    return _clean_dataframe(df)


def main(request):
    """Handle incoming Data Validation requests.

    Args:
        request (flask.Request): HTTP request object whose JSON body
            holds the validation config under the "config" key.
    """
    try:
        config = _get_request_content(request)["config"]
        return validate(config)
    except Exception as e:
        # Cloud Run expects a response body even on failure.
        return "Unknown Error: {}".format(e)


@app.route("/", methods=["POST"])
def run():
    """Cloud Run entrypoint: run a validation from the POSTed config."""
    try:
        config = _get_request_content(flask.request)
        result = validate(config)
        return str(result)
    except Exception as e:
        print(e)
        return "Found Error: {}".format(e)


@app.route("/test", methods=["POST"])
def other():
    """Echo the request payload; useful for verifying a deployment."""
    return _get_request_content(flask.request)


if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080)))
# See the License for the specific language governing permissions and
# limitations under the License.

"""Smoke test: run a column validation against the deployed Cloud Run service.

Sends a BigQuery COUNT validation config for a public dataset to the
`data-validation` Cloud Run service and prints the response body.
"""

import os
import re
import shlex
import subprocess

import requests

PROJECT_ID = os.environ.get("PROJECT_ID")

# gcloud command used to look up the deployed service's metadata.
DESCRIBE_SERVICE = (
    "gcloud run services describe {service_name} "
    "--region=us-central1 --project={project_id}"
)


def _run_gcloud(command):
    """Run a gcloud command (no shell) and return its stdout as text.

    Raises subprocess.CalledProcessError when gcloud exits non-zero,
    instead of silently returning empty output as os.popen would.
    """
    result = subprocess.run(
        shlex.split(command), capture_output=True, text=True, check=True
    )
    return result.stdout


def get_token():
    """Return an identity token for the active gcloud account."""
    return _run_gcloud("gcloud auth print-identity-token").strip()


def get_cloud_run_url(service_name, project_id):
    """Return the HTTPS URL of a deployed Cloud Run service.

    Raises:
        RuntimeError: if no URL line is found in the gcloud output.
    """
    description = _run_gcloud(
        DESCRIBE_SERVICE.format(service_name=service_name, project_id=project_id)
    )
    match = re.search(r"URL:\s*(\S+)", description)
    if not match:
        raise RuntimeError(
            "Could not find service URL in gcloud output:\n" + description
        )
    return match.group(1)


# A simple COUNT validation of a public BigQuery table against itself.
data = {
    "source_conn": {"source_type": "BigQuery", "project_id": PROJECT_ID,},
    "target_conn": {"source_type": "BigQuery", "project_id": PROJECT_ID,},
    "type": "Column",
    "schema_name": "bigquery-public-data.new_york_citibike",
    "table_name": "citibike_stations",
    "target_schema_name": "bigquery-public-data.new_york_citibike",
    "target_table_name": "citibike_stations",
    "aggregates": [
        {
            "source_column": None,
            "target_column": None,
            "field_alias": "count",
            "type": "count",
        }
    ],
}

url = get_cloud_run_url("data-validation", PROJECT_ID)
res = requests.post(url, headers={"Authorization": "Bearer " + get_token()}, json=data)
print(res.content.decode())