-
Notifications
You must be signed in to change notification settings - Fork 109
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: logic to deploy dvt on Cloud Run (#280)
* feat: logic to deploy dvt on Cloud Run * feat: logic to deploy dvt on Cloud Run * Update samples/run/README.md Co-authored-by: Neha Nene <[email protected]> * Update samples/run/README.md Co-authored-by: Neha Nene <[email protected]> * remove setting project_id * remove kokoro project * Update samples/run/test.py Co-authored-by: Neha Nene <[email protected]> * remove pso kokoro project * add more details to docs * feat: logic to deploy dvt on Cloud Run w/ Oracle example commented out. * feat: logic to deploy dvt on Cloud Run w/ Oracle example commented out. Co-authored-by: Neha Nene <[email protected]>
- Loading branch information
1 parent
eaa052f
commit 9076286
Showing
5 changed files
with
312 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Container image for running the Data Validation Tool (DVT) as a
# Flask + gunicorn web service on Cloud Run.
FROM python:3.9-slim

# Allow statements and log messages to immediately appear in the Knative logs
ENV PYTHONUNBUFFERED True

# Copy local code to the container image.
ENV APP_HOME /app
WORKDIR $APP_HOME
COPY . ./

# Install production dependencies.
# NOTE(review): gcc is presumably needed to compile native extensions for
# the pip packages installed below — confirm it is still required.
RUN apt-get update \
  && apt-get install gcc -y \
  && apt-get clean
RUN pip install --upgrade pip
RUN pip install Flask gunicorn google_pso_data_validator

# Oracle Dependencies
# if you are using Oracle you should add .rpm files
# under your license to a directory called oracle/
# and then uncomment the setup below.

# ENV ORACLE_SID oracle
# ENV ORACLE_ODBC_VERSION 12.2
# ENV ORACLE_HOME /usr/lib/oracle/${ORACLE_ODBC_VERSION}/client64

# RUN pip install cx_Oracle
# RUN apt-get -y install --fix-missing --upgrade vim alien unixodbc-dev wget libaio1 libaio-dev

# COPY oracle/*.rpm ./
# RUN alien -i *.rpm && rm *.rpm \
#   && echo "/usr/lib/oracle/${ORACLE_ODBC_VERSION}/client64/lib/" > /etc/ld.so.conf.d/oracle.conf \
#   && ln -s /usr/include/oracle/${ORACLE_ODBC_VERSION}/client64 $ORACLE_HOME/include \
#   && ldconfig -v

# Run the web service on container startup. Here we use the gunicorn
# webserver, with one worker process and 8 threads.
# For environments with multiple CPU cores, increase the number of workers
# to be equal to the cores available.
# Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling.
CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
# Data Validation on Cloud Run | ||
|
||
### Quick Steps | ||
|
||
The deployment logic is discussed in more detail below; however, to quickly
deploy you can follow this simple script:
|
||
``` | ||
export PROJECT_ID=<PROJECT-ID> | ||
./deploy.sh | ||
python3 test.py | ||
``` | ||
|
||
### Build Docker Image | ||
|
||
You will need to build a Docker image to be used by Cloud Run. In order to add | ||
Teradata or Oracle, you will need to customize the Dockerfile and add your | ||
licensed utilities. | ||
|
||
``` | ||
export PROJECT_ID=<PROJECT-ID> | ||
gcloud builds submit --tag gcr.io/${PROJECT_ID}/data-validation \ | ||
--project=${PROJECT_ID} | ||
``` | ||
|
||
### Deploy to Cloud Run | ||
|
||
``` | ||
gcloud run deploy --image gcr.io/${PROJECT_ID}/data-validation \ | ||
--project=${PROJECT_ID} | ||
``` | ||
|
||
### Test Cloud Run Endpoint | ||
|
||
You can easily run a request via Python. For a quick test, we have provided this logic in `test.py` to run a validation against a public BigQuery table. The example below is similar and also shows how you can forward results to BigQuery from the Cloud Run job:
|
||
``` | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
import os | ||
import re | ||
import requests | ||
PROJECT_ID = os.environ.get("PROJECT_ID") | ||
DESCRIBE_SERVICE = """ | ||
gcloud run services describe {service_name} --region=us-central1 --project={project_id} | ||
""" | ||
def get_token(): | ||
with os.popen("gcloud auth print-identity-token") as cmd: | ||
token = cmd.read().strip() | ||
return token | ||
def get_cloud_run_url(service_name, project_id): | ||
describe_service = DESCRIBE_SERVICE.format(service_name=service_name, | ||
project_id=project_id) | ||
with os.popen(describe_service) as service: | ||
description = service.read() | ||
return re.findall("URL:.*\n", description)[0].split()[1].strip() | ||
data = { | ||
"source_conn": { | ||
"source_type": "BigQuery", | ||
"project_id": PROJECT_ID, | ||
}, | ||
"target_conn": { | ||
"source_type": "BigQuery", | ||
"project_id": PROJECT_ID, | ||
}, | ||
"type": "Column", | ||
"schema_name": "bigquery-public-data.new_york_citibike", | ||
"table_name": "citibike_stations", | ||
"target_schema_name": "bigquery-public-data.new_york_citibike", | ||
"target_table_name": "citibike_stations", | ||
"result_handler": { | ||
"type": "BigQuery", | ||
"project_id": PROJECT_ID, | ||
"table_id": "pso_data_validator.results" | ||
}, | ||
"aggregates": [ | ||
{ | ||
"source_column": None, | ||
"target_column": None, | ||
"field_alias": "count", | ||
"type": "count" | ||
}], | ||
} | ||
url = get_cloud_run_url("data-validation", PROJECT_ID) | ||
res = requests.post( | ||
url, headers={"Authorization": "Bearer " + get_token()}, json=data) | ||
print(res.content.decode()) | ||
``` | ||
|
||
### Oracle Setup | ||
|
||
If you would like to use Data Validation against an Oracle DB you will need to | ||
supply your own license files. To do so: | ||
|
||
1) Create an `oracle` directory and add your .rpm files into it. | ||
|
||
- oracle/oracle-instantclient12.2-basiclite-12.2.0.1.0-1.x86_64.rpm | ||
- oracle/oracle-instantclient12.2-devel-12.2.0.1.0-1.x86_64.rpm | ||
- oracle/oracle-instantclient12.2-odbc-12.2.0.1.0-2.x86_64.rpm | ||
|
||
2) Uncomment all logic in the Dockerfile under the Oracle Dependencies comments | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#!/bin/bash
# Build the Data Validation container image and deploy it to Cloud Run.
# Requires: gcloud CLI, and PROJECT_ID exported in the environment
# (see README.md: `export PROJECT_ID=<PROJECT-ID>`).
set -eu

# Fail fast with a clear message rather than building "gcr.io//..."
# when PROJECT_ID was never exported.
if [ -z "${PROJECT_ID:-}" ]; then
  echo "ERROR: PROJECT_ID is not set (run: export PROJECT_ID=<PROJECT-ID>)" >&2
  exit 1
fi

gcloud builds submit --tag gcr.io/${PROJECT_ID}/data-validation \
  --project=${PROJECT_ID}
gcloud run deploy data-validation --image gcr.io/${PROJECT_ID}/data-validation \
  --region=us-central1 --project=${PROJECT_ID}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import json | ||
import os | ||
from data_validation import data_validation | ||
import flask | ||
import pandas | ||
|
||
app = flask.Flask(__name__) | ||
|
||
|
||
def _clean_dataframe(df): | ||
rows = df.to_dict(orient="record") | ||
for row in rows: | ||
for key in row: | ||
if type(row[key]) in [pandas.Timestamp]: | ||
row[key] = str(row[key]) | ||
|
||
return json.dumps(rows) | ||
|
||
|
||
def _get_request_content(request): | ||
return request.json | ||
|
||
|
||
def validate(config):
    """Run Data Validation against the supplied config.

    Args:
        config (dict): A Data Validation Tool configuration.

    Returns:
        str: JSON-encoded validation result rows.
    """
    result_df = data_validation.DataValidation(config).execute()
    return _clean_dataframe(result_df)
|
||
|
||
def main(request):
    """Handle an incoming Data Validation request.

    Args:
        request (flask.Request): HTTP request whose JSON body carries a
            "config" key holding the validation configuration.

    Returns:
        str: JSON results on success, or an error description on failure.
    """
    try:
        body = _get_request_content(request)
        return validate(body["config"])
    except Exception as err:  # top-level boundary: report instead of crashing
        return "Unknown Error: {}".format(err)
|
||
|
||
@app.route("/", methods=["POST"]) | ||
def run(): | ||
try: | ||
config = _get_request_content(flask.request) | ||
result = validate(config) | ||
return str(result) | ||
except Exception as e: | ||
print(e) | ||
return "Found Error: {}".format(e) | ||
|
||
|
||
@app.route("/test", methods=["POST"]) | ||
def other(): | ||
return _get_request_content(flask.request) | ||
|
||
|
||
if __name__ == "__main__": | ||
app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import os | ||
import re | ||
import requests | ||
|
||
PROJECT_ID = os.environ.get("PROJECT_ID") | ||
|
||
DESCRIBE_SERVICE = """ | ||
gcloud run services describe {service_name} --region=us-central1 --project={project_id} | ||
""" | ||
|
||
|
||
def get_token():
    """Return the caller's gcloud identity token for authenticated requests."""
    with os.popen("gcloud auth print-identity-token") as stream:
        identity_token = stream.read()
    return identity_token.strip()
|
||
|
||
def get_cloud_run_url(service_name, project_id):
    """Look up the HTTPS URL of a deployed Cloud Run service.

    Args:
        service_name (str): Name of the Cloud Run service.
        project_id (str): GCP project hosting the service.

    Returns:
        str: The URL reported by ``gcloud run services describe``.

    Raises:
        RuntimeError: If no URL can be parsed from the gcloud output
            (e.g. the service does not exist or gcloud failed).
    """
    describe_service = DESCRIBE_SERVICE.format(
        service_name=service_name, project_id=project_id
    )
    with os.popen(describe_service) as service:
        description = service.read()

    # Raw-string regex; previously an unguarded findall(...)[0] raised an
    # opaque IndexError when the service was missing from the output.
    match = re.search(r"URL:\s*(\S+)", description)
    if match is None:
        raise RuntimeError(
            "Could not find URL for service '{}' in project '{}'".format(
                service_name, project_id
            )
        )
    return match.group(1)
|
||
|
||
# Validation config: a COUNT(*) column validation comparing a public
# BigQuery table against itself (source and target are identical), so
# the counts should always match.
data = {
    "source_conn": {"source_type": "BigQuery", "project_id": PROJECT_ID,},
    "target_conn": {"source_type": "BigQuery", "project_id": PROJECT_ID,},
    "type": "Column",
    "schema_name": "bigquery-public-data.new_york_citibike",
    "table_name": "citibike_stations",
    "target_schema_name": "bigquery-public-data.new_york_citibike",
    "target_table_name": "citibike_stations",
    "aggregates": [
        {
            "source_column": None,
            "target_column": None,
            "field_alias": "count",
            "type": "count",
        }
    ],
}

# POST the config to the deployed Cloud Run service, authenticating with
# the caller's gcloud identity token, and print the decoded response body.
url = get_cloud_run_url("data-validation", PROJECT_ID)
res = requests.post(url, headers={"Authorization": "Bearer " + get_token()}, json=data)
print(res.content.decode())