feat: logic to deploy dvt on Cloud Run #280

Merged
11 commits merged on Jul 22, 2021
23 changes: 23 additions & 0 deletions samples/run/Dockerfile
@@ -0,0 +1,23 @@
FROM python:3.9-slim

# Allow statements and log messages to immediately appear in the Knative logs
ENV PYTHONUNBUFFERED True

# Copy local code to the container image.
ENV APP_HOME /app
WORKDIR $APP_HOME
COPY . ./

# Install production dependencies.
RUN apt-get update \
&& apt-get install gcc -y \
&& apt-get clean
RUN pip install --upgrade pip
RUN pip install Flask gunicorn google_pso_data_validator

# Run the web service on container startup. Here we use the gunicorn
# webserver, with one worker process and 8 threads.
# For environments with multiple CPU cores, increase the number of workers
# to be equal to the cores available.
# Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling.
CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app
96 changes: 96 additions & 0 deletions samples/run/README.md
@@ -0,0 +1,96 @@
# Data Validation on Cloud Run

### Build Docker Image

You will need to build a Docker image for Cloud Run to use. To add Teradata or
Oracle support, you will need to customize the Dockerfile and install your
licensed client utilities.
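
The exact steps depend on your licensed drivers, so treat the lines below as a
placeholder sketch rather than part of this sample. A Teradata variant, for
instance, might copy your driver files into the image and install the matching
Python client alongside the packages in the Dockerfile in this directory:

```
# Hypothetical additions for a Teradata-enabled image; adjust the paths
# and packages to your own licensed utilities.
COPY ./your-licensed-drivers /opt/drivers
RUN pip install teradatasql
```

Build and push the image with Cloud Build: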

```
export PROJECT_ID=<PROJECT-ID>
gcloud builds submit --tag gcr.io/${PROJECT_ID}/data-validation \
--project=${PROJECT_ID}
```

### Deploy to Cloud Run

```
gcloud run deploy data-validation --image gcr.io/${PROJECT_ID}/data-validation \
  --region=us-central1 --project=${PROJECT_ID}
```
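
After the deploy completes, `gcloud` prints the service URL; the test snippet in
the next section looks it up again with `gcloud run services describe`.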

### Test Cloud Run Endpoint

You can send an authenticated request to the service from Python. For example:

```
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import requests

PROJECT_ID = os.environ.get("PROJECT_ID", "pso-kokoro-resources")

DESCRIBE_SERVICE = """
gcloud run services describe {service_name} --region=us-central1 --project={project_id}
"""


def get_token():
    with os.popen("gcloud auth print-identity-token") as cmd:
        token = cmd.read().strip()

    return token


def get_cloud_run_url(service_name, project_id):
    describe_service = DESCRIBE_SERVICE.format(service_name=service_name,
                                               project_id=project_id)
    with os.popen(describe_service) as service:
        description = service.read()

    return re.findall("URL:.*\n", description)[0].split()[1].strip()


data = {
    "source_conn": {
        "source_type": "BigQuery",
        "project_id": PROJECT_ID,
    },
    "target_conn": {
        "source_type": "BigQuery",
        "project_id": PROJECT_ID,
    },
    "type": "Column",
    "schema_name": "bigquery-public-data.new_york_citibike",
    "table_name": "citibike_stations",
    "target_schema_name": "bigquery-public-data.new_york_citibike",
    "target_table_name": "citibike_stations",
    "aggregates": [
        {
            "source_column": None,
            "target_column": None,
            "field_alias": "count",
            "type": "count"
        }],
}

url = get_cloud_run_url("data-validation", PROJECT_ID)
res = requests.post(
    url, headers={"Authorization": "Bearer " + get_token()}, json=data)
print(res.content)
```
7 changes: 7 additions & 0 deletions samples/run/deploy.sh
@@ -0,0 +1,7 @@
#!/bin/bash

export PROJECT_ID=pso-kokoro-resources
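# Build the container image with Cloud Build and push it to Container Registry.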
gcloud builds submit --tag gcr.io/${PROJECT_ID}/data-validation \
--project=${PROJECT_ID}
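# Deploy the image to Cloud Run in us-central1.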
gcloud run deploy data-validation --image gcr.io/${PROJECT_ID}/data-validation \
--region=us-central1 --project=${PROJECT_ID}
75 changes: 75 additions & 0 deletions samples/run/main.py
@@ -0,0 +1,75 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
from data_validation import data_validation
import flask
import pandas

app = flask.Flask(__name__)


def _clean_dataframe(df):
    """Serialize a validation results DataFrame to JSON, stringifying timestamps."""
    rows = df.to_dict(orient="records")
    for row in rows:
        for key in row:
            if isinstance(row[key], pandas.Timestamp):
                row[key] = str(row[key])

    return json.dumps(rows)


def _get_request_content(request):
    return request.json


def validate(config):
    """Run Data Validation against the supplied config."""
    validator = data_validation.DataValidation(config)
    df = validator.execute()

    return _clean_dataframe(df)


def main(request):
    """Handle incoming Data Validation requests.

    request (flask.Request): HTTP request object whose JSON body is of the
        form {"config": <validation config>}.
    """
    try:
        config = _get_request_content(request)["config"]
        return validate(config)
    except Exception as e:
        return "Unknown Error: {}".format(e)


@app.route("/", methods=["POST"])
def run():
    """Run a validation; the POST body is the validation config as JSON."""
    try:
        config = _get_request_content(flask.request)
        result = validate(config)
        return str(result)
    except Exception as e:
        print(e)
        return "Found Error: {}".format(e)


@app.route("/test", methods=["POST"])
def other():
    """Echo the request body back to the caller, which is useful for debugging."""
    return _get_request_content(flask.request)


if __name__ == "__main__":
    app.run(debug=True, host="0.0.0.0", port=int(os.environ.get("PORT", 8080)))
63 changes: 63 additions & 0 deletions samples/run/test.py
@@ -0,0 +1,63 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import requests

PROJECT_ID = os.environ.get("PROJECT_ID", "pso-kokoro-resources")

DESCRIBE_SERVICE = """
gcloud run services describe {service_name} --region=us-central1 --project={project_id}
"""


def get_token():
    with os.popen("gcloud auth print-identity-token") as cmd:
        token = cmd.read().strip()

    return token


def get_cloud_run_url(service_name, project_id):
    describe_service = DESCRIBE_SERVICE.format(
        service_name=service_name, project_id=project_id
    )
    with os.popen(describe_service) as service:
        description = service.read()

    return re.findall("URL:.*\n", description)[0].split()[1].strip()


data = {
    "source_conn": {"source_type": "BigQuery", "project_id": PROJECT_ID},
    "target_conn": {"source_type": "BigQuery", "project_id": PROJECT_ID},
    "type": "Column",
    "schema_name": "bigquery-public-data.new_york_citibike",
    "table_name": "citibike_stations",
    "target_schema_name": "bigquery-public-data.new_york_citibike",
    "target_table_name": "citibike_stations",
    "aggregates": [
        {
            "source_column": None,
            "target_column": None,
            "field_alias": "count",
            "type": "count",
        }
    ],
}

url = get_cloud_run_url("data-validation", PROJECT_ID)
res = requests.post(url, headers={"Authorization": "Bearer " + get_token()}, json=data)
print(res.content)