feat: Added FileSystem connection type #254

Merged: 8 commits, May 27, 2021
15 changes: 5 additions & 10 deletions data_validation/cli_tools.py
@@ -112,6 +112,11 @@
["database_id", "ID of Spanner database (schema) to connect to"],
["google_service_account_key_path", "(Optional) GCP SA Key Path"],
],
"FileSystem": [
["table_name", "Table name to use as reference for file data"],
["file_path", "The local, s3, or GCS file path to the data"],
        ["file_type", "The file type of the file: 'csv' or 'json'"],
],
}


@@ -266,16 +271,6 @@ def _configure_run_parser(subparsers):
help='Filter config details [{"type":"custom","source":"xyz=xyz","target":"XYZ=XYZ"}]',
)

# add beta features arguments here
if "beta" in sys.argv:
print("*** Enabling beta features ***")
# Sample argument to show how new ones can be added when beta flag is provided
# run_parser.add_argument(
# "--test-beta",
# "-test-beta",
# help="testing beta"
# )


def _configure_connection_parser(subparsers):
""" Configure the Parser for Connection Management. """
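The mapping above pairs each new CLI argument with its help text. As a rough illustration (not the actual wiring in `cli_tools.py`, which is outside this diff), such a list can drive `argparse` setup directly; the loop and names below are assumptions:

```python
# A minimal sketch, assuming the CLI iterates the field list to build
# per-connection-type arguments. Illustrative only.
import argparse

FILESYSTEM_FIELDS = [
    ["table_name", "Table name to use as reference for file data"],
    ["file_path", "The local, s3, or GCS file path to the data"],
    ["file_type", "The file type of the file: 'csv' or 'json'"],
]

parser = argparse.ArgumentParser()
for name, help_text in FILESYSTEM_FIELDS:
    parser.add_argument(f"--{name}", help=help_text)

args = parser.parse_args(
    ["--table_name", "my_table", "--file_path", "data.csv", "--file_type", "csv"]
)
print(args.table_name, args.file_path, args.file_type)
```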
2 changes: 1 addition & 1 deletion data_validation/clients.py
@@ -214,7 +214,7 @@ def get_data_client(connection_config):
"Impala": impala_connect,
"MySQL": MySQLClient,
"Oracle": OracleClient,
"Pandas": get_pandas_client,
"FileSystem": get_pandas_client,
"Postgres": PostgreSQLClient,
"Redshift": PostgreSQLClient,
"Teradata": TeradataClient,
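`get_pandas_client` itself is not part of this diff, so the routing above is the whole visible change: `FileSystem` connections reuse the existing pandas-backed client. A hedged sketch of what such a factory could look like, assuming it loads the file with pandas and registers the frame under the reference table name via the ibis pandas backend:

```python
# Sketch only; the real get_pandas_client may differ. Assumes pandas can
# resolve local, s3://, and gs:// paths (via s3fs/gcsfs) and that the
# ibis pandas backend is installed.
import ibis
import pandas as pd


def get_pandas_client_sketch(table_name, file_path, file_type):
    if file_type == "csv":
        df = pd.read_csv(file_path)
    elif file_type == "json":
        df = pd.read_json(file_path)
    else:
        raise ValueError(f"Unsupported file_type: {file_type}")
    # Register the DataFrame so validations can query it by table name.
    return ibis.pandas.connect({table_name: df})
```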
20 changes: 12 additions & 8 deletions data_validation/config_manager.py
@@ -133,7 +133,7 @@ def filters(self):
@property
def source_schema(self):
"""Return string value of source schema."""
return self._config[consts.CONFIG_SCHEMA_NAME]
return self._config.get(consts.CONFIG_SCHEMA_NAME, None)

@property
def source_table(self):
@@ -143,9 +143,7 @@ def source_table(self):
@property
def target_schema(self):
"""Return string value of target schema."""
return self._config.get(
consts.CONFIG_TARGET_SCHEMA_NAME, self._config[consts.CONFIG_SCHEMA_NAME]
)
return self._config.get(consts.CONFIG_TARGET_SCHEMA_NAME, self.source_schema)

@property
def target_table(self):
@@ -283,11 +281,7 @@ def build_config_manager(
consts.CONFIG_TYPE: config_type,
consts.CONFIG_SOURCE_CONN: source_conn,
consts.CONFIG_TARGET_CONN: target_conn,
consts.CONFIG_SCHEMA_NAME: table_obj[consts.CONFIG_SCHEMA_NAME],
consts.CONFIG_TABLE_NAME: table_obj[consts.CONFIG_TABLE_NAME],
consts.CONFIG_TARGET_SCHEMA_NAME: table_obj.get(
consts.CONFIG_TARGET_SCHEMA_NAME, table_obj[consts.CONFIG_SCHEMA_NAME]
),
consts.CONFIG_TARGET_TABLE_NAME: table_obj.get(
consts.CONFIG_TARGET_TABLE_NAME, table_obj[consts.CONFIG_TABLE_NAME]
),
@@ -297,6 +291,16 @@
consts.CONFIG_FILTERS: filter_config,
}

# Only FileSystem connections do not require schemas
if source_conn["source_type"] != "FileSystem":
config[consts.CONFIG_SCHEMA_NAME] = table_obj[consts.CONFIG_SCHEMA_NAME]
config[consts.CONFIG_TARGET_SCHEMA_NAME] = table_obj.get(
consts.CONFIG_TARGET_SCHEMA_NAME, table_obj[consts.CONFIG_SCHEMA_NAME]
)
else:
config[consts.CONFIG_SCHEMA_NAME] = None
config[consts.CONFIG_TARGET_SCHEMA_NAME] = None

return ConfigManager(config, source_client, target_client, verbose=verbose)

def build_config_grouped_columns(self, grouped_columns):
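The conditional above is the core of the change: `FileSystem` sources carry no schema, so both schema keys are set to `None` instead of being read unconditionally from `table_obj`. A self-contained sketch of the same logic, with plain string keys standing in for the `consts` constants:

```python
# Illustrative only; the real code uses consts.CONFIG_SCHEMA_NAME and
# consts.CONFIG_TARGET_SCHEMA_NAME rather than literal keys.
def resolve_schemas(source_type, table_obj):
    if source_type != "FileSystem":
        schema = table_obj["schema_name"]
        target_schema = table_obj.get("target_schema_name", schema)
    else:
        schema = target_schema = None
    return schema, target_schema


assert resolve_schemas("FileSystem", {"table_name": "my_file"}) == (None, None)
assert resolve_schemas(
    "BigQuery", {"schema_name": "my_dataset", "table_name": "my_table"}
) == ("my_dataset", "my_dataset")
```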
18 changes: 17 additions & 1 deletion docs/connections.md
@@ -91,7 +91,7 @@ Below is the expected configuration for each type.
# Configuration Required for All Data Sources
"source_type": "Spanner",

# GCP Project to use for Spanne
# GCP Project to use for Spanner
"project_id": "my-project-name",

# ID of Spanner instance to connect to
@@ -220,3 +220,19 @@ Then `pip install pyodbc`.
}
```

## FileSystem
```
{
# Configuration Required for All Data Sources
"source_type": "FileSystem",

# Table name to use as a reference for file data
"table_name": "my_table_name",

# The local, s3, or GCS file path to the data
"file_path": "gs://path/to/file",

    # The file type. Either 'csv' or 'json'
    "file_type": "csv"
}
```
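A dictionary like this can be handed straight to the client factory shown earlier in this PR. A hedged usage sketch (the import path follows the repository layout; the file path is illustrative and must exist for the call to succeed):

```python
from data_validation import clients

conn = {
    "source_type": "FileSystem",
    "table_name": "my_table_name",
    "file_path": "gs://path/to/file",
    "file_type": "csv",
}
# get_data_client routes "FileSystem" to get_pandas_client per the mapping above.
client = clients.get_data_client(conn)
```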
7 changes: 6 additions & 1 deletion docs/examples.md
@@ -2,7 +2,7 @@
This page describes some basic use cases of the tool.

**PLEASE NOTE:** In the commands below, my_bq_conn refers to the connection name for your BigQuery project. We are validating BigQuery tables that are
available in BigQuery public datasets.
available in BigQuery public datasets. These examples validate a table against itself for demonstration purposes.

#### Simple COUNT(*) on a table
````shell script
@@ -113,6 +113,11 @@ data-validation run -t GroupedColumn -sc my_bq_conn -tc my_bq_conn -tbls '[{"sch
data-validation run -t Column -sc my_bq_conn -tc my_bq_conn -tbls '[{"schema_name":"bigquery-public-data.new_york_citibike","table_name":"citibike_trips"}]' --count '["tripduration","start_station_name"]' -l "run-tag=test-run,run-val=test"
````

#### Run validation on a file
````shell script
data-validation run -t Column -sc local_file -tc local_file -tbls '[{"table_name":"my_local_file"}]' --count '["name"]'
````
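The `local_file` connection referenced here must be created beforehand. A hedged sketch of the FileSystem connection it would resolve to (path, file type, and column are illustrative; the file needs a `name` column for the `--count` flag above):

```python
# Hypothetical contents of the saved "local_file" connection.
local_file = {
    "source_type": "FileSystem",
    "table_name": "my_local_file",
    "file_path": "path/to/my_local_file.json",
    "file_type": "json",
}
```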

#### Run custom SQL
````shell script
data-validation query
2 changes: 1 addition & 1 deletion tests/unit/test_clients.py
@@ -30,7 +30,7 @@
SOURCE_TABLE_FILE_PATH = "source_table_data.json"
JSON_DATA = """[{"col_a":0,"col_b":"a"},{"col_a":1,"col_b":"b"}]"""
SOURCE_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": SOURCE_TABLE_FILE_PATH,
"file_type": "json",
10 changes: 10 additions & 0 deletions tests/unit/test_config_manager.py
@@ -101,6 +101,16 @@ def test_config_property(module_under_test):
assert config == config_manager._config


def test_schema_property(module_under_test):
"""Test getting schema."""
config_manager = module_under_test.ConfigManager(
SAMPLE_CONFIG, MockIbisClient(), MockIbisClient(), verbose=False
)

target_schema = config_manager.target_schema
assert target_schema == "bigquery-public-data.new_york_citibike"

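A hedged companion sketch for the new `.get()` fallback in `source_schema`: with the schema key absent, as it would be for a FileSystem source, the property should return `None`. The literal key and the dict surgery on `SAMPLE_CONFIG` are assumptions:

```python
def test_source_schema_missing_returns_none(module_under_test):
    """Sketch: a config without a schema (the FileSystem case) yields None."""
    config = dict(SAMPLE_CONFIG)
    config.pop("schema_name", None)  # assumes consts.CONFIG_SCHEMA_NAME == "schema_name"
    config_manager = module_under_test.ConfigManager(
        config, MockIbisClient(), MockIbisClient(), verbose=False
    )
    assert config_manager.source_schema is None
```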

def test_filters_property(module_under_test):
config_manager = module_under_test.ConfigManager(
SAMPLE_CONFIG, MockIbisClient(), MockIbisClient(), verbose=False
4 changes: 2 additions & 2 deletions tests/unit/test_data_validation.py
@@ -26,14 +26,14 @@
TARGET_TABLE_FILE_PATH = "target_table_data.json"

SOURCE_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": SOURCE_TABLE_FILE_PATH,
"file_type": "json",
}

TARGET_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": TARGET_TABLE_FILE_PATH,
"file_type": "json",
4 changes: 2 additions & 2 deletions tests/unit/test_schema_validation.py
@@ -23,14 +23,14 @@
TARGET_TABLE_FILE_PATH = "target_table_data.json"

SOURCE_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": SOURCE_TABLE_FILE_PATH,
"file_type": "json",
}

TARGET_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": TARGET_TABLE_FILE_PATH,
"file_type": "json",