feat: Added FileSystem connection type #254

Merged: 8 commits, May 27, 2021
15 changes: 5 additions & 10 deletions data_validation/cli_tools.py
@@ -112,6 +112,11 @@
["database_id", "ID of Spanner database (schema) to connect to"],
["google_service_account_key_path", "(Optional) GCP SA Key Path"],
],
"FileSystem": [
["table_name", "Table name to use as reference for file data"],
["file_path", "The local, s3, or GCS file path to the data"],
        ["file_type", "The file type of the file: 'csv' or 'json'"],
],
}


@@ -266,16 +271,6 @@ def _configure_run_parser(subparsers):
help='Filter config details [{"type":"custom","source":"xyz=xyz","target":"XYZ=XYZ"}]',
)

# add beta features arguments here
if "beta" in sys.argv:
print("*** Enabling beta features ***")
# Sample argument to show how new ones can be added when beta flag is provided
# run_parser.add_argument(
# "--test-beta",
# "-test-beta",
# help="testing beta"
# )


def _configure_connection_parser(subparsers):
""" Configure the Parser for Connection Management. """
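The mapping above pairs each new CLI argument with its help text. As a rough illustration (not the actual wiring in `cli_tools.py`, which is outside this diff), such a list can drive `argparse` setup directly; the loop and names below are assumptions:

```python
# A minimal sketch, assuming the CLI iterates the field list to build
# per-connection-type arguments. Illustrative only.
import argparse

FILESYSTEM_FIELDS = [
    ["table_name", "Table name to use as reference for file data"],
    ["file_path", "The local, s3, or GCS file path to the data"],
    ["file_type", "The file type of the file: 'csv' or 'json'"],
]

parser = argparse.ArgumentParser()
for name, help_text in FILESYSTEM_FIELDS:
    parser.add_argument(f"--{name}", help=help_text)

args = parser.parse_args(
    ["--table_name", "my_table", "--file_path", "data.csv", "--file_type", "csv"]
)
print(args.table_name, args.file_path, args.file_type)
```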
2 changes: 1 addition & 1 deletion data_validation/clients.py
@@ -214,7 +214,7 @@ def get_data_client(connection_config):
"Impala": impala_connect,
"MySQL": MySQLClient,
"Oracle": OracleClient,
"Pandas": get_pandas_client,
"FileSystem": get_pandas_client,
"Postgres": PostgreSQLClient,
"Redshift": PostgreSQLClient,
"Teradata": TeradataClient,
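`get_pandas_client` itself is not part of this diff, so the routing above is the whole visible change: `FileSystem` connections reuse the existing pandas-backed client. A hedged sketch of what such a factory could look like, assuming it loads the file with pandas and registers the frame under the reference table name via the ibis pandas backend:

```python
# Sketch only; the real get_pandas_client may differ. Assumes pandas can
# resolve local, s3://, and gs:// paths (via s3fs/gcsfs) and that the
# ibis pandas backend is installed.
import ibis
import pandas as pd


def get_pandas_client_sketch(table_name, file_path, file_type):
    if file_type == "csv":
        df = pd.read_csv(file_path)
    elif file_type == "json":
        df = pd.read_json(file_path)
    else:
        raise ValueError(f"Unsupported file_type: {file_type}")
    # Register the DataFrame so validations can query it by table name.
    return ibis.pandas.connect({table_name: df})
```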
20 changes: 12 additions & 8 deletions data_validation/config_manager.py
@@ -133,7 +133,7 @@ def filters(self):
@property
def source_schema(self):
"""Return string value of source schema."""
return self._config[consts.CONFIG_SCHEMA_NAME]
return self._config.get(consts.CONFIG_SCHEMA_NAME, None)

@property
def source_table(self):
@@ -143,9 +143,7 @@ def source_table(self):
@property
def target_schema(self):
"""Return string value of target schema."""
return self._config.get(
consts.CONFIG_TARGET_SCHEMA_NAME, self._config[consts.CONFIG_SCHEMA_NAME]
)
return self._config.get(consts.CONFIG_TARGET_SCHEMA_NAME, self.source_schema)

@property
def target_table(self):
@@ -283,11 +281,7 @@ def build_config_manager(
consts.CONFIG_TYPE: config_type,
consts.CONFIG_SOURCE_CONN: source_conn,
consts.CONFIG_TARGET_CONN: target_conn,
consts.CONFIG_SCHEMA_NAME: table_obj[consts.CONFIG_SCHEMA_NAME],
consts.CONFIG_TABLE_NAME: table_obj[consts.CONFIG_TABLE_NAME],
consts.CONFIG_TARGET_SCHEMA_NAME: table_obj.get(
consts.CONFIG_TARGET_SCHEMA_NAME, table_obj[consts.CONFIG_SCHEMA_NAME]
),
consts.CONFIG_TARGET_TABLE_NAME: table_obj.get(
consts.CONFIG_TARGET_TABLE_NAME, table_obj[consts.CONFIG_TABLE_NAME]
),
@@ -297,6 +291,16 @@
consts.CONFIG_FILTERS: filter_config,
}

# Only FileSystem connections do not require schemas
if source_conn["source_type"] != "FileSystem":
config[consts.CONFIG_SCHEMA_NAME] = table_obj[consts.CONFIG_SCHEMA_NAME]
config[consts.CONFIG_TARGET_SCHEMA_NAME] = table_obj.get(
consts.CONFIG_TARGET_SCHEMA_NAME, table_obj[consts.CONFIG_SCHEMA_NAME]
)
else:
config[consts.CONFIG_SCHEMA_NAME] = None
config[consts.CONFIG_TARGET_SCHEMA_NAME] = None

return ConfigManager(config, source_client, target_client, verbose=verbose)

def build_config_grouped_columns(self, grouped_columns):
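The conditional above is the core of the change: `FileSystem` sources carry no schema, so both schema keys are set to `None` instead of being read unconditionally from `table_obj`. A self-contained sketch of the same logic, with plain string keys standing in for the `consts` constants:

```python
# Illustrative only; the real code uses consts.CONFIG_SCHEMA_NAME and
# consts.CONFIG_TARGET_SCHEMA_NAME rather than literal keys.
def resolve_schemas(source_type, table_obj):
    if source_type != "FileSystem":
        schema = table_obj["schema_name"]
        target_schema = table_obj.get("target_schema_name", schema)
    else:
        schema = target_schema = None
    return schema, target_schema


assert resolve_schemas("FileSystem", {"table_name": "my_file"}) == (None, None)
assert resolve_schemas(
    "BigQuery", {"schema_name": "my_dataset", "table_name": "my_table"}
) == ("my_dataset", "my_dataset")
```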
18 changes: 17 additions & 1 deletion docs/connections.md
@@ -91,7 +91,7 @@ Below is the expected configuration for each type.
# Configuration Required for All Data Sources
"source_type": "Spanner",

# GCP Project to use for Spanne
# GCP Project to use for Spanner
"project_id": "my-project-name",

# ID of Spanner instance to connect to
@@ -220,3 +220,19 @@ Then `pip install pyodbc`.
}
```

## FileSystem
```
{
# Configuration Required for All Data Sources
"source_type": "FileSystem",

# Table name to use as a reference for file data
"table_name": "my_table_name",

# The local, s3, or GCS file path to the data
"file_path": "gs://path/to/file",

    # The file type. Either 'csv' or 'json'
    "file_type": "csv"
}
```
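A dictionary like this can be handed straight to the client factory shown earlier in this PR. A hedged usage sketch (the import path follows the repository layout; the file path is illustrative and must exist for the call to succeed):

```python
from data_validation import clients

conn = {
    "source_type": "FileSystem",
    "table_name": "my_table_name",
    "file_path": "gs://path/to/file",
    "file_type": "csv",
}
# get_data_client routes "FileSystem" to get_pandas_client per the mapping above.
client = clients.get_data_client(conn)
```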
7 changes: 6 additions & 1 deletion docs/examples.md
@@ -2,7 +2,7 @@
This page describes some basic use cases of the tool.

**PLEASE NOTE:** In the commands below, my_bq_conn refers to the connection name for your BigQuery project. We are validating BigQuery tables that are
available in BigQuery public datasets.
available in BigQuery public datasets. These examples validate a table against itself for demonstration purposes.

#### Simple COUNT(*) on a table
````shell script
@@ -113,6 +113,11 @@ data-validation run -t GroupedColumn -sc my_bq_conn -tc my_bq_conn -tbls '[{"sch
data-validation run -t Column -sc my_bq_conn -tc my_bq_conn -tbls '[{"schema_name":"bigquery-public-data.new_york_citibike","table_name":"citibike_trips"}]' --count '["tripduration","start_station_name"]' -l "run-tag=test-run,run-val=test"
````

#### Run validation on a file
````shell script
data-validation run -t Column -sc local_file -tc local_file -tbls '[{"table_name":"my_local_file"}]' --count '["name"]'
````
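The `local_file` connection referenced here must be created beforehand. A hedged sketch of the FileSystem connection it would resolve to (path, file type, and column are illustrative; the file needs a `name` column for the `--count` flag above):

```python
# Hypothetical contents of the saved "local_file" connection.
local_file = {
    "source_type": "FileSystem",
    "table_name": "my_local_file",
    "file_path": "path/to/my_local_file.json",
    "file_type": "json",
}
```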

#### Run custom SQL
````shell script
data-validation query
2 changes: 1 addition & 1 deletion tests/unit/test_clients.py
@@ -30,7 +30,7 @@
SOURCE_TABLE_FILE_PATH = "source_table_data.json"
JSON_DATA = """[{"col_a":0,"col_b":"a"},{"col_a":1,"col_b":"b"}]"""
SOURCE_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": SOURCE_TABLE_FILE_PATH,
"file_type": "json",
10 changes: 10 additions & 0 deletions tests/unit/test_config_manager.py
@@ -101,6 +101,16 @@ def test_config_property(module_under_test):
assert config == config_manager._config


def test_schema_property(module_under_test):
"""Test getting schema."""
config_manager = module_under_test.ConfigManager(
SAMPLE_CONFIG, MockIbisClient(), MockIbisClient(), verbose=False
)

target_schema = config_manager.target_schema
assert target_schema == "bigquery-public-data.new_york_citibike"

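A hedged companion sketch for the new `.get()` fallback in `source_schema`: with the schema key absent, as it would be for a FileSystem source, the property should return `None`. The literal key and the dict surgery on `SAMPLE_CONFIG` are assumptions:

```python
def test_source_schema_missing_returns_none(module_under_test):
    """Sketch: a config without a schema (the FileSystem case) yields None."""
    config = dict(SAMPLE_CONFIG)
    config.pop("schema_name", None)  # assumes consts.CONFIG_SCHEMA_NAME == "schema_name"
    config_manager = module_under_test.ConfigManager(
        config, MockIbisClient(), MockIbisClient(), verbose=False
    )
    assert config_manager.source_schema is None
```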

def test_filters_property(module_under_test):
config_manager = module_under_test.ConfigManager(
SAMPLE_CONFIG, MockIbisClient(), MockIbisClient(), verbose=False
4 changes: 2 additions & 2 deletions tests/unit/test_data_validation.py
@@ -26,14 +26,14 @@
TARGET_TABLE_FILE_PATH = "target_table_data.json"

SOURCE_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": SOURCE_TABLE_FILE_PATH,
"file_type": "json",
}

TARGET_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": TARGET_TABLE_FILE_PATH,
"file_type": "json",
4 changes: 2 additions & 2 deletions tests/unit/test_schema_validation.py
@@ -23,14 +23,14 @@
TARGET_TABLE_FILE_PATH = "target_table_data.json"

SOURCE_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": SOURCE_TABLE_FILE_PATH,
"file_type": "json",
}

TARGET_CONN_CONFIG = {
"source_type": "Pandas",
"source_type": "FileSystem",
"table_name": "my_table",
"file_path": TARGET_TABLE_FILE_PATH,
"file_type": "json",