feat: Issue262 impala connect (#281)
* feat: add Impala to cli_tools

* feat: Modify impala cli tools params and add conversion data types

* Add case where user doesn't provide params

* database typo

* missing indent

* Add impala to connections page and fix typo

* Update readme with Impala
afleisc committed Jul 20, 2021
1 parent ae076e5 commit eaa052f
Showing 4 changed files with 38 additions and 11 deletions.
1 change: 1 addition & 0 deletions README.md
DVT supports the following connection types:

* [MySQL](docs/connections.md#mysql)
* [Redshift](docs/connections.md#redshift)
* [FileSystem](docs/connections.md#filesystem)
* [Impala](docs/connections.md#impala)

The [Connections](docs/connections.md) page provides details about how to create
and list connections for the validation tool.
6 changes: 6 additions & 0 deletions data_validation/cli_tools.py
["file_path", "The local, s3, or GCS file path to the data"],
["file_type", "The file type of the file.'csv' or 'json'"],
],
"Impala": [
["host", "Desired Impala host"],
["port", "Desired Imapala port (10000 if not provided)"],
["database", "Desired Impala database (default if not provided)"],
["auth_mechanism", "Desired Impala auth mechanism (PLAIN if not provided)"],
],
}
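
Each `[field, help_text]` pair above becomes an optional flag on the `connections add` command. Below is a minimal sketch of that pattern — illustrative only; `build_parser` and the exact flag spellings are assumptions, not DVT's actual CLI code:

```python
import argparse

# Field list in the same shape as the "Impala" entry above.
IMPALA_FIELDS = [
    ["host", "Desired Impala host"],
    ["port", "Desired Impala port (10000 if not provided)"],
    ["database", "Desired Impala database (default if not provided)"],
    ["auth_mechanism", "Desired Impala auth mechanism (PLAIN if not provided)"],
]

def build_parser(fields):
    """Turn [field, help_text] pairs into optional CLI flags."""
    parser = argparse.ArgumentParser(description="Add an Impala connection")
    for name, help_text in fields:
        # Unset flags come through as None; the connect wrapper
        # (see third_party/ibis/ibis_impala/api.py below) restores defaults.
        parser.add_argument(f"--{name}", help=help_text, default=None)
    return parser

args = build_parser(IMPALA_FIELDS).parse_args(["--host", "127.0.0.1"])
print(args.host, args.port)  # -> 127.0.0.1 None
```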


35 changes: 25 additions & 10 deletions docs/connections.md
The data validation tool supports the following connection types.

* [MySQL](#mysql)
* [Redshift](#redshift)
* [FileSystem](#filesystem)
* [Impala](#impala)

As you can see above, Teradata and BigQuery have different sets of custom arguments (for example, project_id for BigQuery versus host for Teradata).

Below is the expected configuration for each type.

## Google BigQuery
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "BigQuery",
# BigQuery Specific Connection Config
# ...
}
```

## Google Spanner
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "Spanner",
# GCP Project to use for Spanner
# ...
}
```

## Teradata
Please note that Teradata is not native to this package and must be installed
via `pip install teradatasql` if you have a license.
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "Teradata",
# Connection Details
# ...
}
```

## Oracle
Please note the Oracle package is not installed by default. You will need to follow the Oracle client installation steps.
Then `pip install cx_Oracle`.
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "Oracle",
# Connection Details
# ...
}
```

## MSSQL Server
Please note the MSSQL Server package is not installed by default. You will need to install an ODBC driver for SQL Server.
Then `pip install pyodbc`.
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "MSSQL",
# Connection Details
# ...
}
```

## Snowflake
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "Snowflake",
# Connection Details
# ...
}
```

## Postgres
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "Postgres",
# Connection Details
# ...
}
```

## MySQL
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "MySQL",
# Connection Details
# ...
}
```

## Redshift
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "Redshift",
# Connection Details
# ...
}
```

## FileSystem
```
{
# Configuration Required for All Data Sources
# Configuration Required for All Data Sources
"source_type": "FileSystem",
# Table name to use as a reference for file data
# ...
"file_type":"csv"
}
```

## Impala
```
{
# Configuration Required for All Data Sources
"source_type": "Impala",
# Connection Details
"host": "127.0.0.1",
"port": 10000,
"database": "default",
"auth_mechanism":"PLAIN"
}
```
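
As a quick illustration (not part of the docs page itself), these values map one-to-one onto ibis' Impala backend, which is exactly what the wrapper in the last changed file calls; the `impyla` package must be installed for the backend to work:

```python
from ibis.backends.impala import connect

# Open the connection described by the JSON config above.
client = connect(
    host="127.0.0.1",
    port=10000,
    database="default",
    auth_mechanism="PLAIN",
)
print(client.list_tables())  # quick sanity check against the database
```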
7 changes: 6 additions & 1 deletion third_party/ibis/ibis_impala/api.py
# See the License for the specific language governing permissions and
# limitations under the License.

from ibis.backends.impala import connect
from ibis.backends.impala import udf
import ibis.expr.datatypes as dt

# Alias ibis' internal map of Impala type names to ibis datatypes, used
# when parsing source column types below.
_impala_to_ibis_type = udf._impala_to_ibis_type

def impala_connect(host=None, port=10000, database="default", auth_mechanism="PLAIN"):
    # The CLI passes None for unset flags, which bypasses the keyword
    # defaults above, so restore the documented defaults by hand.
    auth_mechanism = auth_mechanism if auth_mechanism is not None else "PLAIN"
    database = database if database is not None else "default"
    port = port if port is not None else 10000
    return connect(host=host, port=int(port), database=database, auth_mechanism=auth_mechanism)
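
# Illustrative usage (not in the original commit): unset CLI flags arrive
# as None, and the fallbacks above restore the documented defaults, e.g.
#   impala_connect(host="127.0.0.1", port=None)  # -> port 10000, "default" db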

def parse_type(t):
"""Returns the Ibis datatype from source type."""