diff --git a/README.md b/README.md
index dcdc27d3a..73130d39f 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ DVT supports the following connection types:
 * [MySQL](docs/connections.md#mysql)
 * [Redshift](docs/connections.md#redshift)
 * [FileSystem](docs/connections.md#filesystem)
+* [Impala](docs/connections.md#impala)
 
 The [Connections](docs/connections.md) page provides details about how to
 create and list connections for the validation tool.
diff --git a/data_validation/cli_tools.py b/data_validation/cli_tools.py
index 9b12f8d1d..84c5224e4 100644
--- a/data_validation/cli_tools.py
+++ b/data_validation/cli_tools.py
@@ -118,6 +118,12 @@
         ["file_path", "The local, s3, or GCS file path to the data"],
         ["file_type", "The file type of the file.'csv' or 'json'"],
     ],
+    "Impala": [
+        ["host", "Desired Impala host"],
+        ["port", "Desired Impala port (10000 if not provided)"],
+        ["database", "Desired Impala database (default if not provided)"],
+        ["auth_mechanism", "Desired Impala auth mechanism (PLAIN if not provided)"],
+    ],
 }
 
 
diff --git a/docs/connections.md b/docs/connections.md
index 3a4a1d4b1..ea522a5de 100644
--- a/docs/connections.md
+++ b/docs/connections.md
@@ -43,6 +43,7 @@ The data validation tool supports the following connection types.
 * [MySQL](#mysql)
 * [Redshift](#redshift)
 * [FileSystem](#filesystem)
+* [Impala](#impala)
 
 As you see above, Teradata and BigQuery have different sets of custom arguments
 (for example project_id for BQ versus host for Teradata).
@@ -65,7 +66,7 @@ Below is the expected configuration for each type.
 ## Google BigQuery
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "BigQuery",
 
     # BigQuery Specific Connection Config
@@ -89,7 +90,7 @@ Below is the expected configuration for each type.
 ## Google Spanner
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "Spanner",
 
     # GCP Project to use for Spanner
@@ -114,7 +115,7 @@ Please note the Teradata is not-native to this package and must be installed
 via `pip install teradatasql` if you have a license.
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "Teradata",
 
     # Connection Details
@@ -130,7 +131,7 @@ Please note the Oracle package is not installed by default. You will need to fol
 Then `pip install cx_Oracle`.
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "Oracle",
 
     # Connection Details
@@ -148,7 +149,7 @@ Please note the MSSQL Server package is not installed by default. You will need
 Then `pip install pyodbc`.
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "MSSQL",
 
     # Connection Details
@@ -164,7 +165,7 @@ Then `pip install pyodbc`.
 ## Snowflake
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "Snowflake",
 
     # Connection Details
@@ -179,7 +180,7 @@ Then `pip install pyodbc`.
 ## Postgres
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "Postgres",
 
     # Connection Details
@@ -194,7 +195,7 @@ Then `pip install pyodbc`.
 ## MySQL
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "MySQL",
 
     # Connection Details
@@ -209,7 +210,7 @@ Then `pip install pyodbc`.
 ## Redshift
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "Redshift",
 
     # Connection Details
@@ -224,7 +225,7 @@ Then `pip install pyodbc`.
 ## FileSystem
 ```
 {
-    # Configuration Required for All Data Soures
+    # Configuration Required for All Data Sources
     "source_type": "FileSystem",
 
     # Table name to use as a reference for file data
@@ -237,3 +238,17 @@ Then `pip install pyodbc`.
     "file_type":"csv"
 }
 ```
+
+## Impala
+```
+{
+    # Configuration Required for All Data Sources
+    "source_type": "Impala",
+
+    # Connection Details
+    "host": "127.0.0.1",
+    "port": 10000,
+    "database": "default",
+    "auth_mechanism": "PLAIN"
+}
+```
diff --git a/third_party/ibis/ibis_impala/api.py b/third_party/ibis/ibis_impala/api.py
index 2e4a4843e..7455d22b0 100644
--- a/third_party/ibis/ibis_impala/api.py
+++ b/third_party/ibis/ibis_impala/api.py
@@ -12,12 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from ibis.backends.impala import connect as impala_connect
+from ibis.backends.impala import connect
 from ibis.backends.impala import udf
 import ibis.expr.datatypes as dt
 
 _impala_to_ibis_type = udf._impala_to_ibis_type
 
+def impala_connect(host=None, port=10000, database="default", auth_mechanism="PLAIN"):
+    # Fall back to the documented defaults when the stored connection
+    # config carries explicit None values for these fields.
+    auth_mechanism = auth_mechanism if auth_mechanism is not None else "PLAIN"
+    database = database if database is not None else "default"
+    port = port if port is not None else 10000
+    return connect(host=host, port=int(port), database=database, auth_mechanism=auth_mechanism)
 
 def parse_type(t):
     """Returns the Ibis datatype from source type."""
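
---

For reviewers, a minimal usage sketch (not part of the patch) of the new `impala_connect` wrapper. It assumes the module is importable as `third_party.ibis.ibis_impala.api` from the repo root, and that a HiveServer2-compatible Impala endpoint is reachable; the host value is a placeholder.

```python
# Sketch only: exercises the None-fallback behavior of impala_connect.
# "127.0.0.1" is a placeholder; a reachable Impala endpoint is assumed.
from third_party.ibis.ibis_impala.api import impala_connect

# Explicit None values (as produced by an unset connection config) fall
# back to port 10000, database "default", and auth_mechanism "PLAIN".
client = impala_connect(
    host="127.0.0.1",
    port=None,
    database=None,
    auth_mechanism=None,
)
print(client.list_databases())
```

The new `"Impala"` entry in `CONNECTION_SOURCE_FIELDS` is what surfaces the matching `host`/`port`/`database`/`auth_mechanism` arguments through the existing `connections add` CLI flow.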