-
Notifications
You must be signed in to change notification settings - Fork 692
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
rfctr: Implement SQL V2 Dest Connector (#3323)
- Loading branch information
Showing
7 changed files
with
449 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
## 0.14.10-dev9 | ||
## 0.14.10-dev10 | ||
|
||
### Enhancements | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
__version__ = "0.14.10-dev9" # pragma: no cover | ||
__version__ = "0.14.10-dev10" # pragma: no cover |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from dataclasses import dataclass | ||
|
||
import click | ||
|
||
from unstructured.ingest.v2.cli.base import DestCmd | ||
from unstructured.ingest.v2.cli.interfaces import CliConfig | ||
from unstructured.ingest.v2.processes.connectors.sql import CONNECTOR_TYPE | ||
|
||
# Database backends accepted by the --db-type option.
SQL_DRIVERS = {"postgresql", "sqlite"}


@dataclass
class SQLCliConnectionConfig(CliConfig):
    """CLI options describing how to connect to the destination SQL database."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        """Return the click options for the SQL connection settings.

        ``--db-type`` is required and restricted to ``SQL_DRIVERS``; every
        other option defaults to ``None``.
        """
        options = [
            click.Option(
                ["--db-type"],
                required=True,
                # click.Choice expects a list/tuple; sorting the set also keeps
                # the order of choices in help and error text stable across runs.
                type=click.Choice(sorted(SQL_DRIVERS)),
                help="Type of the database backend",
            ),
            click.Option(
                ["--username"],
                default=None,
                type=str,
                help="DB username",
            ),
            click.Option(
                ["--password"],
                default=None,
                type=str,
                help="DB password",
            ),
            click.Option(
                ["--host"],
                default=None,
                type=str,
                help="DB host",
            ),
            click.Option(
                ["--port"],
                default=None,
                type=int,
                help="DB host connection port",
            ),
            click.Option(
                ["--database"],
                default=None,
                type=str,
                help="Database name. For sqlite databases, this is the path to the .db file.",
            ),
        ]
        return options
|
||
|
||
@dataclass
class SQLCliUploaderConfig(CliConfig):
    """CLI options for the SQL uploader."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        """Return the single ``--batch-size`` option (integer, default 100)."""
        batch_size_option = click.Option(
            ["--batch-size"],
            default=100,
            type=int,
            help="Number of records per batch",
        )
        return [batch_size_option]
|
||
|
||
@dataclass
class SQLCliUploadStagerConfig(CliConfig):
    """CLI options for the SQL upload stager; it exposes none."""

    @staticmethod
    def get_cli_options() -> list[click.Option]:
        """Return an empty option list."""
        no_options: list[click.Option] = []
        return no_options
|
||
|
||
# Bundle this module's SQL CLI config classes into a DestCmd named after the
# connector type.
sql_dest_cmd = DestCmd(
    cmd_name=CONNECTOR_TYPE,
    upload_stager_config=SQLCliUploadStagerConfig,
    uploader_config=SQLCliUploaderConfig,
    connection_config=SQLCliConnectionConfig,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import os | ||
import sqlite3 | ||
from pathlib import Path | ||
|
||
from unstructured.ingest.v2.interfaces import ProcessorConfig | ||
from unstructured.ingest.v2.logger import logger | ||
from unstructured.ingest.v2.pipeline.pipeline import Pipeline | ||
from unstructured.ingest.v2.processes.chunker import ChunkerConfig | ||
from unstructured.ingest.v2.processes.connectors.local import ( | ||
LocalConnectionConfig, | ||
LocalDownloaderConfig, | ||
LocalIndexerConfig, | ||
) | ||
from unstructured.ingest.v2.processes.connectors.sql import ( | ||
DatabaseType, | ||
SimpleSqlConfig, | ||
SQLAccessConfig, | ||
SQLUploaderConfig, | ||
SQLUploadStagerConfig, | ||
) | ||
from unstructured.ingest.v2.processes.embedder import EmbedderConfig | ||
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig | ||
|
||
# Directory five levels above this file; the example docs, the work directory
# and the SQL helper scripts are all resolved against it.
base_path = Path(__file__).parent.parent.parent.parent.parent
docs_path = base_path / "example-docs"
work_dir = base_path / "tmp_ingest"
output_path = work_dir / "output"
download_path = work_dir / "download"

# SQLite database file; a relative path, so it is created in the current
# working directory.
SQLITE_DB = "test-sql-db.sqlite"

if __name__ == "__main__":
    logger.info(f"Writing all content in: {work_dir.resolve()}")

    # Pipeline settings shared by both destination runs below.
    configs = {
        "context": ProcessorConfig(work_dir=str(work_dir.resolve())),
        "indexer_config": LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
        "downloader_config": LocalDownloaderConfig(download_dir=download_path),
        "source_connection_config": LocalConnectionConfig(),
        "partitioner_config": PartitionerConfig(strategy="fast"),
        "chunker_config": ChunkerConfig(
            chunking_strategy="by_title",
            chunk_include_orig_elements=False,
            chunk_max_characters=1500,
            chunk_multipage_sections=True,
        ),
        "embedder_config": EmbedderConfig(embedding_provider="langchain-huggingface"),
        "stager_config": SQLUploadStagerConfig(),
        "uploader_config": SQLUploaderConfig(batch_size=10),
    }

    # Start from a fresh database file on every run.
    if os.path.exists(SQLITE_DB):
        os.remove(SQLITE_DB)

    # Read the schema before connecting so a missing script does not leave an
    # open connection behind.
    schema_path = (base_path / "scripts/sql-test-helpers/create-sqlite-schema.sql").resolve()
    with open(schema_path) as f:
        schema_sql = f.read()

    connection = sqlite3.connect(database=SQLITE_DB)
    try:
        connection.executescript(schema_sql)
    finally:
        # Close explicitly, even when the schema script fails to execute.
        connection.close()

    # sqlite test first
    Pipeline.from_configs(
        destination_connection_config=SimpleSqlConfig(
            db_type=DatabaseType.SQLITE,
            database=SQLITE_DB,
            access_config=SQLAccessConfig(),
        ),
        **configs,
    ).run()

    # now, pg with pgvector
    Pipeline.from_configs(
        destination_connection_config=SimpleSqlConfig(
            db_type=DatabaseType.POSTGRESQL,
            database="elements",
            host="localhost",
            port=5433,
            access_config=SQLAccessConfig(username="unstructured", password="test"),
        ),
        **configs,
    ).run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.