feat/more conservative ingest logging (#3301)
### Description
Isolate all log statements that are emitted once per record and lower them to
debug level so they no longer bloat the console output.
rbiseck3 committed Jun 26, 2024
1 parent 575957b commit 5179b73
Showing 7 changed files with 11 additions and 13 deletions.
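
The pattern is the same across every file below: messages emitted once per pipeline run keep their info level, while messages emitted once per record (per file, per document) drop to debug. A minimal sketch of the idea, assuming a hypothetical `upload_all` helper and the logger name `unstructured.ingest` (neither is taken from the diff):

```python
import logging

# Assumed logger name; the ingest modules obtain their own module-level logger.
logger = logging.getLogger("unstructured.ingest")

def upload_all(paths: list[str]) -> None:
    # One summary line per run is cheap, so it stays at info level.
    logger.info(f"uploading {len(paths)} files")
    for path in paths:
        # Per-record detail would print once per file, so it drops to debug.
        logger.debug(f"writing local file {path}")
```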
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.14.9-dev5
+## 0.14.9-dev6
 
 ### Enhancements
 
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.9-dev5" # pragma: no cover
+__version__ = "0.14.9-dev6" # pragma: no cover
2 changes: 1 addition & 1 deletion unstructured/ingest/v2/pipeline/steps/embed.py
@@ -34,7 +34,7 @@ def __post_init__(self):
             if self.process.config
             else None
         )
-        logger.info(f"Starting {self.identifier} with configs: {config}")
+        logger.info(f"Created {self.identifier} with configs: {config}")
 
     def should_embed(self, filepath: Path, file_data: FileData) -> bool:
         if self.context.reprocess or file_data.reprocess:
2 changes: 0 additions & 2 deletions unstructured/ingest/v2/processes/connectors/chroma.py
@@ -193,8 +193,6 @@ def run(self, contents: list[UploadContent], **kwargs: Any) -> None:
             f"at {self.connection_config.host}",
         )
 
-        logger.info(f"Inserting / updating {len(elements_dict)} documents to destination ")
-
         collection = self.client.get_or_create_collection(
             name=self.connection_config.collection_name
         )
4 changes: 2 additions & 2 deletions unstructured/ingest/v2/processes/connectors/fsspec/fsspec.py
@@ -341,7 +341,7 @@ def _run(self, path: Path, file_data: FileData) -> None:
         if self.fs.exists(path=str(upload_path)) and not self.upload_config.overwrite:
             logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.info(f"Writing local file {path_str} to {upload_path}")
+        logger.debug(f"Writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
 
     async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
@@ -352,5 +352,5 @@ async def run_async(self, path: Path, file_data: FileData, **kwargs: Any) -> None:
         if already_exists and not self.upload_config.overwrite:
             logger.debug(f"Skipping upload of {path} to {upload_path}, file already exists")
             return
-        logger.info(f"Writing local file {path_str} to {upload_path}")
+        logger.debug(f"Writing local file {path_str} to {upload_path}")
         self.fs.upload(lpath=path_str, rpath=str(upload_path))
8 changes: 4 additions & 4 deletions unstructured/ingest/v2/processes/connectors/google_drive.py
@@ -185,8 +185,8 @@ def get_paginated_results(
         if extensions:
             ext_filter = " or ".join([f"fileExtension = '{e}'" for e in extensions])
             q = f"{q} and ({ext_filter} or mimeType = 'application/vnd.google-apps.folder')"
-        logger.info(f"Query used when indexing: {q}")
-        logger.info("response fields limited to: {}".format(", ".join(self.fields)))
+        logger.debug(f"Query used when indexing: {q}")
+        logger.debug("response fields limited to: {}".format(", ".join(self.fields)))
         done = False
         page_token = None
         files_response = []
@@ -297,7 +297,7 @@ def is_float(value: str):
     def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
         download_path = self.get_download_path(file_data=file_data)
         download_path.parent.mkdir(parents=True, exist_ok=True)
-        logger.info(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
+        logger.debug(f"writing {file_data.source_identifiers.fullpath} to {download_path}")
         with open(download_path, "wb") as handler:
             handler.write(file_contents.getbuffer())
         if (
@@ -315,7 +315,7 @@ def _write_file(self, file_data: FileData, file_contents: io.BytesIO):
     def run(self, file_data: FileData, **kwargs: Any) -> download_responses:
         from googleapiclient.http import MediaIoBaseDownload
 
-        logger.info(f"fetching file: {file_data.source_identifiers.fullpath}")
+        logger.debug(f"fetching file: {file_data.source_identifiers.fullpath}")
         mime_type = file_data.additional_metadata["mimeType"]
         record_id = file_data.identifier
         files_client = self.connection_config.get_files_service()
4 changes: 2 additions & 2 deletions unstructured/ingest/v2/processes/partitioner.py
@@ -98,7 +98,7 @@ def partition_locally(
         from unstructured.partition.auto import partition
 
         logger.debug(f"Using local partition with kwargs: {self.config.to_partition_kwargs()}")
-        logger.info(f"partitioning file {filename} with metadata {metadata.to_dict()}")
+        logger.debug(f"partitioning file {filename} with metadata {metadata.to_dict()}")
         elements = partition(
             filename=str(filename.resolve()),
             data_source_metadata=metadata,
@@ -142,7 +142,7 @@ async def partition_via_api(
     ) -> list[dict]:
         from unstructured_client import UnstructuredClient
 
-        logger.info(f"partitioning file {filename} with metadata: {metadata.to_dict()}")
+        logger.debug(f"partitioning file {filename} with metadata: {metadata.to_dict()}")
         client = UnstructuredClient(
             server_url=self.config.partition_endpoint, api_key_auth=self.config.api_key
        )
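
The per-record messages are not gone, only demoted: raising the logger's verbosity brings them back. A hedged sketch, assuming the ingest modules log under the `unstructured.ingest` logger name (adjust if your version uses a different name):

```python
import logging

# Show all log records, including debug, on the root handler.
logging.basicConfig(level=logging.DEBUG)

# Assumption: the ingest code logs under this name.
logging.getLogger("unstructured.ingest").setLevel(logging.DEBUG)
```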
