diff --git a/pyproject.toml b/pyproject.toml index d0d1373..4137803 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ name = "openparse" description = "Streamlines the process of preparing documents for LLM's." readme = "README.md" requires-python = ">=3.8" -version = "0.5.4" +version = "0.5.5" authors = [{name = "Sergey Filimonov", email = "hello@sergey.fyi"}] dependencies = [ "PyMuPDF >= 1.23.2", diff --git a/src/openparse/types.py b/src/openparse/_types.py similarity index 100% rename from src/openparse/types.py rename to src/openparse/_types.py diff --git a/src/openparse/doc_parser.py b/src/openparse/doc_parser.py index feefa89..4c474bc 100644 --- a/src/openparse/doc_parser.py +++ b/src/openparse/doc_parser.py @@ -3,7 +3,7 @@ from openparse import tables, text, consts from openparse.pdf import Pdf -from openparse.types import NOT_GIVEN, NotGiven +from openparse._types import NOT_GIVEN, NotGiven from openparse.processing import ( IngestionPipeline, BasicIngestionPipeline, @@ -34,7 +34,7 @@ class PyMuPDFArgsDict(TypedDict, total=False): def _table_args_dict_to_model( - args_dict: Union[TableTransformersArgsDict, PyMuPDFArgsDict] + args_dict: Union[TableTransformersArgsDict, PyMuPDFArgsDict], ) -> Union[tables.TableTransformersArgs, tables.PyMuPDFArgs]: if args_dict["parsing_algorithm"] == "table-transformers": return tables.TableTransformersArgs(**args_dict) diff --git a/src/openparse/pdf.py b/src/openparse/pdf.py index bbed5cc..100f347 100644 --- a/src/openparse/pdf.py +++ b/src/openparse/pdf.py @@ -1,9 +1,9 @@ import random -import tempfile +import io from pathlib import Path from typing import Iterator, List, Literal, Optional, Union, Tuple, Any -from pydantic import BaseModel +from pydantic import BaseModel from pdfminer.high_level import extract_pages from pdfminer.layout import LTPage from pypdf import PdfReader, PdfWriter @@ -115,9 +115,9 @@ def to_pymupdf_doc(self): if not self.writer.pages: return fitz.open(self.file_path) - with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmpfile: - self.writer.write(tmpfile.name) - return fitz.open(tmpfile.name) + byte_stream = io.BytesIO() + self.writer.write(byte_stream) + return fitz.open(None, byte_stream) def _draw_bboxes( self, diff --git a/src/openparse/version.py b/src/openparse/version.py index e894b5d..d690686 100644 --- a/src/openparse/version.py +++ b/src/openparse/version.py @@ -1,4 +1,4 @@ -OPEN_PARSE_VERSION = "0.5.4" +OPEN_PARSE_VERSION = "0.5.5" def version_info() -> str: