diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 1a53c86..0e78585 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,34 +1,51 @@ -# Library Motivation: +# Architecture -**Open source:** Most libraries rely on chunking documents based on fixed string lengths. This naive approach throws out an enormous amount of valuable information that's hidden in the document structure. It also makes it challenging to display query citations down the line. We find quality citations to be invaluable in any non trivial RAG application. +## Core -**Commercial Offerings:** We've found these to either be cost prohibitive or lacking in performance. Foundational model providers like OpenAI and Google have also started implementing their own files API but these are black boxes and don't support complex querying. +#### /PDF +This is really just a wrapper class around pdfminer, pymupdf and pypdf. We implement some basic visualization / export methods. Would like to migrate away from pymupdf for converting pdfs to images due to its licensing. +#### /Schemas -# Architecture +This is where we define the data models for the project. We use pydantic to define these models. This is useful for serialization and validation. Some methods in these classes need more robust testing asap. In general we prefer freezing as many attributes as possible to avoid unexpected behavior. -## Element Extraction +## Elements -#### 1. /Text +#### /Text This module implements basic text parsing along with basic markdown support. -#### 2. /Tables +We parse text into markdown by looking at the font size and style character by character. This gets combined into a span which represents a string of characters with the same styling. + +Spans get combined into lines and lines get combined into elements. Elements are the basic building blocks of the document. They can be headings, paragraphs, bullets, etc. + +Optionally we can use PyMuPDF to OCR the document. 
This is not recommended as a default due to the additional computational cost and inherent inaccuracies of OCR. We're looking at integrating [doctr](https://github.com/mindee/doctr). + +Here's an article that goes into more details on available [OCR libraries](https://source.opennews.org/articles/our-search-best-ocr-tool-2023/). + +#### /Tables + +We implement a few different methods for table extraction. + +Table Transformer can be used for both detecting tables and their contents. + +PyMuPDF can also be used for table detection and content extraction. + +Lastly unitable is our recommended approach for table extraction. It is a transformers based approach with **state-of-the-art** performance. Unfortunately, its performance is hindered by the fact that we still need to use the table-transformers model to detect table bounding boxes. Table Transformer's performance leaves a lot to be desired and may miss some tables or crop them incorrectly. **If you're aware of a stronger performing model, please let us know.** -We implement Table Transformer for parsing tables and their contents. Tables can be exported in a couple different formats - str, markdown or html. +We're also looking at speeding unitable up. This can either be done by quantizing the model or by using the smaller, 70M parameter model they released. Unfortunately, the smaller model was not fine tuned so this is holding us back from implementing it. You can see the published paper [here](https://arxiv.org/abs/2403.04822). ## Processing Pipeline -#### 3. /Processing +#### /Processing This is mostly a collection of fast heuristics to combine and split elements. The main idea is to have a fast and simple way to process the data. We looked into more complex methods like [Layout Parser Documentation](https://layout-parser.github.io/) but did not find the result compelling enough to full integrate. 
-You can implement you own rules by defining a subclass of `ProcessingStep` and adding it to the processing planner. +We also have a semantic processing pipeline that uses embeddings to cluster similar nodes together. This is powerful but we need to look into more robust ways to cluster these since we currently hardcode similarity thresholds. -#### 4. /Post Processing +You can implement your own rules by defining a subclass of `ProcessingStep` and adding a `process` method. This method should take a list of nodes and return a list of nodes. -There's promising techniques that require an LLM to integrate. This is currently unused. +Then you can add this to the pipeline by calling `add_step` on the `DocumentParser` object or create a new pipeline object with your custom steps. -- Using embeddings to combine similar nodes. This is especially useful to combine nodes that stretch across pages. -- Use an LLM to describe the contents of an image or graph. This helps with recall. +This can be done by subclassing `ProcessingPipeline` and adding your custom steps to the `transformations` attribute. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..394568e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "openparse" +description = "Streamlines the process of preparing documents for LLM's." 
+readme = "README.md" +requires-python = ">=3.8" +version = "0.5.1" +authors = [{name = "Sergey Filimonov", email = "hello@sergey.fyi"}] +dependencies = [ + "PyMuPDF >= 1.23.2", + "pillow >= 8.3", + "pydantic >= 2.0", + "pypdf >= 4.0.0", + "pdfminer.six >= 20200401", + "tiktoken >= 0.3", + "openai >= 1.0.0", + "numpy", +] + +[project.urls] +homepage = "https://github.com/Filimoa/open-parse" +repository = "https://github.com/Filimoa/open-parse" +documentation = "https://filimoa.github.io/open-parse" + +[project.optional-dependencies] +ml = [ + "torch", + "torchvision", + "transformers", + "tokenizers", +] + +[project.scripts] +openparse-download = "openparse.cli:download_unitable_weights" + +[tool.ruff.lint] +select = [ + "E", # pycodestyle errors + "W", # pycodestyle warnings + "F", # pyflakes + "I", # isort + "B", # flake8-bugbear + "C4", # flake8-comprehensions + "UP", # pyupgrade +] +ignore = [ + "E501", # line too long, handled by black + "B008", # do not perform function calls in argument defaults + "C901", # too complex + "W191", # indentation contains tabs +] diff --git a/setup.py b/setup.py deleted file mode 100644 index 9de0119..0000000 --- a/setup.py +++ /dev/null @@ -1,35 +0,0 @@ -from setuptools import setup - -setup( - name="openparse", - version="0.5.1", - entry_points={ - "console_scripts": [ - "openparse-download= openparse.cli:download_unitable_weights", - ], - }, - install_requires=[ - "PyMuPDF >= 1.23.2", - "pillow >= 8.3", - "pydantic >= 2.0", - "pypdf >= 4.0.0", - "pdfminer.six >= 20200401", - "tiktoken >= 0.3", - "openai >= 1.0.0", - "numpy", - ], - extras_require={ - "ml": [ - "torch", - "torchvision", - "transformers", - "tokenizers", - ], - }, - author="Sergey Filimonov", - author_email="hello@sergey.fyi", - description="Streamlines the process of preparing documents for LLM's.", - long_description=open("README.md").read(), - long_description_content_type="text/markdown", - url="https://github.com/Filimoa/open-parse/", -)