feat: enhance analysis options with od model dump and better vis (#3234)

This PR adds the ability to draw bounding boxes for each layout (extracted, inferred, OCR, and final) and to dump the OD model output as a JSON file for better analysis. --------- Co-authored-by: Christine Straub <[email protected]> Co-authored-by: Michal Martyniak <[email protected]>
Unstructured-IO · Jun 26, 2024 · 575957b · 575957b
1 parent f2fee0c
commit 575957b
Show file tree

Hide file tree

Showing 12 changed files with 1,127 additions and 11 deletions.
diff --git a/.gitignore b/.gitignore
@@ -204,3 +204,6 @@ examples/**/output/
 
 outputdiff.txt
 metricsdiff.txt
+
+# analysis
+annotated/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,11 @@
 
 ### Enhancements
 
+* **Added visualization and OD model result dump for PDF** With the PDF `hi_res` strategy, the `analysis` parameter
+  can be used to visualize the output of the object detection (OD) model and dump it to a JSON file.
+  Additionally, the bounding boxes of each layout source (OD model, pdfminer, OCR, and final) are rendered
+  and saved as an image for each page.
+
 ### Features
 
 ### Fixes

diff --git a/Dockerfile b/Dockerfile
@@ -9,7 +9,10 @@ COPY unstructured unstructured
 COPY test_unstructured test_unstructured
 COPY example-docs example-docs
 
-RUN chown -R notebook-user:notebook-user /app && ln -s /usr/bin/python3.11 /usr/bin/python3
+RUN chown -R notebook-user:notebook-user /app && \
+  apk add font-ubuntu && \
+  fc-cache -fv && \
+  ln -s /usr/bin/python3.11 /usr/bin/python3
 
 USER notebook-user
 

diff --git a/test_unstructured/partition/pdf_image/test_analysis.py b/test_unstructured/partition/pdf_image/test_analysis.py
@@ -0,0 +1,154 @@
+import numpy as np
+import pytest
+from PIL import Image
+from unstructured_inference.inference.elements import Rectangle
+from unstructured_inference.inference.layout import DocumentLayout, PageLayout
+from unstructured_inference.inference.layoutelement import LayoutElement
+
+from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
+    TextAlignment,
+    get_bbox_text_size,
+    get_bbox_thickness,
+    get_label_rect_and_coords,
+    get_rgb_color,
+    get_text_color,
+)
+from unstructured.partition.pdf_image.analysis.layout_dump import ObjectDetectionLayoutDumper
+
+
+@pytest.mark.parametrize("color", ["red", "green", "blue", "yellow", "black", "white"])
+def test_get_rgb_color(color: str):
+    color_tuple = get_rgb_color(color)
+
+    assert isinstance(color_tuple, tuple)
+    assert len(color_tuple) == 3
+    assert all(isinstance(c, int) for c in color_tuple)
+    assert all(0 <= c <= 255 for c in color_tuple)
+
+
+@pytest.mark.parametrize(
+    ("bbox", "expected_text_size"),
+    [
+        ((0, 0, 90, 90), 17),
+        ((0, 0, 500, 200), 21),
+        ((0, 0, 10000, 10000), 32),
+    ],
+)
+def test_get_bbox_text_size(bbox: tuple[int, int, int, int], expected_text_size):
+    page_size = (1700, 2200)  # standard size of a page
+    text_size = get_bbox_text_size(bbox, page_size)
+
+    assert text_size == expected_text_size
+
+
+@pytest.mark.parametrize(
+    ("bbox", "expected_box_thickness"),
+    [
+        ((0, 0, 90, 90), 1),
+        ((0, 0, 450, 250), 2),
+        ((0, 0, 600, 1000), 3),
+    ],
+)
+def test_get_bbox_thickness(bbox: tuple[int, int, int, int], expected_box_thickness):
+    page_size = (1700, 2200)  # standard size of a page
+    box_thickness = get_bbox_thickness(bbox, page_size)
+
+    assert box_thickness == expected_box_thickness
+
+
+@pytest.mark.parametrize(
+    ("color", "expected_text_color"),
+    [
+        ("navy", "white"),
+        ("crimson", "white"),
+        ("maroon", "white"),
+        ("dimgray", "white"),
+        ("darkgreen", "white"),
+        ("darkcyan", "white"),
+        ("fuchsia", "white"),
+        ("violet", "black"),
+        ("gold", "black"),
+        ("aqua", "black"),
+        ("greenyellow", "black"),
+    ],
+)
+def test_best_text_color(color, expected_text_color):
+    color_tuple = get_rgb_color(color)
+    expected_text_color_tuple = get_rgb_color(expected_text_color)
+
+    _, text_color_tuple = get_text_color(color_tuple)
+    assert text_color_tuple == expected_text_color_tuple
+
+
+@pytest.mark.parametrize(
+    ("alignment", "expected_text_bbox"),
+    [
+        (TextAlignment.CENTER, ((145, 145), (155, 155))),
+        (TextAlignment.TOP_LEFT, ((100, 90), (120, 100))),
+        (TextAlignment.TOP_RIGHT, ((180, 100), (200, 110))),
+        (TextAlignment.BOTTOM_LEFT, ((100, 190), (120, 200))),
+        (TextAlignment.BOTTOM_RIGHT, ((180, 190), (200, 200))),
+    ],
+)
+def test_get_text_bbox(alignment, expected_text_bbox):
+    text_bbox, text_xy = get_label_rect_and_coords(
+        alignment=alignment, bbox_points=(100, 100, 200, 200), text_width=10, text_height=10
+    )
+    # adding high atol to account for the text-based extending of the bbox
+    assert np.allclose(text_bbox, expected_text_bbox, atol=10)
+
+
+def test_od_document_layout_dump():
+    page1 = PageLayout(
+        number=1,
+        image=Image.new("1", (1, 1)),
+        image_metadata={"width": 100, "height": 100},
+    )
+    page1.elements = [
+        LayoutElement(type="Title", bbox=Rectangle(x1=0, y1=0, x2=10, y2=10), prob=0.7),
+        LayoutElement(type="Paragraph", bbox=Rectangle(x1=0, y1=100, x2=10, y2=110), prob=0.8),
+    ]
+    page2 = PageLayout(
+        number=2,
+        image=Image.new("1", (1, 1)),
+        image_metadata={"width": 100, "height": 100},
+    )
+    page2.elements = [
+        LayoutElement(type="Table", bbox=Rectangle(x1=0, y1=0, x2=10, y2=10), prob=0.9),
+        LayoutElement(type="Image", bbox=Rectangle(x1=0, y1=100, x2=10, y2=110), prob=1.0),
+    ]
+    od_document_layout = DocumentLayout(pages=[page1, page2])
+
+    expected_dump = {
+        "pages": [
+            {
+                "number": 1,
+                "size": {
+                    "width": 100,
+                    "height": 100,
+                },
+                "elements": [
+                    {"bbox": [0, 0, 10, 10], "type": "Title", "prob": 0.7},
+                    {"bbox": [0, 100, 10, 110], "type": "Paragraph", "prob": 0.8},
+                ],
+            },
+            {
+                "number": 2,
+                "size": {
+                    "width": 100,
+                    "height": 100,
+                },
+                "elements": [
+                    {"bbox": [0, 0, 10, 10], "type": "Table", "prob": 0.9},
+                    {"bbox": [0, 100, 10, 110], "type": "Image", "prob": 1.0},
+                ],
+            },
+        ]
+    }
+    od_layout_dump = ObjectDetectionLayoutDumper(od_document_layout).dump()
+
+    assert {"pages": od_layout_dump.get("pages")} == expected_dump
+
+    # check OD model classes are attached but do not depend on a specific model instance
+    assert "object_detection_classes" in od_layout_dump
+    assert len(od_layout_dump["object_detection_classes"]) > 0
diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py
@@ -3,6 +3,7 @@
 import math
 import os
 import tempfile
+from pathlib import Path
 from tempfile import SpooledTemporaryFile
 from unittest import mock
 
@@ -1322,3 +1323,32 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids):
     )
     ids = [element.id for element in elements]
     assert ids == expected_ids, "Element IDs do not match expected IDs"
+
+
+def test_analysis_artifacts_saved():
+    with tempfile.TemporaryDirectory() as temp_dir:
+        filename = example_doc_path("layout-parser-paper-fast.pdf")
+        pdf.partition_pdf(
+            filename=filename,
+            strategy=PartitionStrategy.HI_RES,
+            analysis=True,
+            analyzed_image_output_dir_path=temp_dir,
+        )
+
+        analysis_dir = Path(temp_dir)
+        layout_dump_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "layout_dump"
+        assert layout_dump_dir.exists()
+        layout_dump_files = list(layout_dump_dir.iterdir())
+        assert len(layout_dump_files) == 1
+        assert (layout_dump_dir / "object_detection.json").exists()
+
+        bboxes_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "bboxes"
+        assert bboxes_dir.exists()
+        bboxes_files = list(bboxes_dir.iterdir())
+        assert len(bboxes_files) == 2 * 4  # 2 pages * 4 different layouts per page
+
+        expected_layouts = ["od_model", "ocr", "pdfminer", "final"]
+        expected_pages = [1, 2]
+        for el in expected_layouts:
+            for page in expected_pages:
+                assert bboxes_dir / f"page{page}_layout_{el}.png" in bboxes_files
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -46,9 +46,19 @@
     spooled_to_bytes_io_if_needed,
 )
 from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract
+from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
+    AnalysisDrawer,
+    FinalLayoutDrawer,
+    OCRLayoutDrawer,
+    ODModelLayoutDrawer,
+    PdfminerLayoutDrawer,
+)
+from unstructured.partition.pdf_image.analysis.layout_dump import (
+    JsonLayoutDumper,
+    ObjectDetectionLayoutDumper,
+)
 from unstructured.partition.pdf_image.form_extraction import run_form_extraction
 from unstructured.partition.pdf_image.pdf_image_utils import (
-    annotate_layout_elements,
     check_element_types_to_extract,
     convert_pdf_to_images,
     get_the_last_modification_date_pdf_or_img,
@@ -533,6 +543,13 @@ def _partition_pdf_or_image_local(
             f"(currently {pdf_image_dpi}).",
         )
 
+    pdfminer_drawer: Optional[PdfminerLayoutDrawer] = None
+    od_model_drawer: Optional[ODModelLayoutDrawer] = None
+    ocr_drawer: Optional[OCRLayoutDrawer] = None
+    od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
+    skip_bboxes = env_config.ANALYSIS_BBOX_SKIP
+    skip_dump_od = env_config.ANALYSIS_DUMP_OD_SKIP
+
     if file is None:
         inferred_document_layout = process_file_with_model(
             filename,
@@ -561,15 +578,19 @@ def _partition_pdf_or_image_local(
                     else:
                         analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
                 os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
-                annotate_layout_elements(
-                    inferred_document_layout=inferred_document_layout,
-                    extracted_layout=extracted_layout,
-                    filename=filename,
-                    output_dir_path=analyzed_image_output_dir_path,
-                    pdf_image_dpi=pdf_image_dpi,
-                    is_image=is_image,
-                )
-
+                if not skip_bboxes:
+                    pdfminer_drawer = PdfminerLayoutDrawer(
+                        layout=extracted_layout,
+                    )
+                    od_model_drawer = ODModelLayoutDrawer(
+                        layout=inferred_document_layout,
+                    )
+                    ocr_drawer = OCRLayoutDrawer()
+                if not skip_dump_od:
+                    od_model_layout_dumper = ObjectDetectionLayoutDumper(
+                        layout=inferred_document_layout,
+                        model_name=hi_res_model_name,
+                    )
             # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
             merged_document_layout = merge_inferred_with_extracted_layout(
                 inferred_document_layout=inferred_document_layout,
@@ -586,6 +607,7 @@ def _partition_pdf_or_image_local(
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
                 pdf_image_dpi=pdf_image_dpi,
+                ocr_drawer=ocr_drawer,
             )
     else:
         inferred_document_layout = process_data_with_model(
@@ -609,6 +631,23 @@ def _partition_pdf_or_image_local(
                 else []
             )
 
+            if analysis:
+                if not analyzed_image_output_dir_path:
+                    if env_config.GLOBAL_WORKING_DIR_ENABLED:
+                        analyzed_image_output_dir_path = str(
+                            Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
+                        )
+                    else:
+                        analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
+                os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
+                pdfminer_drawer = PdfminerLayoutDrawer(
+                    layout=extracted_layout,
+                )
+                od_model_drawer = ODModelLayoutDrawer(
+                    layout=inferred_document_layout,
+                )
+                ocr_drawer = OCRLayoutDrawer()
+
             # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
             merged_document_layout = merge_inferred_with_extracted_layout(
                 inferred_document_layout=inferred_document_layout,
@@ -627,6 +666,7 @@ def _partition_pdf_or_image_local(
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
                 pdf_image_dpi=pdf_image_dpi,
+                ocr_drawer=ocr_drawer,
             )
 
     # NOTE(alan): starting with v2, chipper sorts the elements itself.
@@ -715,6 +755,39 @@ def _partition_pdf_or_image_local(
         )
         out_elements.extend(forms)
 
+    if analysis and not skip_bboxes:
+        final_drawer = FinalLayoutDrawer(
+            layout=out_elements,
+        )
+        analysis_drawer = AnalysisDrawer(
+            filename=filename,
+            save_dir=analyzed_image_output_dir_path,
+            draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
+            draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
+            resize=env_config.ANALYSIS_BBOX_RESIZE,
+            format=env_config.ANALYSIS_BBOX_FORMAT,
+        )
+
+        if od_model_drawer:
+            analysis_drawer.add_drawer(od_model_drawer)
+
+        if pdfminer_drawer:
+            analysis_drawer.add_drawer(pdfminer_drawer)
+
+        if ocr_drawer:
+            analysis_drawer.add_drawer(ocr_drawer)
+        analysis_drawer.add_drawer(final_drawer)
+        analysis_drawer.process()
+
+    if analysis and not skip_dump_od:
+        json_layout_dumper = JsonLayoutDumper(
+            filename=filename,
+            save_dir=analyzed_image_output_dir_path,
+        )
+        if od_model_layout_dumper:
+            json_layout_dumper.add_layout_dumper(od_model_layout_dumper)
+        json_layout_dumper.process()
+
     return out_elements
 
 

diff --git a/unstructured/partition/pdf_image/analysis/__init__.py b/unstructured/partition/pdf_image/analysis/__init__.py