Skip to content

Commit

Permalink
feat: enhance analysis options with od model dump and better vis (#3234)
Browse files Browse the repository at this point in the history
This PR adds the ability to draw bounding boxes for each layout
(extracted, inferred, OCR, and final) and to dump the OD model output
to a JSON file for easier analysis.

---------

Co-authored-by: Christine Straub <[email protected]>
Co-authored-by: Michal Martyniak <[email protected]>
  • Loading branch information
3 people committed Jun 26, 2024
1 parent f2fee0c commit 575957b
Show file tree
Hide file tree
Showing 12 changed files with 1,127 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,6 @@ examples/**/output/

outputdiff.txt
metricsdiff.txt

# analysis
annotated/
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

### Enhancements

* **Added visualization and OD model result dump for PDF** In the PDF `hi_res` strategy, the `analysis` parameter can be used
to visualize the result of the OD model and dump the result to a file.
Additionally, the bounding boxes of each layout source are rendered and saved
for each page.

### Features

### Fixes
Expand Down
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ COPY unstructured unstructured
COPY test_unstructured test_unstructured
COPY example-docs example-docs

RUN chown -R notebook-user:notebook-user /app && ln -s /usr/bin/python3.11 /usr/bin/python3
RUN chown -R notebook-user:notebook-user /app && \
apk add font-ubuntu && \
fc-cache -fv && \
ln -s /usr/bin/python3.11 /usr/bin/python3

USER notebook-user

Expand Down
154 changes: 154 additions & 0 deletions test_unstructured/partition/pdf_image/test_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import numpy as np
import pytest
from PIL import Image
from unstructured_inference.inference.elements import Rectangle
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
from unstructured_inference.inference.layoutelement import LayoutElement

from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
TextAlignment,
get_bbox_text_size,
get_bbox_thickness,
get_label_rect_and_coords,
get_rgb_color,
get_text_color,
)
from unstructured.partition.pdf_image.analysis.layout_dump import ObjectDetectionLayoutDumper


@pytest.mark.parametrize("color", ["red", "green", "blue", "yellow", "black", "white"])
def test_get_rgb_color(color: str):
    """A named color resolves to a 3-tuple of integer channels within 0-255."""
    rgb = get_rgb_color(color)

    assert isinstance(rgb, tuple)
    assert len(rgb) == 3
    # Validate types for every channel first, then the value range.
    for channel in rgb:
        assert isinstance(channel, int)
    for channel in rgb:
        assert 0 <= channel <= 255


@pytest.mark.parametrize(
    ("bbox", "expected_text_size"),
    [
        ((0, 0, 90, 90), 17),
        ((0, 0, 500, 200), 21),
        ((0, 0, 10000, 10000), 32),
    ],
)
def test_get_bbox_text_size(bbox: tuple[int, int, int, int], expected_text_size):
    """`get_bbox_text_size` returns the expected size for a bbox on a 1700x2200 page."""
    standard_page_size = (1700, 2200)  # standard size of a page

    assert get_bbox_text_size(bbox, standard_page_size) == expected_text_size


@pytest.mark.parametrize(
    ("bbox", "expected_box_thickness"),
    [
        ((0, 0, 90, 90), 1),
        ((0, 0, 450, 250), 2),
        ((0, 0, 600, 1000), 3),
    ],
)
def test_get_bbox_thickness(bbox: tuple[int, int, int, int], expected_box_thickness):
    """`get_bbox_thickness` returns the expected thickness for a bbox on a 1700x2200 page."""
    standard_page_size = (1700, 2200)  # standard size of a page

    assert get_bbox_thickness(bbox, standard_page_size) == expected_box_thickness


@pytest.mark.parametrize(
    ("color", "expected_text_color"),
    [
        ("navy", "white"),
        ("crimson", "white"),
        ("maroon", "white"),
        ("dimgray", "white"),
        ("darkgreen", "white"),
        ("darkcyan", "white"),
        ("fuchsia", "white"),
        ("violet", "black"),
        ("gold", "black"),
        ("aqua", "black"),
        ("greenyellow", "black"),
    ],
)
def test_best_text_color(color, expected_text_color):
    """The RGB text color returned by `get_text_color` matches the expected named color."""
    input_rgb = get_rgb_color(color)
    wanted_rgb = get_rgb_color(expected_text_color)

    # get_text_color returns a pair; only its second item (an RGB tuple) is checked here.
    _, chosen_rgb = get_text_color(input_rgb)

    assert chosen_rgb == wanted_rgb


@pytest.mark.parametrize(
    ("alignment", "expected_text_bbox"),
    [
        (TextAlignment.CENTER, ((145, 145), (155, 155))),
        (TextAlignment.TOP_LEFT, ((100, 90), (120, 100))),
        (TextAlignment.TOP_RIGHT, ((180, 100), (200, 110))),
        (TextAlignment.BOTTOM_LEFT, ((100, 190), (120, 200))),
        (TextAlignment.BOTTOM_RIGHT, ((180, 190), (200, 200))),
    ],
)
def test_get_text_bbox(alignment, expected_text_bbox):
    """The label rectangle for each alignment lands near the expected corner points."""
    # Second return value (the text coordinates) is not checked by this test.
    label_rect, _ = get_label_rect_and_coords(
        alignment=alignment,
        bbox_points=(100, 100, 200, 200),
        text_width=10,
        text_height=10,
    )

    # A generous absolute tolerance absorbs the text-based extension of the bbox.
    assert np.allclose(label_rect, expected_text_bbox, atol=10)


def test_od_document_layout_dump():
    """Dumping a two-page object-detection layout yields the expected pages and a class list."""

    def build_page(number, detections):
        # Each page uses a 1x1 placeholder image and reports a 100x100 size in its metadata.
        page = PageLayout(
            number=number,
            image=Image.new("1", (1, 1)),
            image_metadata={"width": 100, "height": 100},
        )
        page.elements = [
            LayoutElement(
                type=label,
                bbox=Rectangle(x1=x1, y1=y1, x2=x2, y2=y2),
                prob=prob,
            )
            for label, (x1, y1, x2, y2), prob in detections
        ]
        return page

    first_page = build_page(
        1,
        [("Title", (0, 0, 10, 10), 0.7), ("Paragraph", (0, 100, 10, 110), 0.8)],
    )
    second_page = build_page(
        2,
        [("Table", (0, 0, 10, 10), 0.9), ("Image", (0, 100, 10, 110), 1.0)],
    )
    od_document_layout = DocumentLayout(pages=[first_page, second_page])

    # The expectation is spelled out literally so it stays independent of the input builder.
    expected_dump = {
        "pages": [
            {
                "number": 1,
                "size": {"width": 100, "height": 100},
                "elements": [
                    {"bbox": [0, 0, 10, 10], "type": "Title", "prob": 0.7},
                    {"bbox": [0, 100, 10, 110], "type": "Paragraph", "prob": 0.8},
                ],
            },
            {
                "number": 2,
                "size": {"width": 100, "height": 100},
                "elements": [
                    {"bbox": [0, 0, 10, 10], "type": "Table", "prob": 0.9},
                    {"bbox": [0, 100, 10, 110], "type": "Image", "prob": 1.0},
                ],
            },
        ]
    }

    actual_dump = ObjectDetectionLayoutDumper(od_document_layout).dump()

    assert {"pages": actual_dump.get("pages")} == expected_dump

    # check OD model classes are attached but do not depend on a specific model instance
    assert "object_detection_classes" in actual_dump
    assert len(actual_dump["object_detection_classes"]) > 0
30 changes: 30 additions & 0 deletions test_unstructured/partition/pdf_image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import math
import os
import tempfile
from pathlib import Path
from tempfile import SpooledTemporaryFile
from unittest import mock

Expand Down Expand Up @@ -1322,3 +1323,32 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids):
)
ids = [element.id for element in elements]
assert ids == expected_ids, "Element IDs do not match expected IDs"


def test_analysis_artifacts_saved():
    """With `analysis=True`, hi_res partitioning writes the OD dump and per-page bbox images."""
    with tempfile.TemporaryDirectory() as temp_dir:
        pdf.partition_pdf(
            filename=example_doc_path("layout-parser-paper-fast.pdf"),
            strategy=PartitionStrategy.HI_RES,
            analysis=True,
            analyzed_image_output_dir_path=temp_dir,
        )

        # Artifacts are expected under <output dir>/analysis/<document stem>/.
        artifacts_root = Path(temp_dir) / "analysis" / "layout-parser-paper-fast"

        layout_dump_dir = artifacts_root / "layout_dump"
        assert layout_dump_dir.exists()
        assert len(list(layout_dump_dir.iterdir())) == 1
        assert (layout_dump_dir / "object_detection.json").exists()

        bboxes_dir = artifacts_root / "bboxes"
        assert bboxes_dir.exists()
        rendered_files = list(bboxes_dir.iterdir())
        assert len(rendered_files) == 2 * 4  # 2 pages * 4 different layouts per page

        for el in ("od_model", "ocr", "pdfminer", "final"):
            for page in (1, 2):
                assert bboxes_dir / f"page{page}_layout_{el}.png" in rendered_files
93 changes: 83 additions & 10 deletions unstructured/partition/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,19 @@
spooled_to_bytes_io_if_needed,
)
from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
AnalysisDrawer,
FinalLayoutDrawer,
OCRLayoutDrawer,
ODModelLayoutDrawer,
PdfminerLayoutDrawer,
)
from unstructured.partition.pdf_image.analysis.layout_dump import (
JsonLayoutDumper,
ObjectDetectionLayoutDumper,
)
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
from unstructured.partition.pdf_image.pdf_image_utils import (
annotate_layout_elements,
check_element_types_to_extract,
convert_pdf_to_images,
get_the_last_modification_date_pdf_or_img,
Expand Down Expand Up @@ -533,6 +543,13 @@ def _partition_pdf_or_image_local(
f"(currently {pdf_image_dpi}).",
)

pdfminer_drawer: Optional[PdfminerLayoutDrawer] = None
od_model_drawer: Optional[ODModelLayoutDrawer] = None
ocr_drawer: Optional[OCRLayoutDrawer] = None
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
skip_bboxes = env_config.ANALYSIS_BBOX_SKIP
skip_dump_od = env_config.ANALYSIS_DUMP_OD_SKIP

if file is None:
inferred_document_layout = process_file_with_model(
filename,
Expand Down Expand Up @@ -561,15 +578,19 @@ def _partition_pdf_or_image_local(
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
annotate_layout_elements(
inferred_document_layout=inferred_document_layout,
extracted_layout=extracted_layout,
filename=filename,
output_dir_path=analyzed_image_output_dir_path,
pdf_image_dpi=pdf_image_dpi,
is_image=is_image,
)

if not skip_bboxes:
pdfminer_drawer = PdfminerLayoutDrawer(
layout=extracted_layout,
)
od_model_drawer = ODModelLayoutDrawer(
layout=inferred_document_layout,
)
ocr_drawer = OCRLayoutDrawer()
if not skip_dump_od:
od_model_layout_dumper = ObjectDetectionLayoutDumper(
layout=inferred_document_layout,
model_name=hi_res_model_name,
)
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
Expand All @@ -586,6 +607,7 @@ def _partition_pdf_or_image_local(
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_drawer=ocr_drawer,
)
else:
inferred_document_layout = process_data_with_model(
Expand All @@ -609,6 +631,23 @@ def _partition_pdf_or_image_local(
else []
)

if analysis:
if not analyzed_image_output_dir_path:
if env_config.GLOBAL_WORKING_DIR_ENABLED:
analyzed_image_output_dir_path = str(
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
)
else:
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
pdfminer_drawer = PdfminerLayoutDrawer(
layout=extracted_layout,
)
od_model_drawer = ODModelLayoutDrawer(
layout=inferred_document_layout,
)
ocr_drawer = OCRLayoutDrawer()

# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
merged_document_layout = merge_inferred_with_extracted_layout(
inferred_document_layout=inferred_document_layout,
Expand All @@ -627,6 +666,7 @@ def _partition_pdf_or_image_local(
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
pdf_image_dpi=pdf_image_dpi,
ocr_drawer=ocr_drawer,
)

# NOTE(alan): starting with v2, chipper sorts the elements itself.
Expand Down Expand Up @@ -715,6 +755,39 @@ def _partition_pdf_or_image_local(
)
out_elements.extend(forms)

if analysis and not skip_bboxes:
final_drawer = FinalLayoutDrawer(
layout=out_elements,
)
analysis_drawer = AnalysisDrawer(
filename=filename,
save_dir=analyzed_image_output_dir_path,
draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
resize=env_config.ANALYSIS_BBOX_RESIZE,
format=env_config.ANALYSIS_BBOX_FORMAT,
)

if od_model_drawer:
analysis_drawer.add_drawer(od_model_drawer)

if pdfminer_drawer:
analysis_drawer.add_drawer(pdfminer_drawer)

if ocr_drawer:
analysis_drawer.add_drawer(ocr_drawer)
analysis_drawer.add_drawer(final_drawer)
analysis_drawer.process()

if analysis and not skip_dump_od:
json_layout_dumper = JsonLayoutDumper(
filename=filename,
save_dir=analyzed_image_output_dir_path,
)
if od_model_layout_dumper:
json_layout_dumper.add_layout_dumper(od_model_layout_dumper)
json_layout_dumper.process()

return out_elements


Expand Down
Empty file.
Loading

0 comments on commit 575957b

Please sign in to comment.