Skip to content

Commit

Permalink
fix(auto): partition() passes strategy to DOC,ODT (#3278)
Browse files Browse the repository at this point in the history
**Summary**
Remedy a gap where the `strategy` argument passed to `partition()` was not
forwarded to `partition_doc()` or `partition_odt()`, and so never made its
way to `partition_docx()`.
  • Loading branch information
scanny committed Jun 26, 2024
1 parent 0665e94 commit f2fee0c
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 33 deletions.
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.14.9-dev4
## 0.14.9-dev5

### Enhancements

Expand All @@ -7,6 +7,7 @@
### Fixes

* **Fix a bug where multiple `soffice` processes could be attempted** Add a wait mechanism in `convert_office_doc` so that the function first checks whether another `soffice` process is already running; if so, it waits until that process finishes or the wait timeout elapses before spawning a subprocess to run `soffice`.
* **`partition()` now forwards `strategy` arg to `partition_docx()`, `partition_pptx()`, and their brokering partitioners for DOC, ODT, and PPT formats.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()`, `partition_pptx()`, and their brokering partitioners when those filetypes are detected.

## 0.14.8

Expand All @@ -20,7 +21,6 @@

* **Bump unstructured-inference==0.7.36** Fix `ValueError` when converting cells to html.
* **`partition()` now forwards `strategy` arg to `partition_docx()`, `partition_ppt()`, and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()`, `partition_ppt()`, and `partition_pptx()` when those filetypes are detected.

* **Fix missing sensitive field markers** for embedders

## 0.14.7
Expand Down
54 changes: 24 additions & 30 deletions test_unstructured/partition/test_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
assert elements == expected_docx_elements


@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
@pytest.mark.parametrize(
"strategy",
[
Expand All @@ -187,7 +188,17 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
PartitionStrategy.OCR_ONLY,
],
)
def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureRequest, strategy: str):
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
request: FixtureRequest, file_name: str, strategy: str
):
"""The `strategy` arg value received by `partition()` is received by `partition_docx().
To do this in the brokering-partitioner cases (DOC, ODT) it must make its way to
`partition_doc()` or `partition_odt()` which must then forward it to `partition_docx()`. This
test makes sure it made it all the way.
Note this is 3 file-types X 4 strategies = 12 test-cases.
"""
from unstructured.partition.docx import _DocxPartitioner

def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
Expand All @@ -200,7 +211,7 @@ def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
side_effect=fake_iter_document_elements,
)

(element,) = partition(example_doc_path("simple.docx"), strategy=strategy)
(element,) = partition(example_doc_path(file_name), strategy=strategy)

_iter_elements_.assert_called_once_with(ANY)
assert element.text == f"strategy=={strategy}"
Expand Down Expand Up @@ -589,6 +600,7 @@ def test_auto_partition_pptx_from_filename():
assert elements[0].metadata.file_directory == os.path.split(filename)[0]


@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
@pytest.mark.parametrize(
"strategy",
[
Expand All @@ -598,35 +610,17 @@ def test_auto_partition_pptx_from_filename():
PartitionStrategy.OCR_ONLY,
],
)
def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureRequest, strategy: str):
from unstructured.partition.pptx import _PptxPartitioner

def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
yield Text(f"strategy=={self._opts.strategy}")

_iter_elements_ = method_mock(
request,
_PptxPartitioner,
"_iter_presentation_elements",
side_effect=fake_iter_presentation_elements,
)

(element,) = partition(example_doc_path("simple.pptx"), strategy=strategy)

_iter_elements_.assert_called_once_with(ANY)
assert element.text == f"strategy=={strategy}"
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
request: FixtureRequest, file_name: str, strategy: str
):
"""The `strategy` arg value received by `partition()` is received by `partition_pptx().
To do this in the brokering-partitioner case (PPT) the strategy argument must make its way to
`partition_ppt()` which must then forward it to `partition_pptx()`. This test makes sure it
made it all the way.
@pytest.mark.parametrize(
"strategy",
[
PartitionStrategy.AUTO,
PartitionStrategy.FAST,
PartitionStrategy.HI_RES,
PartitionStrategy.OCR_ONLY,
],
)
def test_partition_forwards_strategy_arg_to_partition_ppt(request: FixtureRequest, strategy: str):
Note this is 2 file-types X 4 strategies = 8 test-cases.
"""
from unstructured.partition.pptx import _PptxPartitioner

def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
Expand All @@ -639,7 +633,7 @@ def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]
side_effect=fake_iter_presentation_elements,
)

(element,) = partition(example_doc_path("fake-power-point.ppt"), strategy=strategy)
(element,) = partition(example_doc_path(file_name), strategy=strategy)

_iter_elements_.assert_called_once_with(ANY)
assert element.text == f"strategy=={strategy}"
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.14.9-dev4" # pragma: no cover
__version__ = "0.14.9-dev5" # pragma: no cover
2 changes: 2 additions & 0 deletions unstructured/partition/auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,7 @@ def partition(
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif filetype == FileType.DOCX:
Expand All @@ -339,6 +340,7 @@ def partition(
languages=languages,
detect_language_per_element=detect_language_per_element,
starting_page_number=starting_page_number,
strategy=strategy,
**kwargs,
)
elif filetype == FileType.EML:
Expand Down

0 comments on commit f2fee0c

Please sign in to comment.