Skip to content

Commit f2fee0c

Browse files
authored
fix(auto): partition() passes strategy to DOC,ODT (#3278)
**Summary** Remedy a gap where the `strategy` argument passed to `partition()` was not forwarded to `partition_doc()` or `partition_odt()`, and therefore never made its way to `partition_docx()`.
1 parent 0665e94 commit f2fee0c

File tree

4 files changed

+29
-33
lines changed

4 files changed

+29
-33
lines changed

CHANGELOG.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
## 0.14.9-dev4
1+
## 0.14.9-dev5
22

33
### Enhancements
44

@@ -7,6 +7,7 @@
77
### Fixes
88

99
* **Fix a bug where multiple `soffice` processes could be attempted** Add a wait mechanism in `convert_office_doc` so that the function first checks whether another `soffice` process is already running; if so, it waits until that process finishes or the wait timeout elapses before spawning a subprocess to run `soffice`.
10+
* **`partition()` now forwards `strategy` arg to `partition_docx()`, `partition_pptx()`, and their brokering partitioners for DOC, ODT, and PPT formats.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()`, `partition_pptx()`, and their brokering partitioners when those filetypes are detected.
1011

1112
## 0.14.8
1213

@@ -20,7 +21,6 @@
2021

2122
* **Bump unstructured-inference==0.7.36** Fix `ValueError` when converting cells to html.
2223
* **`partition()` now forwards `strategy` arg to `partition_docx()`, `partition_ppt()`, and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()`, `partition_ppt()`, and `partition_pptx()` when those filetypes are detected.
23-
2424
* **Fix missing sensitive field markers** for embedders
2525

2626
## 0.14.7

test_unstructured/partition/test_auto.py

Lines changed: 24 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -178,6 +178,7 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
178178
assert elements == expected_docx_elements
179179

180180

181+
@pytest.mark.parametrize("file_name", ["simple.docx", "simple.doc", "simple.odt"])
181182
@pytest.mark.parametrize(
182183
"strategy",
183184
[
@@ -187,7 +188,17 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
187188
PartitionStrategy.OCR_ONLY,
188189
],
189190
)
190-
def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureRequest, strategy: str):
191+
def test_partition_forwards_strategy_arg_to_partition_docx_and_its_brokers(
192+
request: FixtureRequest, file_name: str, strategy: str
193+
):
194+
"""The `strategy` arg value received by `partition()` is received by `partition_docx().
195+
196+
To do this in the brokering-partitioner cases (DOC, ODT) it must make its way to
197+
`partition_doc()` or `partition_odt()` which must then forward it to `partition_docx()`. This
198+
test makes sure it made it all the way.
199+
200+
Note this is 3 file-types X 4 strategies = 12 test-cases.
201+
"""
191202
from unstructured.partition.docx import _DocxPartitioner
192203

193204
def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
@@ -200,7 +211,7 @@ def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
200211
side_effect=fake_iter_document_elements,
201212
)
202213

203-
(element,) = partition(example_doc_path("simple.docx"), strategy=strategy)
214+
(element,) = partition(example_doc_path(file_name), strategy=strategy)
204215

205216
_iter_elements_.assert_called_once_with(ANY)
206217
assert element.text == f"strategy=={strategy}"
@@ -589,6 +600,7 @@ def test_auto_partition_pptx_from_filename():
589600
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
590601

591602

603+
@pytest.mark.parametrize("file_name", ["simple.pptx", "fake-power-point.ppt"])
592604
@pytest.mark.parametrize(
593605
"strategy",
594606
[
@@ -598,35 +610,17 @@ def test_auto_partition_pptx_from_filename():
598610
PartitionStrategy.OCR_ONLY,
599611
],
600612
)
601-
def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureRequest, strategy: str):
602-
from unstructured.partition.pptx import _PptxPartitioner
603-
604-
def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
605-
yield Text(f"strategy=={self._opts.strategy}")
606-
607-
_iter_elements_ = method_mock(
608-
request,
609-
_PptxPartitioner,
610-
"_iter_presentation_elements",
611-
side_effect=fake_iter_presentation_elements,
612-
)
613-
614-
(element,) = partition(example_doc_path("simple.pptx"), strategy=strategy)
615-
616-
_iter_elements_.assert_called_once_with(ANY)
617-
assert element.text == f"strategy=={strategy}"
613+
def test_partition_forwards_strategy_arg_to_partition_pptx_and_its_brokers(
614+
request: FixtureRequest, file_name: str, strategy: str
615+
):
616+
"""The `strategy` arg value received by `partition()` is received by `partition_pptx().
618617
618+
To do this in the brokering-partitioner case (PPT) the strategy argument must make its way to
619+
`partition_ppt()` which must then forward it to `partition_pptx()`. This test makes sure it
620+
made it all the way.
619621
620-
@pytest.mark.parametrize(
621-
"strategy",
622-
[
623-
PartitionStrategy.AUTO,
624-
PartitionStrategy.FAST,
625-
PartitionStrategy.HI_RES,
626-
PartitionStrategy.OCR_ONLY,
627-
],
628-
)
629-
def test_partition_forwards_strategy_arg_to_partition_ppt(request: FixtureRequest, strategy: str):
622+
Note this is 2 file-types X 4 strategies = 8 test-cases.
623+
"""
630624
from unstructured.partition.pptx import _PptxPartitioner
631625

632626
def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
@@ -639,7 +633,7 @@ def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]
639633
side_effect=fake_iter_presentation_elements,
640634
)
641635

642-
(element,) = partition(example_doc_path("fake-power-point.ppt"), strategy=strategy)
636+
(element,) = partition(example_doc_path(file_name), strategy=strategy)
643637

644638
_iter_elements_.assert_called_once_with(ANY)
645639
assert element.text == f"strategy=={strategy}"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
__version__ = "0.14.9-dev4" # pragma: no cover
1+
__version__ = "0.14.9-dev5" # pragma: no cover

unstructured/partition/auto.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -316,6 +316,7 @@ def partition(
316316
languages=languages,
317317
detect_language_per_element=detect_language_per_element,
318318
starting_page_number=starting_page_number,
319+
strategy=strategy,
319320
**kwargs,
320321
)
321322
elif filetype == FileType.DOCX:
@@ -339,6 +340,7 @@ def partition(
339340
languages=languages,
340341
detect_language_per_element=detect_language_per_element,
341342
starting_page_number=starting_page_number,
343+
strategy=strategy,
342344
**kwargs,
343345
)
344346
elif filetype == FileType.EML:

0 commit comments

Comments
 (0)