Skip to content

Commit 16df694

Browse files
authored
fix(auto): partition() passes strategy to PPTX,DOCX (#3273)
**Summary** Remedy a gap where the `strategy` argument passed to `partition()` was not forwarded to `partition_pptx()` or `partition_docx()`.
1 parent 6fe1c99 commit 16df694

File tree

4 files changed

67 additions and 2 deletions

4 files changed

67 additions and 2 deletions

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
## 0.14.8-dev1
1+
## 0.14.8-dev2
22

33
### Enhancements
44

55
### Features
66

77
### Fixes
88

9+
* **`partition()` now forwards `strategy` arg to `partition_docx()` and `partition_pptx()`.** A `strategy` argument passed to `partition()` (or the default value "auto" assigned by `partition()`) is now forwarded to `partition_docx()` and `partition_pptx()` when those filetypes are detected.
10+
911
## 0.14.7
1012

1113
### Enhancements

test_unstructured/partition/test_auto.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# pyright: reportPrivateUsage=false
2+
13
from __future__ import annotations
24

35
import json
@@ -6,6 +8,7 @@
68
import tempfile
79
import warnings
810
from importlib import import_module
11+
from typing import Iterator
912
from unittest.mock import Mock, patch
1013

1114
import docx
@@ -20,10 +23,12 @@
2023
EXPECTED_TEXT_XLSX,
2124
EXPECTED_TITLE,
2225
)
26+
from test_unstructured.unit_utils import ANY, FixtureRequest, example_doc_path, method_mock
2327
from unstructured.chunking.title import chunk_by_title
2428
from unstructured.cleaners.core import clean_extra_whitespace
2529
from unstructured.documents.elements import (
2630
Address,
31+
Element,
2732
ElementMetadata,
2833
ListItem,
2934
NarrativeText,
@@ -173,6 +178,34 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements
173178
assert elements == expected_docx_elements
174179

175180

181+
@pytest.mark.parametrize(
182+
"strategy",
183+
[
184+
PartitionStrategy.AUTO,
185+
PartitionStrategy.FAST,
186+
PartitionStrategy.HI_RES,
187+
PartitionStrategy.OCR_ONLY,
188+
],
189+
)
190+
def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureRequest, strategy: str):
191+
from unstructured.partition.docx import _DocxPartitioner
192+
193+
def fake_iter_document_elements(self: _DocxPartitioner) -> Iterator[Element]:
194+
yield Text(f"strategy=={self._opts.strategy}")
195+
196+
_iter_elements_ = method_mock(
197+
request,
198+
_DocxPartitioner,
199+
"_iter_document_elements",
200+
side_effect=fake_iter_document_elements,
201+
)
202+
203+
(element,) = partition(example_doc_path("simple.docx"), strategy=strategy)
204+
205+
_iter_elements_.assert_called_once_with(ANY)
206+
assert element.text == f"strategy=={strategy}"
207+
208+
176209
@pytest.mark.parametrize(
177210
("pass_metadata_filename", "content_type"),
178211
[(False, None), (False, "text/html"), (True, "text/html"), (True, None)],
@@ -556,6 +589,34 @@ def test_auto_partition_pptx_from_filename():
556589
assert elements[0].metadata.file_directory == os.path.split(filename)[0]
557590

558591

592+
@pytest.mark.parametrize(
593+
"strategy",
594+
[
595+
PartitionStrategy.AUTO,
596+
PartitionStrategy.FAST,
597+
PartitionStrategy.HI_RES,
598+
PartitionStrategy.OCR_ONLY,
599+
],
600+
)
601+
def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureRequest, strategy: str):
602+
from unstructured.partition.pptx import _PptxPartitioner
603+
604+
def fake_iter_presentation_elements(self: _PptxPartitioner) -> Iterator[Element]:
605+
yield Text(f"strategy=={self._opts.strategy}")
606+
607+
_iter_elements_ = method_mock(
608+
request,
609+
_PptxPartitioner,
610+
"_iter_presentation_elements",
611+
side_effect=fake_iter_presentation_elements,
612+
)
613+
614+
(element,) = partition(example_doc_path("simple.pptx"), strategy=strategy)
615+
616+
_iter_elements_.assert_called_once_with(ANY)
617+
assert element.text == f"strategy=={strategy}"
618+
619+
559620
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
560621
def test_auto_partition_ppt_from_filename():
561622
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.8-dev1" # pragma: no cover
1+
__version__ = "0.14.8-dev2" # pragma: no cover

unstructured/partition/auto.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,6 +327,7 @@ def partition(
327327
languages=languages,
328328
detect_language_per_element=detect_language_per_element,
329329
starting_page_number=starting_page_number,
330+
strategy=strategy,
330331
**kwargs,
331332
)
332333
elif filetype == FileType.ODT:
@@ -499,6 +500,7 @@ def partition(
499500
languages=languages,
500501
detect_language_per_element=detect_language_per_element,
501502
starting_page_number=starting_page_number,
503+
strategy=strategy,
502504
**kwargs,
503505
)
504506
elif filetype == FileType.JSON:

0 commit comments

Comments
 (0)