|
| 1 | +# pyright: reportPrivateUsage=false |
| 2 | + |
1 | 3 | from __future__ import annotations |
2 | 4 |
|
3 | 5 | import json |
|
6 | 8 | import tempfile |
7 | 9 | import warnings |
8 | 10 | from importlib import import_module |
| 11 | +from typing import Iterator |
9 | 12 | from unittest.mock import Mock, patch |
10 | 13 |
|
11 | 14 | import docx |
|
20 | 23 | EXPECTED_TEXT_XLSX, |
21 | 24 | EXPECTED_TITLE, |
22 | 25 | ) |
| 26 | +from test_unstructured.unit_utils import ANY, FixtureRequest, example_doc_path, method_mock |
23 | 27 | from unstructured.chunking.title import chunk_by_title |
24 | 28 | from unstructured.cleaners.core import clean_extra_whitespace |
25 | 29 | from unstructured.documents.elements import ( |
26 | 30 | Address, |
| 31 | + Element, |
27 | 32 | ElementMetadata, |
28 | 33 | ListItem, |
29 | 34 | NarrativeText, |
@@ -173,6 +178,34 @@ def test_auto_partition_doc_with_file(mock_docx_document, expected_docx_elements |
173 | 178 | assert elements == expected_docx_elements |
174 | 179 |
|
175 | 180 |
|
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO, PartitionStrategy.FAST,
        PartitionStrategy.HI_RES, PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_docx(request: FixtureRequest, strategy: str):
    """The `strategy` passed to `partition()` is the one the DOCX partitioner sees.

    `_DocxPartitioner._iter_document_elements()` is replaced with a stand-in that emits a single
    `Text` element whose text records `self._opts.strategy`, so the value that reached the
    partitioner can be read back from the one element `partition()` returns.
    """
    from unstructured.partition.docx import _DocxPartitioner

    def _echo_strategy(self: _DocxPartitioner) -> Iterator[Element]:
        # -- report the strategy this partitioner instance was configured with --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_document_elements_ = method_mock(
        request, _DocxPartitioner, "_iter_document_elements", side_effect=_echo_strategy
    )
    docx_path = example_doc_path("simple.docx")

    # -- unpacking fails unless exactly one element comes back --
    [element] = partition(docx_path, strategy=strategy)

    iter_document_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
| 207 | + |
| 208 | + |
176 | 209 | @pytest.mark.parametrize( |
177 | 210 | ("pass_metadata_filename", "content_type"), |
178 | 211 | [(False, None), (False, "text/html"), (True, "text/html"), (True, None)], |
@@ -556,6 +589,34 @@ def test_auto_partition_pptx_from_filename(): |
556 | 589 | assert elements[0].metadata.file_directory == os.path.split(filename)[0] |
557 | 590 |
|
558 | 591 |
|
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO, PartitionStrategy.FAST,
        PartitionStrategy.HI_RES, PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_forwards_strategy_arg_to_partition_pptx(request: FixtureRequest, strategy: str):
    """The `strategy` passed to `partition()` is the one the PPTX partitioner sees.

    `_PptxPartitioner._iter_presentation_elements()` is replaced with a stand-in that emits a
    single `Text` element whose text records `self._opts.strategy`, so the value that reached the
    partitioner can be read back from the one element `partition()` returns.
    """
    from unstructured.partition.pptx import _PptxPartitioner

    def _echo_strategy(self: _PptxPartitioner) -> Iterator[Element]:
        # -- report the strategy this partitioner instance was configured with --
        yield Text(f"strategy=={self._opts.strategy}")

    iter_presentation_elements_ = method_mock(
        request, _PptxPartitioner, "_iter_presentation_elements", side_effect=_echo_strategy
    )
    pptx_path = example_doc_path("simple.pptx")

    # -- unpacking fails unless exactly one element comes back --
    [element] = partition(pptx_path, strategy=strategy)

    iter_presentation_elements_.assert_called_once_with(ANY)
    assert element.text == f"strategy=={strategy}"
| 618 | + |
| 619 | + |
559 | 620 | @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container") |
560 | 621 | def test_auto_partition_ppt_from_filename(): |
561 | 622 | filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt") |
|
0 commit comments