Skip to content

Commit 39b74a2

Browse files
authored
fix(test): Remedy macOS-only test failure not triggered by CI (#2957)
**Summary** A crude and OS-specific mechanism was used to detect when a path represented a temp-file. Change that to be robust across operating systems and localized configurations. The specific problem was for DOC files but this PR fixes it for PPT too which was prone to the same problem.
1 parent 7dea2fa commit 39b74a2

File tree

6 files changed

+37
-6
lines changed

6 files changed

+37
-6
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.13.7-dev3
1+
## 0.13.7-dev4
22

33
### Enhancements
44

@@ -9,6 +9,7 @@
99
### Fixes
1010

1111
* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. While there are legitimate uses for this capability, using it in practice is relatively rare. However, it can happen unintentionally when adjusting cell borders with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.
12+
* **Remedy macOS test failure not triggered by CI.** Generalize temp-file detection beyond hard-coded Linux-specific prefix.
1213

1314
## 0.13.6
1415

test_unstructured/partition/docx/test_doc.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,21 @@ def test_partition_doc_suppresses_modified_date_from_file_by_default(mocker: Moc
251251
assert elements[0].metadata.last_modified is None
252252

253253

254+
def test_partition_doc_pulls_modified_date_from_file_when_date_from_file_object_arg_is_True(
    mocker: MockFixture,
):
    """`last_modified` is taken from the file object when `date_from_file_object=True`."""
    expected_last_modified = "2024-05-01T09:24:28"
    # -- stub the file-object date lookup so the result does not depend on the filesystem --
    mocker.patch(
        "unstructured.partition.doc.get_last_modified_date_from_file",
        return_value=expected_last_modified,
    )

    with open(example_doc_path("fake.doc"), "rb") as doc_file:
        elements = partition_doc(file=doc_file, date_from_file_object=True)

    first_element = elements[0]
    assert first_element.metadata.last_modified == expected_last_modified
267+
268+
254269
def test_partition_doc_from_file_explicit_get_metadata_date(
255270
mocker,
256271
filename="example-docs/fake.doc",

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.13.7-dev3" # pragma: no cover
1+
__version__ = "0.13.7-dev4" # pragma: no cover

unstructured/partition/docx.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,12 @@
6969
is_possible_title,
7070
is_us_city_state_zip,
7171
)
72-
from unstructured.utils import dependency_exists, lazyproperty, requires_dependencies
72+
from unstructured.utils import (
73+
dependency_exists,
74+
is_temp_file_path,
75+
lazyproperty,
76+
requires_dependencies,
77+
)
7378

7479
if dependency_exists("pypandoc"):
7580
import pypandoc
@@ -772,7 +777,7 @@ def _last_modified(self) -> Optional[str]:
772777

773778
# -- if the file is on the filesystem, get its date from there --
774779
if file_path is not None:
775-
return None if file_path.startswith("/tmp") else get_last_modified_date(file_path)
780+
return None if is_temp_file_path(file_path) else get_last_modified_date(file_path)
776781

777782
# -- otherwise, as long as user explicitly requested it, try getting it from the file-like
778783
# -- object (unlikely since BytesIO and its brethren have no such metadata).

unstructured/partition/pptx.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
is_possible_title,
4848
)
4949
from unstructured.partition.utils.constants import PartitionStrategy
50-
from unstructured.utils import lazyproperty
50+
from unstructured.utils import is_temp_file_path, lazyproperty
5151

5252
DETECTION_ORIGIN = "pptx"
5353

@@ -442,7 +442,7 @@ def last_modified(self) -> Optional[str]:
442442
if self._file_path:
443443
return (
444444
None
445-
if self._file_path.startswith("/tmp")
445+
if is_temp_file_path(self._file_path)
446446
else get_last_modified_date(self._file_path)
447447
)
448448

unstructured/utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import os
88
import platform
99
import subprocess
10+
import tempfile
1011
import threading
1112
from datetime import datetime
1213
from functools import wraps
@@ -75,6 +76,15 @@ def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]:
7576
return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""
7677

7778

79+
def is_temp_file_path(file_path: str) -> bool:
    """True when `file_path` is the Python-defined tempdir or is located beneath it.

    The Python-defined temp directory is platform dependent (macOS != Linux != Windows)
    and can also be determined by an environment variable (TMPDIR, TEMP, or TMP).

    The comparison is made on whole path components rather than on a raw string prefix,
    so a sibling directory that merely shares the prefix (e.g. "/tmp-other/x" when the
    tempdir is "/tmp") is not mistaken for a temp-file. Both the lexical path and the
    symlink-resolved path are checked because the tempdir is frequently reached through
    a symlink (e.g. "/var/..." -> "/private/var/..." on macOS).

    An empty `file_path` is never a temp-file path.
    """
    if not file_path:
        return False

    tmp_dir = tempfile.gettempdir()

    def _is_within(path: str, directory: str) -> bool:
        # -- `commonpath()` raises ValueError when the paths cannot be compared, such as
        # -- paths on different Windows drives; those are by definition not nested.
        try:
            return os.path.commonpath([path, directory]) == directory
        except ValueError:
            return False

    # -- lexical comparison: normalizes case and separators but does not touch the disk --
    lexical_path = os.path.normcase(os.path.abspath(file_path))
    lexical_tmp_dir = os.path.normcase(os.path.abspath(tmp_dir))
    if _is_within(lexical_path, lexical_tmp_dir):
        return True

    # -- resolved comparison: catches a tempdir (or file path) expressed via a symlink --
    resolved_path = os.path.normcase(os.path.realpath(file_path))
    resolved_tmp_dir = os.path.normcase(os.path.realpath(tmp_dir))
    return _is_within(resolved_path, resolved_tmp_dir)
86+
87+
7888
class lazyproperty(Generic[_T]):
7989
"""Decorator like @property, but evaluated only on first access.
8090

0 commit comments

Comments
 (0)