Skip to content

Commit 05a7a36

Browse files
committed
test: added integration test for caching mechanism
1 parent 7fcc964 commit 05a7a36

File tree

1 file changed

+84
-0
lines changed

1 file changed

+84
-0
lines changed

_test_unstructured_client/integration/test_decorators.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
from __future__ import annotations
22

3+
import tempfile
4+
from pathlib import Path
5+
36
import httpx
47
import json
58
import pytest
@@ -102,6 +105,87 @@ def test_integration_split_pdf_has_same_output_as_non_split(
102105
)
103106
assert len(diff) == 0
104107

108+
@pytest.mark.parametrize(
    ("filename", "expected_ok", "strategy"),
    [
        ("_sample_docs/layout-parser-paper.pdf", True, "hi_res"),  # 16 pages
    ],
)
@pytest.mark.parametrize(
    ("use_caching", "cache_dir"),
    [
        (True, None),  # Use default cache dir
        (True, Path(tempfile.gettempdir()) / "test_integration_unstructured_client1"),  # Use custom cache dir
        (False, None),  # Don't use caching
        (False, Path(tempfile.gettempdir()) / "test_integration_unstructured_client2"),  # Don't use caching, use custom cache dir
    ],
)
def test_integration_split_pdf_with_caching(
    filename: str, expected_ok: bool, strategy: str, use_caching: bool,
    cache_dir: Path | None
):
    """Check that split-PDF partitioning with the cache options matches a non-split run.

    The file is partitioned twice against the API at localhost:8000: first
    with ``split_pdf_page=True`` (passing ``split_pdf_cache_tmp_data`` and
    ``split_pdf_cache_dir`` from the parametrization), then with
    ``split_pdf_page=False``. Both responses must agree on element count,
    content type, status code and element content (ignoring ``parent_id``
    and ``element_id``). When ``cache_dir`` is given explicitly, it must not
    exist once the test is done.

    Args:
        filename: Path of the document to partition.
        expected_ok: Whether the split request is expected to succeed.
        strategy: Partitioning strategy passed to the API.
        use_caching: Value passed as ``split_pdf_cache_tmp_data``.
        cache_dir: Value passed as ``split_pdf_cache_dir``; ``None`` leaves
            the choice of directory to the client.
    """
    # Keep the try body to the single call that can raise ConnectionError.
    try:
        response = requests.get("http://localhost:8000/general/docs")
    except requests.exceptions.ConnectionError:
        # pytest.fail instead of `assert False`, which is stripped under -O.
        pytest.fail("The unstructured-api is not running on localhost:8000")
    assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    if not expected_ok:
        # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
        files.file_name += ".pdf"

    parameters = shared.PartitionParameters(
        files=files,
        strategy=strategy,
        languages=["eng"],
        split_pdf_page=True,
        split_pdf_cache_tmp_data=use_caching,
        split_pdf_cache_dir=cache_dir,
    )

    req = operations.PartitionRequest(
        partition_parameters=parameters
    )

    try:
        resp_split = client.general.partition(request=req)
    except (HTTPValidationError, AttributeError) as exc:
        if expected_ok:
            # Report the real error rather than an opaque `assert exc is None`.
            pytest.fail(f"Partitioning {filename} failed unexpectedly: {exc!r}")
        assert "File does not appear to be a valid PDF" in str(exc)
        return

    # Re-run the same request without splitting to get the reference output.
    parameters.split_pdf_page = False

    req = operations.PartitionRequest(
        partition_parameters=parameters
    )

    resp_single = client.general.partition(request=req)

    assert len(resp_split.elements) == len(resp_single.elements)
    assert resp_split.content_type == resp_single.content_type
    assert resp_split.status_code == resp_single.status_code

    diff = DeepDiff(
        t1=resp_split.elements,
        t2=resp_single.elements,
        exclude_regex_paths=[
            r"root\[\d+\]\['metadata'\]\['parent_id'\]",
            r"root\[\d+\]\['element_id'\]",
        ],
    )
    assert len(diff) == 0

    # make sure the cache dir was cleaned if passed explicitly
    if cache_dir is not None:
        assert not cache_dir.exists()
187+
188+
105189

106190
def test_integration_split_pdf_for_file_with_no_name():
107191
"""

0 commit comments

Comments
 (0)