|
| 1 | +import httpx |
| 2 | +import json |
1 | 3 | import pytest |
2 | 4 | import requests |
3 | 5 | from deepdiff import DeepDiff |
| 6 | +from httpx import Response |
| 7 | + |
| 8 | +from requests_toolbelt.multipart.decoder import MultipartDecoder # type: ignore |
| 9 | + |
4 | 10 | from unstructured_client import UnstructuredClient |
5 | 11 | from unstructured_client.models import shared, operations |
6 | 12 | from unstructured_client.models.errors import HTTPValidationError |
| 13 | +from unstructured_client.utils.retries import BackoffStrategy, RetryConfig |
| 14 | +from unstructured_client._hooks.custom import form_utils |
| 15 | +from unstructured_client._hooks.custom import split_pdf_hook |
7 | 16 |
|
# Placeholder API key: test_split_pdf_requests_do_retry passes it as api_key_auth
# while httpx.AsyncClient.send is monkeypatched with a mock.
8 | 17 | FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" |
9 | 18 |
|
@@ -275,3 +284,80 @@ def test_integration_split_pdf_strict_mode( |
275 | 284 | ], |
276 | 285 | ) |
277 | 286 | assert len(diff) == 0 |
| 287 | + |
| 288 | + |
@pytest.mark.asyncio
async def test_split_pdf_requests_do_retry(monkeypatch):
    """
    Check that the requests produced by splitting a PDF are retried after retryable errors.

    The mocked transport answers selected split requests with a 502 a fixed
    number of times before succeeding. The test passes only if every one of
    those 502s was consumed and the overall partition call still returns 200.
    """
    # Number of 502 responses each group of requests still has to receive.
    pending_502s = {"first_splits": 2, "last_split": 2}

    async def mock_send(_, request):
        """
        Replacement for httpx.AsyncClient.send that injects a limited number of 502s.

        N-1 splits are sent off in the hook logic and need explicit retry
        handling, while the final split is returned to the SDK and gets the
        built-in retry code. Requests are selected by their
        starting_page_number form field so that both code paths see failures.
        """
        multipart = MultipartDecoder(request.read(), request.headers.get("Content-Type"))
        form_data = form_utils.parse_form_data(multipart)

        def starting_page_is(predicate):
            # Evaluated lazily: the form field is only inspected while 502s remain.
            return (
                "starting_page_number" in form_data
                and predicate(int(form_data["starting_page_number"]))
            )

        # Low starting pages: presumably splits dispatched by the hook itself.
        if pending_502s["first_splits"] > 0 and starting_page_is(lambda page: page < 3):
            pending_502s["first_splits"] -= 1
            return Response(502, request=request)

        # High starting pages: presumably the final split handed back to the SDK.
        if pending_502s["last_split"] > 0 and starting_page_is(lambda page: page > 12):
            pending_502s["last_split"] -= 1
            return Response(502, request=request)

        # No failure left to inject for this request: answer with a canned element list.
        return Response(
            200,
            request=request,
            headers={"Content-Type": "application/json"},
            content=json.dumps([{"type": "Title", "text": "Hello"}]),
        )

    # Patch the AsyncClient class reached through the hook module so no real request is sent.
    monkeypatch.setattr(split_pdf_hook.httpx.AsyncClient, "send", mock_send)

    backoff = BackoffStrategy(200, 1000, 1.5, 1000)
    sdk = UnstructuredClient(
        api_key_auth=FAKE_KEY,
        server_url="localhost:8000",
        retry_config=RetryConfig("backoff", backoff, False),
    )

    filename = "_sample_docs/layout-parser-paper.pdf"
    with open(filename, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    parameters = shared.PartitionParameters(
        files=shared.Files(content=pdf_bytes, file_name=filename),
        split_pdf_page=True,
        split_pdf_allow_failed=False,
        strategy="fast",
    )
    res = await sdk.general.partition_async(request=operations.PartitionRequest(parameters))

    # Every injected 502 must have been served, i.e. both paths really retried.
    assert pending_502s["first_splits"] == 0
    assert pending_502s["last_split"] == 0
    assert res.status_code == 200
0 commit comments