Commit b3389c0

fix: fix "not a valid pdf error" in parallel mode (#186)
This was a subtle bug that came out in the retry logic. When we get a 500 during the `requests.post`, we'll try again. However, the PDF was stored in a `BytesIO`, which had already been read the first time we sent it. The next request sends an empty file, which results in a 400 response masking the original error.

Steps to verify:

* First, checkout `main`
* Start up the API in parallel mode

```
export UNSTRUCTURED_PARALLEL_MODE_ENABLED=true
export UNSTRUCTURED_PARALLEL_MODE_URL=http://localhost:8000/general/v0/general
make run-web-app
```

* Insert a 500 error into `prepline_general/api/general.py:partition_pdf_splits()`

```
# If it's small enough, just process locally
if len(pdf_pages) <= pages_per_pdf:
    raise HTTPException(status_code=500)  # Throw an error here
    return partition(
        file=file,
        file_filename=file_filename,
        content_type=content_type,
        **partition_kwargs
    )
```

* Send a document and see that the 500 is hidden behind a 400 error

```
$ curl 'http://localhost:8000/general/v0/general' --header 'Accept: application/json' --form files=@sample-docs/layout-parser-paper-fast.pdf
{"detail":"layout-parser-paper-fast.pdf does not appear to be a valid PDF"}
```

* Switch to this branch and do it again - you should now get a 500 `Internal server error` response
1 parent f491b85 commit b3389c0
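In short: a `BytesIO` can only be consumed once, so the retry was re-posting an exhausted stream. The sketch below illustrates the failure mode and the fix; `fake_send` is a hypothetical stand-in for the `requests.post` call in parallel mode, not code from this repo.

```python
from io import BytesIO


def fake_send(payload) -> int:
    """Hypothetical stand-in for requests.post: consumes the payload like the real call."""
    data = payload.read() if hasattr(payload, "read") else payload
    # Pretend the downstream worker keeps failing with 500; an empty body instead
    # produces the misleading "not a valid PDF" 400.
    return 400 if not data else 500


pdf_bytes = b"%PDF-1.4 fake document"

# Before the fix: the BytesIO is exhausted after the first attempt.
buffer = BytesIO(pdf_bytes)
print(fake_send(buffer))     # 500 -> triggers a retry
print(fake_send(buffer))     # 400 -> masks the original 500

# After the fix: raw bytes are stored, so every retry re-sends the full document.
print(fake_send(pdf_bytes))  # 500
print(fake_send(pdf_bytes))  # 500 -> the real error surfaces
```

Storing `pdf_buffer.read()` (raw bytes) instead of the `BytesIO` object is exactly what the diffs below do.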

File tree

5 files changed: +28 / -13 lines


CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+## 0.0.36-dev0
+
+* Fix a bug in parallel mode causing `not a valid pdf` errors
+
 ## 0.0.35
 
 * Bump unstructured library to 0.9.2

pipeline-notebooks/pipeline-general.ipynb

Lines changed: 1 addition & 1 deletion
@@ -617,7 +617,7 @@
     "        new_pdf.write(pdf_buffer)\n",
     "        pdf_buffer.seek(0)\n",
     "\n",
-    "        split_pdfs.append((pdf_buffer, offset))\n",
+    "        split_pdfs.append((pdf_buffer.read(), offset))\n",
     "        offset += split_size\n",
     "\n",
     "    return split_pdfs\n",

prepline_general/api/general.py

Lines changed: 2 additions & 2 deletions
@@ -90,7 +90,7 @@ def get_pdf_splits(pdf_pages, split_size=1):
         new_pdf.write(pdf_buffer)
         pdf_buffer.seek(0)
 
-        split_pdfs.append((pdf_buffer, offset))
+        split_pdfs.append((pdf_buffer.read(), offset))
         offset += split_size
 
     return split_pdfs
@@ -484,7 +484,7 @@ def return_content_type(filename):
 
 
 @router.post("/general/v0/general")
-@router.post("/general/v0.0.35/general")
+@router.post("/general/v0.0.36/general")
 def pipeline_1(
     request: Request,
     gz_uncompressed_content_type: Optional[str] = Form(default=None),
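Because `get_pdf_splits` now returns raw bytes, every retry of the downstream `requests.post` re-sends the full page range instead of an empty stream. The helper below is only an illustrative sketch of that retry pattern; `post_with_retries` and its parameters are invented for this example, with the attempt count and backoff mirroring the `UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS` and `UNSTRUCTURED_PARALLEL_RETRY_BACKOFF_TIME` settings exercised by the tests below.

```python
import time

import requests


def post_with_retries(url, pdf_payload: bytes, filename: str,
                      retry_attempts: int = 2, backoff_time: float = 1.0):
    """Illustrative sketch: re-post the same bytes payload on a retryable 500."""
    response = None
    for attempt in range(retry_attempts + 1):
        # pdf_payload is bytes, not a file object, so each attempt sends the
        # complete document rather than whatever is left of an exhausted stream.
        response = requests.post(
            url,
            files={"files": (filename, pdf_payload, "application/pdf")},
        )
        if response.status_code != 500:
            break
        time.sleep(backoff_time * (attempt + 1))  # simple linear backoff between attempts
    return response
```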

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 name: general
-version: 0.0.35
+version: 0.0.36

test_general/api/test_app.py

Lines changed: 20 additions & 9 deletions
@@ -415,7 +415,6 @@ def test_parallel_mode_returns_errors(monkeypatch):
     response = client.post(
         MAIN_API_ROUTE,
         files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))],
-        data={"pdf_processing_mode": "parallel"},
     )
 
     assert response.status_code == 500
@@ -432,13 +431,12 @@ def test_parallel_mode_returns_errors(monkeypatch):
     response = client.post(
         MAIN_API_ROUTE,
         files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))],
-        data={"pdf_processing_mode": "parallel"},
     )
 
     assert response.status_code == 400
 
 
-def test_partition_file_via_api_retry(monkeypatch, mocker):
+def test_partition_file_via_api_will_retry(monkeypatch, mocker):
     """
     Verify number of retries with parallel mode
     """
@@ -449,23 +447,36 @@ def test_partition_file_via_api_retry(monkeypatch, mocker):
     monkeypatch.setenv("UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS", "2")
     monkeypatch.setenv("UNSTRUCTURED_PARALLEL_RETRY_BACKOFF_TIME", "0.1")
 
+    num_calls = 0
+
+    # Return a transient error the first time
+    def mock_response(*args, **kwargs):
+        nonlocal num_calls
+        num_calls += 1
+
+        if num_calls == 1:
+            return MockResponse(status_code=500)
+
+        return MockResponse(status_code=200)
+
     monkeypatch.setattr(
         requests,
         "post",
-        lambda *args, **kwargs: MockResponse(status_code=500),
+        mock_response,
     )
-    mock_sleep = mocker.patch("time.sleep")
+
+    # This needs to be mocked when we return 200
+    mocker.patch("prepline_general.api.general.elements_from_json")
+
     client = TestClient(app)
-    test_file = Path("sample-docs") / "layout-parser-paper.pdf"
+    test_file = Path("sample-docs") / "layout-parser-paper-fast.pdf"
 
     response = client.post(
         MAIN_API_ROUTE,
         files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))],
-        data={"pdf_processing_mode": "parallel"},
     )
 
-    assert response.status_code == 500
-    assert mock_sleep.call_count == 2
+    assert response.status_code == 200
 
 
 def test_partition_file_via_api_no_retryable_error_code(monkeypatch, mocker):
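To exercise just the updated retry path locally, running pytest against `test_general/api/test_app.py` with `-k test_partition_file_via_api_will_retry` should be sufficient (the `mocker` fixture assumes pytest-mock is installed).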
