Skip to content

Commit c5f7119

Browse files
authored
catch 500 error for fast strategy (#316)
Closes #314. We no longer support any fallback for the fast strategy on images, and `unstructured` raises a [ValueError](https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/strategies.py#L22) if a partition call uses the fast strategy on an image file. This change catches that error and raises a 400 HTTPException instead of letting it surface as a 500. It also refactors `pipeline_api` to extract some of the argument validation into helper functions.
1 parent 6c01cb9 commit c5f7119

File tree

2 files changed

+104
-55
lines changed

2 files changed

+104
-55
lines changed

prepline_general/api/general.py

Lines changed: 89 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -310,51 +310,20 @@ def pipeline_api(
310310

311311
logger.debug(f"filetype: {file_content_type}")
312312

313-
# Reject traffic when free memory is below minimum
314-
# Default to 2GB
315-
mem = psutil.virtual_memory()
316-
memory_free_minimum = int(os.environ.get("UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB", 2048))
317-
318-
if mem.available <= memory_free_minimum * 1024 * 1024:
319-
raise HTTPException(
320-
status_code=503, detail="Server is under heavy load. Please try again later."
321-
)
313+
_check_free_memory()
322314

323315
if file_content_type == "application/pdf":
324-
try:
325-
pdf = PdfReader(file)
326-
327-
# This will raise if the file is encrypted
328-
pdf.metadata
329-
except pypdf.errors.FileNotDecryptedError:
330-
raise HTTPException(
331-
status_code=400,
332-
detail="File is encrypted. Please decrypt it with password.",
333-
)
334-
except pypdf.errors.PdfReadError:
335-
raise HTTPException(status_code=400, detail="File does not appear to be a valid PDF")
336-
337-
strategy = (m_strategy[0] if len(m_strategy) else "auto").lower()
338-
strategies = ["fast", "hi_res", "auto", "ocr_only"]
339-
if strategy not in strategies:
340-
raise HTTPException(
341-
status_code=400, detail=f"Invalid strategy: {strategy}. Must be one of {strategies}"
342-
)
316+
pdf = _check_pdf(file)
343317

344318
show_coordinates_str = (m_coordinates[0] if len(m_coordinates) else "false").lower()
345319
show_coordinates = show_coordinates_str == "true"
346320

347-
hi_res_model_name = m_hi_res_model_name[0] if len(m_hi_res_model_name) else None
348-
349-
# Make sure chipper aliases to the latest model
350-
if hi_res_model_name and hi_res_model_name == "chipper":
351-
hi_res_model_name = "chipperv2"
352-
353-
if hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES and show_coordinates:
354-
raise HTTPException(
355-
status_code=400,
356-
detail=f"coordinates aren't available when using the {hi_res_model_name} model type",
357-
)
321+
hi_res_model_name = _validate_hi_res_model_name(m_hi_res_model_name, show_coordinates)
322+
strategy = _validate_strategy(m_strategy)
323+
chunking_strategy = _validate_chunking_strategy(m_chunking_strategy)
324+
pdf_infer_table_structure = _set_pdf_infer_table_structure(
325+
m_pdf_infer_table_structure, strategy
326+
)
358327

359328
# Parallel mode is set by env variable
360329
enable_parallel_mode = os.environ.get("UNSTRUCTURED_PARALLEL_MODE_ENABLED", "false")
@@ -372,26 +341,10 @@ def pipeline_api(
372341
xml_keep_tags_str = (m_xml_keep_tags[0] if len(m_xml_keep_tags) else "false").lower()
373342
xml_keep_tags = xml_keep_tags_str == "true"
374343

375-
pdf_infer_table_structure = (
376-
m_pdf_infer_table_structure[0] if len(m_pdf_infer_table_structure) else "false"
377-
).lower()
378-
if strategy == "hi_res" and pdf_infer_table_structure == "true":
379-
pdf_infer_table_structure = True
380-
else:
381-
pdf_infer_table_structure = False
382-
383344
skip_infer_table_types = (
384345
m_skip_infer_table_types[0] if len(m_skip_infer_table_types) else ["pdf", "jpg", "png"]
385346
)
386347

387-
chunking_strategy = m_chunking_strategy[0].lower() if len(m_chunking_strategy) else None
388-
chunk_strategies = ["by_title"]
389-
if chunking_strategy and (chunking_strategy not in chunk_strategies):
390-
raise HTTPException(
391-
status_code=400,
392-
detail=f"Invalid chunking strategy: {chunking_strategy}. Must be one of {chunk_strategies}",
393-
)
394-
395348
multipage_sections_str = (
396349
m_multipage_sections[0] if len(m_multipage_sections) else "true"
397350
).lower()
@@ -500,6 +453,11 @@ def pipeline_api(
500453
status_code=400,
501454
detail="Json schema does not match the Unstructured schema",
502455
)
456+
if "fast strategy is not available for image files" in e.args[0]:
457+
raise HTTPException(
458+
status_code=400,
459+
detail="The fast strategy is not available for image files",
460+
)
503461

504462
raise e
505463
except zipfile.BadZipFile:
@@ -534,6 +492,82 @@ def pipeline_api(
534492
return result
535493

536494

495+
def _check_free_memory():
    """Reject the request when available memory is at or below the minimum.

    The minimum is read, in megabytes, from the
    UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB environment variable and defaults
    to 2048 (2GB). Raises a 503 HTTPException when the threshold is not met.
    """
    memory_stats = psutil.virtual_memory()
    minimum_mb = int(os.environ.get("UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB", 2048))
    minimum_bytes = minimum_mb * 1024 * 1024

    # Enough headroom: let the request through
    if memory_stats.available > minimum_bytes:
        return

    raise HTTPException(
        status_code=503, detail="Server is under heavy load. Please try again later."
    )
505+
506+
507+
def _check_pdf(file):
    """Open `file` as a PDF and verify that it is readable and not encrypted.

    Returns the PdfReader built from `file`.

    Raises a 400 HTTPException when the file is encrypted and has not been
    decrypted, or when pypdf reports that it cannot be read as a PDF. The
    original pypdf error is attached as the cause of the HTTPException.
    """
    try:
        pdf = PdfReader(file)

        # This will raise if the file is encrypted
        pdf.metadata
    # NOTE(review): handler order is kept from the original; confirm the pypdf
    # exception hierarchy before reordering these clauses.
    except pypdf.errors.FileNotDecryptedError as err:
        raise HTTPException(
            status_code=400,
            detail="File is encrypted. Please decrypt it with password.",
        ) from err
    except pypdf.errors.PdfReadError as err:
        raise HTTPException(
            status_code=400, detail="File does not appear to be a valid PDF"
        ) from err

    return pdf
522+
523+
524+
def _validate_strategy(m_strategy):
525+
strategy = (m_strategy[0] if len(m_strategy) else "auto").lower()
526+
strategies = ["fast", "hi_res", "auto", "ocr_only"]
527+
if strategy not in strategies:
528+
raise HTTPException(
529+
status_code=400, detail=f"Invalid strategy: {strategy}. Must be one of {strategies}"
530+
)
531+
return strategy
532+
533+
534+
def _validate_hi_res_model_name(m_hi_res_model_name, show_coordinates):
535+
hi_res_model_name = m_hi_res_model_name[0] if len(m_hi_res_model_name) else None
536+
537+
# Make sure chipper aliases to the latest model
538+
if hi_res_model_name and hi_res_model_name == "chipper":
539+
hi_res_model_name = "chipperv2"
540+
541+
if hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES and show_coordinates:
542+
raise HTTPException(
543+
status_code=400,
544+
detail=f"coordinates aren't available when using the {hi_res_model_name} model type",
545+
)
546+
return hi_res_model_name
547+
548+
549+
def _validate_chunking_strategy(m_chunking_strategy):
550+
chunking_strategy = m_chunking_strategy[0].lower() if len(m_chunking_strategy) else None
551+
chunk_strategies = ["by_title"]
552+
if chunking_strategy and (chunking_strategy not in chunk_strategies):
553+
raise HTTPException(
554+
status_code=400,
555+
detail=f"Invalid chunking strategy: {chunking_strategy}. Must be one of {chunk_strategies}",
556+
)
557+
return chunking_strategy
558+
559+
560+
def _set_pdf_infer_table_structure(m_pdf_infer_table_structure, strategy):
561+
pdf_infer_table_structure = (
562+
m_pdf_infer_table_structure[0] if len(m_pdf_infer_table_structure) else "false"
563+
).lower()
564+
if strategy == "hi_res" and pdf_infer_table_structure == "true":
565+
pdf_infer_table_structure = True
566+
else:
567+
pdf_infer_table_structure = False
568+
return pdf_infer_table_structure
569+
570+
537571
def get_validated_mimetype(file):
538572
"""
539573
Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too

test_general/api/test_app.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -813,3 +813,18 @@ def make_request(*args):
813813
# Assert only one call got through
814814
assert status_codes.count(200) == 1
815815
assert status_codes.count(503) == 2
816+
817+
818+
def test_invalid_strategy_for_image_file():
    """
    Verify that we get a 400 error if we use "strategy=fast" with an image file
    """
    client = TestClient(app)
    test_file = Path("sample-docs") / "layout-parser-paper-fast.jpg"

    # Open the sample inside a context manager so the handle is closed even
    # if the request itself raises.
    with open(test_file, "rb") as image_file:
        resp = client.post(
            MAIN_API_ROUTE,
            files=[("files", (str(test_file), image_file))],
            data={"strategy": "fast"},
        )

    assert resp.status_code == 400
    assert "fast strategy is not available for image files" in resp.text

0 commit comments

Comments
 (0)