@@ -310,51 +310,20 @@ def pipeline_api(
310310
311311 logger .debug (f"filetype: { file_content_type } " )
312312
313- # Reject traffic when free memory is below minimum
314- # Default to 2GB
315- mem = psutil .virtual_memory ()
316- memory_free_minimum = int (os .environ .get ("UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB" , 2048 ))
317-
318- if mem .available <= memory_free_minimum * 1024 * 1024 :
319- raise HTTPException (
320- status_code = 503 , detail = "Server is under heavy load. Please try again later."
321- )
313+ _check_free_memory ()
322314
323315 if file_content_type == "application/pdf" :
324- try :
325- pdf = PdfReader (file )
326-
327- # This will raise if the file is encrypted
328- pdf .metadata
329- except pypdf .errors .FileNotDecryptedError :
330- raise HTTPException (
331- status_code = 400 ,
332- detail = "File is encrypted. Please decrypt it with password." ,
333- )
334- except pypdf .errors .PdfReadError :
335- raise HTTPException (status_code = 400 , detail = "File does not appear to be a valid PDF" )
336-
337- strategy = (m_strategy [0 ] if len (m_strategy ) else "auto" ).lower ()
338- strategies = ["fast" , "hi_res" , "auto" , "ocr_only" ]
339- if strategy not in strategies :
340- raise HTTPException (
341- status_code = 400 , detail = f"Invalid strategy: { strategy } . Must be one of { strategies } "
342- )
316+ pdf = _check_pdf (file )
343317
344318 show_coordinates_str = (m_coordinates [0 ] if len (m_coordinates ) else "false" ).lower ()
345319 show_coordinates = show_coordinates_str == "true"
346320
347- hi_res_model_name = m_hi_res_model_name [0 ] if len (m_hi_res_model_name ) else None
348-
349- # Make sure chipper aliases to the latest model
350- if hi_res_model_name and hi_res_model_name == "chipper" :
351- hi_res_model_name = "chipperv2"
352-
353- if hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES and show_coordinates :
354- raise HTTPException (
355- status_code = 400 ,
356- detail = f"coordinates aren't available when using the { hi_res_model_name } model type" ,
357- )
321+ hi_res_model_name = _validate_hi_res_model_name (m_hi_res_model_name , show_coordinates )
322+ strategy = _validate_strategy (m_strategy )
323+ chunking_strategy = _validate_chunking_strategy (m_chunking_strategy )
324+ pdf_infer_table_structure = _set_pdf_infer_table_structure (
325+ m_pdf_infer_table_structure , strategy
326+ )
358327
359328 # Parallel mode is set by env variable
360329 enable_parallel_mode = os .environ .get ("UNSTRUCTURED_PARALLEL_MODE_ENABLED" , "false" )
@@ -372,26 +341,10 @@ def pipeline_api(
372341 xml_keep_tags_str = (m_xml_keep_tags [0 ] if len (m_xml_keep_tags ) else "false" ).lower ()
373342 xml_keep_tags = xml_keep_tags_str == "true"
374343
375- pdf_infer_table_structure = (
376- m_pdf_infer_table_structure [0 ] if len (m_pdf_infer_table_structure ) else "false"
377- ).lower ()
378- if strategy == "hi_res" and pdf_infer_table_structure == "true" :
379- pdf_infer_table_structure = True
380- else :
381- pdf_infer_table_structure = False
382-
383344 skip_infer_table_types = (
384345 m_skip_infer_table_types [0 ] if len (m_skip_infer_table_types ) else ["pdf" , "jpg" , "png" ]
385346 )
386347
387- chunking_strategy = m_chunking_strategy [0 ].lower () if len (m_chunking_strategy ) else None
388- chunk_strategies = ["by_title" ]
389- if chunking_strategy and (chunking_strategy not in chunk_strategies ):
390- raise HTTPException (
391- status_code = 400 ,
392- detail = f"Invalid chunking strategy: { chunking_strategy } . Must be one of { chunk_strategies } " ,
393- )
394-
395348 multipage_sections_str = (
396349 m_multipage_sections [0 ] if len (m_multipage_sections ) else "true"
397350 ).lower ()
@@ -500,6 +453,11 @@ def pipeline_api(
500453 status_code = 400 ,
501454 detail = "Json schema does not match the Unstructured schema" ,
502455 )
456+ if "fast strategy is not available for image files" in e .args [0 ]:
457+ raise HTTPException (
458+ status_code = 400 ,
459+ detail = "The fast strategy is not available for image files" ,
460+ )
503461
504462 raise e
505463 except zipfile .BadZipFile :
@@ -534,6 +492,82 @@ def pipeline_api(
534492 return result
535493
536494
def _check_free_memory():
    """Refuse the request with a 503 when available memory is too low.

    The threshold is read, in megabytes, from the
    UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB environment variable and falls back
    to 2048 (2GB) when the variable is unset.

    Raises:
        HTTPException: status 503 when the available memory is at or below
            the threshold.
    """
    memory_stats = psutil.virtual_memory()
    threshold_mb = int(os.environ.get("UNSTRUCTURED_MEMORY_FREE_MINIMUM_MB", 2048))
    threshold_bytes = threshold_mb * 1024 * 1024

    # Enough headroom: let the request through.
    if memory_stats.available > threshold_bytes:
        return

    raise HTTPException(
        status_code=503, detail="Server is under heavy load. Please try again later."
    )
505+
506+
def _check_pdf(file):
    """Open *file* with PdfReader and return the resulting reader.

    Raises:
        HTTPException: status 400 when pypdf reports the file as encrypted
            (FileNotDecryptedError) or fails to read it as a PDF
            (PdfReadError).
    """
    try:
        reader = PdfReader(file)

        # Touching the metadata raises if the file is encrypted
        reader.metadata
    except pypdf.errors.FileNotDecryptedError:
        raise HTTPException(
            status_code=400,
            detail="File is encrypted. Please decrypt it with password.",
        )
    except pypdf.errors.PdfReadError:
        raise HTTPException(status_code=400, detail="File does not appear to be a valid PDF")

    return reader
522+
523+
524+ def _validate_strategy (m_strategy ):
525+ strategy = (m_strategy [0 ] if len (m_strategy ) else "auto" ).lower ()
526+ strategies = ["fast" , "hi_res" , "auto" , "ocr_only" ]
527+ if strategy not in strategies :
528+ raise HTTPException (
529+ status_code = 400 , detail = f"Invalid strategy: { strategy } . Must be one of { strategies } "
530+ )
531+ return strategy
532+
533+
534+ def _validate_hi_res_model_name (m_hi_res_model_name , show_coordinates ):
535+ hi_res_model_name = m_hi_res_model_name [0 ] if len (m_hi_res_model_name ) else None
536+
537+ # Make sure chipper aliases to the latest model
538+ if hi_res_model_name and hi_res_model_name == "chipper" :
539+ hi_res_model_name = "chipperv2"
540+
541+ if hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES and show_coordinates :
542+ raise HTTPException (
543+ status_code = 400 ,
544+ detail = f"coordinates aren't available when using the { hi_res_model_name } model type" ,
545+ )
546+ return hi_res_model_name
547+
548+
549+ def _validate_chunking_strategy (m_chunking_strategy ):
550+ chunking_strategy = m_chunking_strategy [0 ].lower () if len (m_chunking_strategy ) else None
551+ chunk_strategies = ["by_title" ]
552+ if chunking_strategy and (chunking_strategy not in chunk_strategies ):
553+ raise HTTPException (
554+ status_code = 400 ,
555+ detail = f"Invalid chunking strategy: { chunking_strategy } . Must be one of { chunk_strategies } " ,
556+ )
557+ return chunking_strategy
558+
559+
560+ def _set_pdf_infer_table_structure (m_pdf_infer_table_structure , strategy ):
561+ pdf_infer_table_structure = (
562+ m_pdf_infer_table_structure [0 ] if len (m_pdf_infer_table_structure ) else "false"
563+ ).lower ()
564+ if strategy == "hi_res" and pdf_infer_table_structure == "true" :
565+ pdf_infer_table_structure = True
566+ else :
567+ pdf_infer_table_structure = False
568+ return pdf_infer_table_structure
569+
570+
537571def get_validated_mimetype (file ):
538572 """
539573 Return a file's mimetype, either via the file.content_type or the mimetypes lib if that's too
0 commit comments