@@ -336,7 +336,7 @@ class PDFMinerLoader(BasePDFLoader):
336336
337337 def __init__ (
338338 self ,
339- file_path : str ,
339+ file_path : Union [ str , PurePath ] ,
340340 * ,
341341 headers : Optional [dict ] = None ,
342342 extract_images : bool = False ,
@@ -376,7 +376,10 @@ def lazy_load(
376376class PDFMinerPDFasHTMLLoader (BasePDFLoader ):
377377 """Load `PDF` files as HTML content using `PDFMiner`."""
378378
379- def __init__ (self , file_path : str , * , headers : Optional [dict ] = None ):
379+ def __init__ (self ,
380+ file_path : Union [str , PurePath ],
381+ * ,
382+ headers : Optional [dict ] = None ):
380383 """Initialize with a file path."""
381384 try :
382385 from pdfminer .high_level import extract_text_to_fp # noqa:F401
@@ -414,7 +417,7 @@ class PyMuPDFLoader(BasePDFLoader):
414417
415418 def __init__ (
416419 self ,
417- file_path : str ,
420+ file_path : Union [ str , PurePath ] ,
418421 * ,
419422 headers : Optional [dict ] = None ,
420423 extract_images : bool = False ,
@@ -611,7 +614,7 @@ class PDFPlumberLoader(BasePDFLoader):
611614
612615 def __init__ (
613616 self ,
614- file_path : str ,
617+ file_path : Union [ str , PurePath ] ,
615618 text_kwargs : Optional [Mapping [str , Any ]] = None ,
616619 dedupe : bool = False ,
617620 headers : Optional [dict ] = None ,
@@ -890,7 +893,7 @@ def _make_config(self) -> dict:
890893 from dedoc .utils .langchain import make_manager_pdf_config
891894
892895 return make_manager_pdf_config (
893- file_path = self .file_path ,
896+ file_path = str ( self .file_path ) ,
894897 parsing_params = self .parsing_parameters ,
895898 split = self .split ,
896899 )
@@ -901,7 +904,7 @@ class DocumentIntelligenceLoader(BasePDFLoader):
901904
902905 def __init__ (
903906 self ,
904- file_path : str ,
907+ file_path : Union [ str , PurePath ] ,
905908 client : Any ,
906909 model : str = "prebuilt-document" ,
907910 headers : Optional [dict ] = None ,
@@ -1008,7 +1011,7 @@ def lazy_load(self) -> Iterator[Document]:
10081011
10091012 # Directly call asyncio.run to execute zerox synchronously
10101013 zerox_output = asyncio .run (
1011- zerox (file_path = self .file_path , model = self .model , ** self .zerox_kwargs )
1014+ zerox (file_path = str ( self .file_path ) , model = self .model , ** self .zerox_kwargs )
10121015 )
10131016
10141017 # Convert zerox output to Document instances and yield them
0 commit comments