Prepare the integration of new versions of PDFLoader.

pprados · pprados · commit 21759e23e486 · 2025-01-07T17:08:12.000+01:00
Allow file_path to be a PurePath as well as a str
Add CloudBlobLoader in __init__
Replace Dict/List with dict/list
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
@@ -225,7 +225,7 @@ class PyPDFLoader(BasePDFLoader):
 
     def __init__(
         self,
-        file_path: Union[str, PurePath],
+        file_path: str,
         password: Optional[Union[str, bytes]] = None,
         headers: Optional[dict] = None,
         extract_images: bool = False,
@@ -264,7 +264,7 @@ class PyPDFium2Loader(BasePDFLoader):
 
     def __init__(
         self,
-        file_path: Union[str, PurePath],
+        file_path: str,
         *,
         headers: Optional[dict] = None,
         extract_images: bool = False,
@@ -336,7 +336,7 @@ class PDFMinerLoader(BasePDFLoader):
 
     def __init__(
         self,
-        file_path: Union[str, PurePath],
+        file_path: str,
         *,
         headers: Optional[dict] = None,
         extract_images: bool = False,
@@ -376,9 +376,7 @@ def lazy_load(
 class PDFMinerPDFasHTMLLoader(BasePDFLoader):
     """Load `PDF` files as HTML content using `PDFMiner`."""
 
-    def __init__(
-        self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None
-    ):
+    def __init__(self, file_path: str, *, headers: Optional[dict] = None):
         """Initialize with a file path."""
         try:
             from pdfminer.high_level import extract_text_to_fp  # noqa:F401
@@ -406,7 +404,7 @@ def lazy_load(self) -> Iterator[Document]:
                 output_type="html",
             )
         metadata = {
-            "source": str(self.file_path) if self.web_path is None else self.web_path
+            "source": self.file_path if self.web_path is None else self.web_path
         }
         yield Document(page_content=output_string.getvalue(), metadata=metadata)
 
@@ -416,7 +414,7 @@ class PyMuPDFLoader(BasePDFLoader):
 
     def __init__(
         self,
-        file_path: Union[str, PurePath],
+        file_path: str,
         *,
         headers: Optional[dict] = None,
         extract_images: bool = False,
@@ -613,7 +611,7 @@ class PDFPlumberLoader(BasePDFLoader):
 
     def __init__(
         self,
-        file_path: Union[str, PurePath],
+        file_path: str,
         text_kwargs: Optional[Mapping[str, Any]] = None,
         dedupe: bool = False,
         headers: Optional[dict] = None,
@@ -892,7 +890,7 @@ def _make_config(self) -> dict:
         from dedoc.utils.langchain import make_manager_pdf_config
 
         return make_manager_pdf_config(
-            file_path=str(self.file_path),
+            file_path=self.file_path,
             parsing_params=self.parsing_parameters,
             split=self.split,
         )
@@ -903,7 +901,7 @@ class DocumentIntelligenceLoader(BasePDFLoader):
 
     def __init__(
         self,
-        file_path: Union[str, PurePath],
+        file_path: str,
         client: Any,
         model: str = "prebuilt-document",
         headers: Optional[dict] = None,
@@ -1010,7 +1008,7 @@ def lazy_load(self) -> Iterator[Document]:
 
         # Directly call asyncio.run to execute zerox synchronously
         zerox_output = asyncio.run(
-            zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs)
+            zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs)
         )
 
         # Convert zerox output to Document instances and yield them
diff --git a/libs/community/tests/integration_tests/document_loaders/test_pdf.py b/libs/community/tests/integration_tests/document_loaders/test_pdf.py
@@ -224,9 +224,7 @@ def test_amazontextract_loader(
 @pytest.mark.skip(reason="Requires AWS credentials to run")
 def test_amazontextract_loader_failures() -> None:
     # 2-page PDF local file system
-    two_page_pdf = (
-        Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
-    )
+    two_page_pdf = Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
     loader = AmazonTextractPDFLoader(two_page_pdf)
     with pytest.raises(ValueError):
         loader.load()