Skip to content

Commit 21759e2

Browse files
committed
Prepare the integration of new versions of PDFLoader.
Add file_path with PurePath. Add CloudBlobLoader in __init__. Replace Dict/List with dict/list.
1 parent d9c51b7 commit 21759e2

File tree

2 files changed

+11
-15
lines changed
  • libs/community
    • langchain_community/document_loaders
    • tests/integration_tests/document_loaders

2 files changed

+11
-15
lines changed

libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 10 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,7 @@ class PyPDFLoader(BasePDFLoader):
225225

226226
def __init__(
227227
self,
228-
file_path: Union[str, PurePath],
228+
file_path: str,
229229
password: Optional[Union[str, bytes]] = None,
230230
headers: Optional[dict] = None,
231231
extract_images: bool = False,
@@ -264,7 +264,7 @@ class PyPDFium2Loader(BasePDFLoader):
264264

265265
def __init__(
266266
self,
267-
file_path: Union[str, PurePath],
267+
file_path: str,
268268
*,
269269
headers: Optional[dict] = None,
270270
extract_images: bool = False,
@@ -336,7 +336,7 @@ class PDFMinerLoader(BasePDFLoader):
336336

337337
def __init__(
338338
self,
339-
file_path: Union[str, PurePath],
339+
file_path: str,
340340
*,
341341
headers: Optional[dict] = None,
342342
extract_images: bool = False,
@@ -376,9 +376,7 @@ def lazy_load(
376376
class PDFMinerPDFasHTMLLoader(BasePDFLoader):
377377
"""Load `PDF` files as HTML content using `PDFMiner`."""
378378

379-
def __init__(
380-
self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None
381-
):
379+
def __init__(self, file_path: str, *, headers: Optional[dict] = None):
382380
"""Initialize with a file path."""
383381
try:
384382
from pdfminer.high_level import extract_text_to_fp # noqa:F401
@@ -406,7 +404,7 @@ def lazy_load(self) -> Iterator[Document]:
406404
output_type="html",
407405
)
408406
metadata = {
409-
"source": str(self.file_path) if self.web_path is None else self.web_path
407+
"source": self.file_path if self.web_path is None else self.web_path
410408
}
411409
yield Document(page_content=output_string.getvalue(), metadata=metadata)
412410

@@ -416,7 +414,7 @@ class PyMuPDFLoader(BasePDFLoader):
416414

417415
def __init__(
418416
self,
419-
file_path: Union[str, PurePath],
417+
file_path: str,
420418
*,
421419
headers: Optional[dict] = None,
422420
extract_images: bool = False,
@@ -613,7 +611,7 @@ class PDFPlumberLoader(BasePDFLoader):
613611

614612
def __init__(
615613
self,
616-
file_path: Union[str, PurePath],
614+
file_path: str,
617615
text_kwargs: Optional[Mapping[str, Any]] = None,
618616
dedupe: bool = False,
619617
headers: Optional[dict] = None,
@@ -892,7 +890,7 @@ def _make_config(self) -> dict:
892890
from dedoc.utils.langchain import make_manager_pdf_config
893891

894892
return make_manager_pdf_config(
895-
file_path=str(self.file_path),
893+
file_path=self.file_path,
896894
parsing_params=self.parsing_parameters,
897895
split=self.split,
898896
)
@@ -903,7 +901,7 @@ class DocumentIntelligenceLoader(BasePDFLoader):
903901

904902
def __init__(
905903
self,
906-
file_path: Union[str, PurePath],
904+
file_path: str,
907905
client: Any,
908906
model: str = "prebuilt-document",
909907
headers: Optional[dict] = None,
@@ -1010,7 +1008,7 @@ def lazy_load(self) -> Iterator[Document]:
10101008

10111009
# Directly call asyncio.run to execute zerox synchronously
10121010
zerox_output = asyncio.run(
1013-
zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs)
1011+
zerox(file_path=self.file_path, model=self.model, **self.zerox_kwargs)
10141012
)
10151013

10161014
# Convert zerox output to Document instances and yield them

libs/community/tests/integration_tests/document_loaders/test_pdf.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -224,9 +224,7 @@ def test_amazontextract_loader(
224224
@pytest.mark.skip(reason="Requires AWS credentials to run")
225225
def test_amazontextract_loader_failures() -> None:
226226
# 2-page PDF local file system
227-
two_page_pdf = (
228-
Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
229-
)
227+
two_page_pdf = Path(__file__).parent.parent / "examples/multi-page-forms-sample-2-page.pdf"
230228
loader = AmazonTextractPDFLoader(two_page_pdf)
231229
with pytest.raises(ValueError):
232230
loader.load()

0 commit comments

Comments
 (0)