Skip to content

Commit acf4358

Browse files
committed
Fix deprecated load() with kwargs
1 parent 9b45bd8 commit acf4358

File tree

3 files changed

+38
-5
lines changed
  • libs/community

3 files changed

+38
-5
lines changed

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,15 @@ def __init__(
531531
self.extract_tables_settings = extract_tables_settings
532532

533533
def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type]
534+
return self._lazy_parse(
535+
blob,
536+
)
537+
538+
def _lazy_parse(
539+
self,
540+
blob: Blob,
541+
text_kwargs: Optional[dict[str, Any]] = None, # deprecated
542+
) -> Iterator[Document]: # type: ignore[valid-type]
534543
"""Lazily parse the blob.
535544
Insert image, if possible, between two paragraphs.
536545
In this way, a paragraph can be continued on the next page.
@@ -547,6 +556,8 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
547556
try:
548557
import pymupdf
549558

559+
if not text_kwargs:
560+
text_kwargs = {}
550561
if not self.extract_tables_settings:
551562
from pymupdf.table import (
552563
DEFAULT_JOIN_TOLERANCE,
@@ -597,7 +608,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
597608
doc_metadata = self._extract_metadata(doc, blob)
598609
full_content = []
599610
for page in doc:
600-
all_text = self._get_page_content(doc, page, blob).strip()
611+
all_text = self._get_page_content(doc, page, text_kwargs).strip()
601612
if self.mode == "page":
602613
yield Document(
603614
page_content=all_text,
@@ -615,7 +626,10 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
615626
)
616627

617628
def _get_page_content(
618-
self, doc: pymupdf.Document, page: pymupdf.Page, blob: Blob
629+
self,
630+
doc: pymupdf.Document,
631+
page: pymupdf.Page,
632+
text_kwargs: dict[str, Any],
619633
) -> str:
620634
"""Get the text of the page using PyMuPDF and RapidOCR and issue a warning
621635
if it is empty.
@@ -628,7 +642,7 @@ def _get_page_content(
628642
Returns:
629643
str: The text content of the page.
630644
"""
631-
text_from_page = page.get_text(**self.text_kwargs)
645+
text_from_page = page.get_text(**{**self.text_kwargs, **text_kwargs})
632646
images_from_page = self._extract_images_from_page(doc, page)
633647
tables_from_page = self._extract_tables_from_page(page)
634648
extras = []

libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -544,17 +544,28 @@ def __init__(
544544
extract_tables_settings=extract_tables_settings,
545545
)
546546

547-
def lazy_load(self) -> Iterator[Document]:
547+
def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
548548
"""Lazy load given path as pages or single document (see `mode`).
549549
Insert image, if possible, between two paragraphs.
550550
In this way, a paragraph can be continued on the next page.
551551
"""
552+
if kwargs:
553+
logger.warning(
554+
f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
555+
f" is deprecated. Please pass arguments during initialization instead."
556+
)
552557
parser = self.parser
553558
if self.web_path:
554559
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined]
555560
else:
556561
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
557-
yield from parser.lazy_parse(blob)
562+
yield from parser._lazy_parse(blob, text_kwargs=kwargs)
563+
564+
def load(self, **kwargs: Any) -> list[Document]:
565+
return list(self._lazy_load(**kwargs))
566+
567+
def lazy_load(self) -> Iterator[Document]:
568+
yield from self._lazy_load()
558569

559570

560571
# MathpixPDFLoader implementation taken largely from Daniel Gross's:

libs/community/tests/integration_tests/document_loaders/test_pdf.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,3 +237,11 @@ def test_standard_parameters(
237237
assert loader.web_path == web_path
238238
assert loader.file_path != web_path
239239
assert len(docs) == 1
240+
241+
242+
def test_pymupdf_deprecated_kwards() -> None:
243+
from langchain_community.document_loaders import PyMuPDFLoader
244+
245+
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
246+
loader = PyMuPDFLoader(file_path=file_path)
247+
loader.load(sort=True)

0 commit comments

Comments
 (0)