Skip to content

Commit 4845781

Browse files
committed
Update PyMuPDF
1 parent 6340ded commit 4845781

File tree

8 files changed

+2059
-174
lines changed

8 files changed

+2059
-174
lines changed

docs/docs/integrations/document_loaders/pymupdf.ipynb

Lines changed: 1099 additions & 41 deletions
Large diffs are not rendered by default.

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 609 additions & 53 deletions
Large diffs are not rendered by default.

libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 111 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
Any,
1313
BinaryIO,
1414
Iterator,
15+
Literal,
1516
Mapping,
1617
Optional,
1718
Sequence,
@@ -28,13 +29,15 @@
2829
from langchain_community.document_loaders.blob_loaders import Blob
2930
from langchain_community.document_loaders.dedoc import DedocBaseLoader
3031
from langchain_community.document_loaders.parsers.pdf import (
32+
CONVERT_IMAGE_TO_TEXT,
3133
AmazonTextractPDFParser,
3234
DocumentIntelligenceParser,
3335
PDFMinerParser,
3436
PDFPlumberParser,
3537
PyMuPDFParser,
3638
PyPDFium2Parser,
3739
PyPDFParser,
40+
_default_page_delimitor,
3841
)
3942
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
4043

@@ -96,7 +99,8 @@ def __init__(
9699
if "~" in self.file_path:
97100
self.file_path = os.path.expanduser(self.file_path)
98101

99-
# If the file is a web path or S3, download it to a temporary file, and use that
102+
# If the file is a web path or S3, download it to a temporary file,
103+
# and use that. It's better to use a BlobLoader.
100104
if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):
101105
self.temp_dir = tempfile.TemporaryDirectory()
102106
_, suffix = os.path.splitext(self.file_path)
@@ -412,51 +416,129 @@ def lazy_load(self) -> Iterator[Document]:
412416

413417

414418
class PyMuPDFLoader(BasePDFLoader):
415-
"""Load `PDF` files using `PyMuPDF`."""
419+
"""Load and parse a PDF file using 'PyMuPDF' library.
420+
421+
This class provides methods to load and parse PDF documents, supporting various
422+
configurations such as handling password-protected files, extracting tables,
423+
extracting images, and defining extraction mode. It integrates the `PyMuPDF`
424+
library for PDF processing and offers both synchronous and asynchronous document
425+
loading.
426+
427+
Examples:
428+
Setup:
429+
430+
.. code-block:: bash
431+
432+
pip install -U langchain-community pymupdf
433+
434+
Instantiate the loader:
435+
436+
.. code-block:: python
437+
438+
from langchain_community.document_loaders import PyMuPDFLoader
439+
440+
loader = PyMuPDFLoader(
441+
file_path = "./example_data/layout-parser-paper.pdf",
442+
# headers = None,
443+
# password = None,
444+
mode = "single",
445+
pages_delimitor = "\n\f",
446+
# extract_images = True,
447+
# images_to_text = convert_images_to_text_with_tesseract(),
448+
# extract_tables = "markdown",
449+
# extract_tables_settings = None,
450+
)
451+
452+
Lazy load documents:
453+
454+
.. code-block:: python
455+
456+
docs = []
457+
docs_lazy = loader.lazy_load()
458+
459+
for doc in docs_lazy:
460+
docs.append(doc)
461+
print(docs[0].page_content[:100])
462+
print(docs[0].metadata)
463+
464+
Load documents asynchronously:
465+
466+
.. code-block:: python
467+
468+
docs = await loader.aload()
469+
print(docs[0].page_content[:100])
470+
print(docs[0].metadata)
471+
"""
416472

417473
def __init__(
418474
self,
419475
file_path: Union[str, PurePath],
420476
*,
421-
headers: Optional[dict] = None,
477+
password: Optional[str] = None,
478+
mode: Literal["single", "page"] = "page",
479+
pages_delimitor: str = _default_page_delimitor,
422480
extract_images: bool = False,
481+
images_to_text: CONVERT_IMAGE_TO_TEXT = None,
482+
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
483+
headers: Optional[dict] = None,
484+
extract_tables_settings: Optional[dict[str, Any]] = None,
423485
**kwargs: Any,
424486
) -> None:
425-
"""Initialize with a file path."""
426-
try:
427-
import fitz # noqa:F401
428-
except ImportError:
429-
raise ImportError(
430-
"`PyMuPDF` package not found, please install it with "
431-
"`pip install pymupdf`"
432-
)
433-
super().__init__(file_path, headers=headers)
434-
self.extract_images = extract_images
435-
self.text_kwargs = kwargs
487+
"""Initialize with a file path.
436488
437-
def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:
438-
if kwargs:
439-
logger.warning(
440-
f"Received runtime arguments {kwargs}. Passing runtime args to `load`"
441-
f" is deprecated. Please pass arguments during initialization instead."
442-
)
489+
Args:
490+
file_path: The path to the PDF file to be loaded.
491+
headers: Optional headers to use for GET request to download a file from a
492+
web path.
493+
password: Optional password for opening encrypted PDFs.
494+
mode: The extraction mode, either "single" for the entire document or "page"
495+
for page-wise extraction.
496+
pages_delimitor: A string delimiter to separate pages in single-mode
497+
extraction.
498+
extract_images: Whether to extract images from the PDF.
499+
images_to_text: Optional function or callable to convert images to text
500+
during extraction.
501+
extract_tables: The format in which to extract tables, one of
501+
"csv", "markdown", or "html"; defaults to None.
503+
extract_tables_settings: Optional dictionary of settings for customizing
504+
table extraction.
505+
**kwargs: Additional keyword arguments for customizing text extraction
506+
behavior.
507+
508+
Returns:
509+
This method does not directly return data. Use the `load`, `lazy_load`, or
510+
`aload` methods to retrieve parsed documents with content and metadata.
443511
444-
text_kwargs = {**self.text_kwargs, **kwargs}
445-
parser = PyMuPDFParser(
446-
text_kwargs=text_kwargs, extract_images=self.extract_images
512+
Raises:
513+
ValueError: If the `mode` argument is not one of "single" or "page".
514+
"""
515+
if mode not in ["single", "page"]:
516+
raise ValueError("mode must be single or page")
517+
super().__init__(file_path, headers=headers)
518+
self.parser = PyMuPDFParser(
519+
password=password,
520+
mode=mode,
521+
pages_delimitor=pages_delimitor,
522+
text_kwargs=kwargs,
523+
extract_images=extract_images,
524+
images_to_text=images_to_text,
525+
extract_tables=extract_tables,
526+
extract_tables_settings=extract_tables_settings,
447527
)
528+
529+
def lazy_load(self) -> Iterator[Document]:
530+
"""
531+
Lazily load the PDF at the given path and yield the parsed documents.
531+
Images are inserted, where possible, between two paragraphs,
532+
so that a paragraph can continue onto the next page.
534+
"""
535+
parser = self.parser
448536
if self.web_path:
449537
blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined]
450538
else:
451539
blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
452540
yield from parser.lazy_parse(blob)
453541

454-
def load(self, **kwargs: Any) -> list[Document]:
455-
return list(self._lazy_load(**kwargs))
456-
457-
def lazy_load(self) -> Iterator[Document]:
458-
yield from self._lazy_load()
459-
460542

461543
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
462544
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21

0 commit comments

Comments
 (0)