|
12 | 12 | Any, |
13 | 13 | BinaryIO, |
14 | 14 | Iterator, |
| 15 | + Literal, |
15 | 16 | Mapping, |
16 | 17 | Optional, |
17 | 18 | Sequence, |
|
28 | 29 | from langchain_community.document_loaders.blob_loaders import Blob |
29 | 30 | from langchain_community.document_loaders.dedoc import DedocBaseLoader |
30 | 31 | from langchain_community.document_loaders.parsers.pdf import ( |
| 32 | + CONVERT_IMAGE_TO_TEXT, |
31 | 33 | AmazonTextractPDFParser, |
32 | 34 | DocumentIntelligenceParser, |
33 | 35 | PDFMinerParser, |
34 | 36 | PDFPlumberParser, |
35 | 37 | PyMuPDFParser, |
36 | 38 | PyPDFium2Parser, |
37 | 39 | PyPDFParser, |
| 40 | + _default_page_delimitor, |
38 | 41 | ) |
39 | 42 | from langchain_community.document_loaders.unstructured import UnstructuredFileLoader |
40 | 43 |
|
@@ -96,7 +99,8 @@ def __init__( |
96 | 99 | if "~" in self.file_path: |
97 | 100 | self.file_path = os.path.expanduser(self.file_path) |
98 | 101 |
|
99 | | - # If the file is a web path or S3, download it to a temporary file, and use that |
| 102 | + # If the file is a web path or S3, download it to a temporary file, |
| 103 | + # and use that. It's better to use a BlobLoader. |
100 | 104 | if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path): |
101 | 105 | self.temp_dir = tempfile.TemporaryDirectory() |
102 | 106 | _, suffix = os.path.splitext(self.file_path) |
@@ -412,51 +416,129 @@ def lazy_load(self) -> Iterator[Document]: |
412 | 416 |
|
413 | 417 |
|
414 | 418 | class PyMuPDFLoader(BasePDFLoader): |
415 | | - """Load `PDF` files using `PyMuPDF`.""" |
| 419 | + """Load and parse a PDF file using the `PyMuPDF` library. |
| 420 | +
|
| 421 | + This class provides methods to load and parse PDF documents, supporting various |
| 422 | + configurations such as handling password-protected files, extracting tables, |
| 423 | + extracting images, and defining extraction mode. It integrates the `PyMuPDF` |
| 424 | + library for PDF processing and offers both synchronous and asynchronous document |
| 425 | + loading. |
| 426 | +
|
| 427 | + Examples: |
| 428 | + Setup: |
| 429 | +
|
| 430 | + .. code-block:: bash |
| 431 | +
|
| 432 | + pip install -U langchain-community pymupdf |
| 433 | +
|
| 434 | + Instantiate the loader: |
| 435 | +
|
| 436 | + .. code-block:: python |
| 437 | +
|
| 438 | + from langchain_community.document_loaders import PyMuPDFLoader |
| 439 | +
|
| 440 | + loader = PyMuPDFLoader( |
| 441 | + file_path = "./example_data/layout-parser-paper.pdf", |
| 442 | + # headers = None, |
| 443 | + # password = None, |
| 444 | + mode = "single", |
| 445 | + pages_delimitor = "\n\f", |
| 446 | + # extract_images = True, |
| 447 | + # images_to_text = convert_images_to_text_with_tesseract(), |
| 448 | + # extract_tables = "markdown", |
| 449 | + # extract_tables_settings = None, |
| 450 | + ) |
| 451 | +
|
| 452 | + Lazy load documents: |
| 453 | +
|
| 454 | + .. code-block:: python |
| 455 | +
|
| 456 | + docs = [] |
| 457 | + docs_lazy = loader.lazy_load() |
| 458 | +
|
| 459 | + for doc in docs_lazy: |
| 460 | + docs.append(doc) |
| 461 | + print(docs[0].page_content[:100]) |
| 462 | + print(docs[0].metadata) |
| 463 | +
|
| 464 | + Load documents asynchronously: |
| 465 | +
|
| 466 | + .. code-block:: python |
| 467 | +
|
| 468 | + docs = await loader.aload() |
| 469 | + print(docs[0].page_content[:100]) |
| 470 | + print(docs[0].metadata) |
| 471 | + """ |
416 | 472 |
|
417 | 473 | def __init__( |
418 | 474 | self, |
419 | 475 | file_path: Union[str, PurePath], |
420 | 476 | *, |
421 | | - headers: Optional[dict] = None, |
| 477 | + password: Optional[str] = None, |
| 478 | + mode: Literal["single", "page"] = "page", |
| 479 | + pages_delimitor: str = _default_page_delimitor, |
422 | 480 | extract_images: bool = False, |
| 481 | + images_to_text: CONVERT_IMAGE_TO_TEXT = None, |
| 482 | + extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, |
| 483 | + headers: Optional[dict] = None, |
| 484 | + extract_tables_settings: Optional[dict[str, Any]] = None, |
423 | 485 | **kwargs: Any, |
424 | 486 | ) -> None: |
425 | | - """Initialize with a file path.""" |
426 | | - try: |
427 | | - import fitz # noqa:F401 |
428 | | - except ImportError: |
429 | | - raise ImportError( |
430 | | - "`PyMuPDF` package not found, please install it with " |
431 | | - "`pip install pymupdf`" |
432 | | - ) |
433 | | - super().__init__(file_path, headers=headers) |
434 | | - self.extract_images = extract_images |
435 | | - self.text_kwargs = kwargs |
| 487 | + """Initialize with a file path. |
436 | 488 |
|
437 | | - def _lazy_load(self, **kwargs: Any) -> Iterator[Document]: |
438 | | - if kwargs: |
439 | | - logger.warning( |
440 | | - f"Received runtime arguments {kwargs}. Passing runtime args to `load`" |
441 | | - f" is deprecated. Please pass arguments during initialization instead." |
442 | | - ) |
| 489 | + Args: |
| 490 | + file_path: The path to the PDF file to be loaded. |
| 491 | + headers: Optional headers to use for GET request to download a file from a |
| 492 | + web path. |
| 493 | + password: Optional password for opening encrypted PDFs. |
| 494 | + mode: The extraction mode, either "single" for the entire document or "page" |
| 495 | + for page-wise extraction. |
| 496 | + pages_delimitor: A string delimiter to separate pages in single-mode |
| 497 | + extraction. |
| 498 | + extract_images: Whether to extract images from the PDF. |
| 499 | + images_to_text: Optional function or callable to convert images to text |
| 500 | + during extraction. |
| 501 | + extract_tables: Whether to extract tables in a specific format, such as |
| 502 | + "csv", "markdown", or "html". |
| 503 | + extract_tables_settings: Optional dictionary of settings for customizing |
| 504 | + table extraction. |
| 505 | + **kwargs: Additional keyword arguments for customizing text extraction |
| 506 | + behavior. |
| 507 | +
|
| 508 | + Returns: |
| 509 | + This method does not directly return data. Use the `load`, `lazy_load`, or |
| 510 | + `aload` methods to retrieve parsed documents with content and metadata. |
443 | 511 |
|
444 | | - text_kwargs = {**self.text_kwargs, **kwargs} |
445 | | - parser = PyMuPDFParser( |
446 | | - text_kwargs=text_kwargs, extract_images=self.extract_images |
| 512 | + Raises: |
| 513 | + ValueError: If the `mode` argument is not one of "single" or "page". |
| 514 | + """ |
| 515 | + if mode not in ["single", "page"]: |
| 516 | + raise ValueError("mode must be single or page") |
| 517 | + super().__init__(file_path, headers=headers) |
| 518 | + self.parser = PyMuPDFParser( |
| 519 | + password=password, |
| 520 | + mode=mode, |
| 521 | + pages_delimitor=pages_delimitor, |
| 522 | + text_kwargs=kwargs, |
| 523 | + extract_images=extract_images, |
| 524 | + images_to_text=images_to_text, |
| 525 | + extract_tables=extract_tables, |
| 526 | + extract_tables_settings=extract_tables_settings, |
447 | 527 | ) |
| 528 | + |
| 529 | + def lazy_load(self) -> Iterator[Document]: |
| 530 | + """ |
| 531 | + Lazy load given path as pages. |
| 532 | + Insert image, if possible, between two paragraphs. |
| 533 | + In this way, a paragraph can be continued on the next page. |
| 534 | + """ |
| 535 | + parser = self.parser |
448 | 536 | if self.web_path: |
449 | 537 | blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined] |
450 | 538 | else: |
451 | 539 | blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] |
452 | 540 | yield from parser.lazy_parse(blob) |
453 | 541 |
|
454 | | - def load(self, **kwargs: Any) -> list[Document]: |
455 | | - return list(self._lazy_load(**kwargs)) |
456 | | - |
457 | | - def lazy_load(self) -> Iterator[Document]: |
458 | | - yield from self._lazy_load() |
459 | | - |
460 | 542 |
|
461 | 543 | # MathpixPDFLoader implementation taken largely from Daniel Gross's: |
462 | 544 | # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21 |
|
0 commit comments