-
Notifications
You must be signed in to change notification settings - Fork 19.6k
community[minor]: Refactoring PyMuPDF parser, loader and add image blob parsers #29063
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
21759e2
4607354
668dc9c
7a5b5c5
6340ded
4845781
3beda82
743a83e
b623750
20f5a41
91234f0
80ee3f7
66f97cf
0e6c904
9b45bd8
acf4358
d7d3021
4762fab
6121005
5910f99
7fc01f3
0f654a1
e4f36ed
4a62529
1c78325
1227dbb
90085e4
14264e9
feacf69
c074729
ee4784d
5d4a256
3d15d39
0be6c88
d104ee7
023ba11
23a73a9
a4587f0
d332958
4b37b34
2281d05
0da73f1
d012d60
882c90d
74d3617
318f304
5ee7b9c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| Any, | ||
| BinaryIO, | ||
| Iterator, | ||
| Literal, | ||
| Mapping, | ||
| Optional, | ||
| Sequence, | ||
|
|
@@ -28,13 +29,15 @@ | |
| from langchain_community.document_loaders.blob_loaders import Blob | ||
| from langchain_community.document_loaders.dedoc import DedocBaseLoader | ||
| from langchain_community.document_loaders.parsers.pdf import ( | ||
| CONVERT_IMAGE_TO_TEXT, | ||
| AmazonTextractPDFParser, | ||
| DocumentIntelligenceParser, | ||
| PDFMinerParser, | ||
| PDFPlumberParser, | ||
| PyMuPDFParser, | ||
| PyPDFium2Parser, | ||
| PyPDFParser, | ||
| _default_page_delimitor, | ||
| ) | ||
| from langchain_community.document_loaders.unstructured import UnstructuredFileLoader | ||
|
|
||
|
|
@@ -96,7 +99,8 @@ def __init__( | |
| if "~" in self.file_path: | ||
| self.file_path = os.path.expanduser(self.file_path) | ||
|
|
||
| # If the file is a web path or S3, download it to a temporary file, and use that | ||
| # If the file is a web path or S3, download it to a temporary file, | ||
| # and use that. It's better to use a BlobLoader. | ||
| if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path): | ||
| self.temp_dir = tempfile.TemporaryDirectory() | ||
| _, suffix = os.path.splitext(self.file_path) | ||
|
|
@@ -412,51 +416,129 @@ def lazy_load(self) -> Iterator[Document]: | |
|
|
||
|
|
||
| class PyMuPDFLoader(BasePDFLoader): | ||
| """Load `PDF` files using `PyMuPDF`.""" | ||
| """Load and parse a PDF file using 'PyMuPDF' library. | ||
|
|
||
| This class provides methods to load and parse PDF documents, supporting various | ||
| configurations such as handling password-protected files, extracting tables, | ||
| extracting images, and defining extraction mode. It integrates the `PyMuPDF` | ||
| library for PDF processing and offers both synchronous and asynchronous document | ||
| loading. | ||
|
|
||
| Examples: | ||
| Setup: | ||
|
|
||
| .. code-block:: bash | ||
|
|
||
| pip install -U langchain-community pymupdf | ||
|
|
||
| Instantiate the loader: | ||
|
|
||
| .. code-block:: python | ||
|
|
||
| from langchain_community.document_loaders import PyMuPDFLoader | ||
|
|
||
| loader = PyMuPDFLoader( | ||
| file_path = "./example_data/layout-parser-paper.pdf", | ||
| # headers = None, | ||
| # password = None, | ||
| mode = "single", | ||
| pages_delimitor = "\n\f", | ||
| # extract_images = True, | ||
| # images_to_text = convert_images_to_text_with_tesseract(), | ||
| # extract_tables = "markdown", | ||
| # extract_tables_settings = None, | ||
| ) | ||
|
|
||
| Lazy load documents: | ||
|
|
||
| .. code-block:: python | ||
|
|
||
| docs = [] | ||
| docs_lazy = loader.lazy_load() | ||
|
|
||
| for doc in docs_lazy: | ||
| docs.append(doc) | ||
| print(docs[0].page_content[:100]) | ||
| print(docs[0].metadata) | ||
|
|
||
| Load documents asynchronously: | ||
|
|
||
| .. code-block:: python | ||
|
|
||
| docs = await loader.aload() | ||
| print(docs[0].page_content[:100]) | ||
| print(docs[0].metadata) | ||
| """ | ||
|
|
||
| def __init__( | ||
| self, | ||
| file_path: Union[str, PurePath], | ||
| *, | ||
| headers: Optional[dict] = None, | ||
| password: Optional[str] = None, | ||
| mode: Literal["single", "page"] = "page", | ||
| pages_delimitor: str = _default_page_delimitor, | ||
| extract_images: bool = False, | ||
| images_to_text: CONVERT_IMAGE_TO_TEXT = None, | ||
| extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, | ||
| headers: Optional[dict] = None, | ||
| extract_tables_settings: Optional[dict[str, Any]] = None, | ||
| **kwargs: Any, | ||
| ) -> None: | ||
| """Initialize with a file path.""" | ||
| try: | ||
| import fitz # noqa:F401 | ||
| except ImportError: | ||
| raise ImportError( | ||
| "`PyMuPDF` package not found, please install it with " | ||
| "`pip install pymupdf`" | ||
| ) | ||
| super().__init__(file_path, headers=headers) | ||
| self.extract_images = extract_images | ||
| self.text_kwargs = kwargs | ||
| """Initialize with a file path. | ||
|
|
||
| def _lazy_load(self, **kwargs: Any) -> Iterator[Document]: | ||
| if kwargs: | ||
| logger.warning( | ||
| f"Received runtime arguments {kwargs}. Passing runtime args to `load`" | ||
| f" is deprecated. Please pass arguments during initialization instead." | ||
| ) | ||
| Args: | ||
| file_path: The path to the PDF file to be loaded. | ||
| headers: Optional headers to use for GET request to download a file from a | ||
| web path. | ||
| password: Optional password for opening encrypted PDFs. | ||
| mode: The extraction mode, either "single" for the entire document or "page" | ||
| for page-wise extraction. | ||
| pages_delimitor: A string delimiter to separate pages in single-mode | ||
| extraction. | ||
| extract_images: Whether to extract images from the PDF. | ||
| images_to_text: Optional function or callable to convert images to text | ||
| during extraction. | ||
| extract_tables: Whether to extract tables in a specific format, such as | ||
| "csv", "markdown", or "html". | ||
| extract_tables_settings: Optional dictionary of settings for customizing | ||
| table extraction. | ||
| **kwargs: Additional keyword arguments for customizing text extraction | ||
| behavior. | ||
|
|
||
| Returns: | ||
| This method does not directly return data. Use the `load`, `lazy_load`, or | ||
| `aload` methods to retrieve parsed documents with content and metadata. | ||
|
|
||
| text_kwargs = {**self.text_kwargs, **kwargs} | ||
| parser = PyMuPDFParser( | ||
| text_kwargs=text_kwargs, extract_images=self.extract_images | ||
| Raises: | ||
| ValueError: If the `mode` argument is not one of "single" or "page". | ||
| """ | ||
| if mode not in ["single", "page"]: | ||
| raise ValueError("mode must be single or page") | ||
| super().__init__(file_path, headers=headers) | ||
| self.parser = PyMuPDFParser( | ||
| password=password, | ||
| mode=mode, | ||
| pages_delimitor=pages_delimitor, | ||
| text_kwargs=kwargs, | ||
| extract_images=extract_images, | ||
| images_to_text=images_to_text, | ||
| extract_tables=extract_tables, | ||
| extract_tables_settings=extract_tables_settings, | ||
| ) | ||
|
|
||
| def lazy_load(self) -> Iterator[Document]: | ||
| """ | ||
| Lazy load given path as pages. | ||
|
||
| Insert image, if possible, between two paragraphs. | ||
| In this way, a paragraph can be continued on the next page. | ||
| """ | ||
| parser = self.parser | ||
pprados marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if self.web_path: | ||
| blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined] | ||
| else: | ||
| blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] | ||
| yield from parser.lazy_parse(blob) | ||
|
|
||
| def load(self, **kwargs: Any) -> list[Document]: | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would love to make this change, but it's a breaking change due to
|
||
| return list(self._lazy_load(**kwargs)) | ||
|
|
||
| def lazy_load(self) -> Iterator[Document]: | ||
| yield from self._lazy_load() | ||
|
|
||
|
|
||
| # MathpixPDFLoader implementation taken largely from Daniel Gross's: | ||
| # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21 | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.