Skip to content

Commit 2921597

Browse files
authored
community[patch]: Refactoring PDF loaders: 01 prepare (#29062)
- **Refactoring PDF loaders step 1**: "community: Refactoring PDF loaders to standardize approaches"
- **Description:** Declare `CloudBlobLoader` in `__init__.py`. `file_path` is now typed as `Union[str, PurePath]` everywhere.
- **Twitter handle:** pprados

This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on preparing the update of all parsers. For more details, see [PR 28970](#28970). @eyurtsev, this is the start of a series of PRs.
1 parent a49448a commit 2921597

File tree

6 files changed

+90
-85
lines changed

6 files changed

+90
-85
lines changed

libs/community/langchain_community/document_loaders/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@
8787
from langchain_community.document_loaders.blob_loaders import (
8888
Blob,
8989
BlobLoader,
90+
CloudBlobLoader,
9091
FileSystemBlobLoader,
9192
YoutubeAudioLoader,
9293
)
@@ -574,6 +575,7 @@
574575
"CSVLoader": "langchain_community.document_loaders.csv_loader",
575576
"CassandraLoader": "langchain_community.document_loaders.cassandra",
576577
"ChatGPTLoader": "langchain_community.document_loaders.chatgpt",
578+
"CloudBlobLoader": "langchain_community.document_loaders.blob_loaders",
577579
"CoNLLULoader": "langchain_community.document_loaders.conllu",
578580
"CollegeConfidentialLoader": "langchain_community.document_loaders.college_confidential", # noqa: E501
579581
"ConcurrentLoader": "langchain_community.document_loaders.concurrent",
@@ -781,6 +783,7 @@ def __getattr__(name: str) -> Any:
781783
"CSVLoader",
782784
"CassandraLoader",
783785
"ChatGPTLoader",
786+
"CloudBlobLoader",
784787
"CoNLLULoader",
785788
"CollegeConfidentialLoader",
786789
"ConcurrentLoader",

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from typing import (
77
TYPE_CHECKING,
88
Any,
9-
Dict,
109
Iterable,
1110
Iterator,
1211
Mapping,
@@ -23,15 +22,13 @@
2322
from langchain_community.document_loaders.blob_loaders import Blob
2423

2524
if TYPE_CHECKING:
26-
import fitz.fitz
27-
import pdfminer.layout
28-
import pdfplumber.page
29-
import pypdf._page
30-
import pypdfium2._helpers.page
31-
from pypdf import PageObject
25+
import fitz
26+
import pdfminer
27+
import pdfplumber
28+
import pypdf
29+
import pypdfium2
3230
from textractor.data.text_linearization_config import TextLinearizationConfig
3331

34-
3532
_PDF_FILTER_WITH_LOSS = ["DCTDecode", "DCT", "JPXDecode"]
3633
_PDF_FILTER_WITHOUT_LOSS = [
3734
"LZWDecode",
@@ -90,7 +87,7 @@ def __init__(
9087
extract_images: bool = False,
9188
*,
9289
extraction_mode: str = "plain",
93-
extraction_kwargs: Optional[Dict[str, Any]] = None,
90+
extraction_kwargs: Optional[dict[str, Any]] = None,
9491
):
9592
self.password = password
9693
self.extract_images = extract_images
@@ -107,7 +104,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
107104
"`pip install pypdf`"
108105
)
109106

110-
def _extract_text_from_page(page: "PageObject") -> str:
107+
def _extract_text_from_page(page: pypdf.PageObject) -> str:
111108
"""
112109
Extract text from image given the version of pypdf.
113110
"""
@@ -126,12 +123,13 @@ def _extract_text_from_page(page: "PageObject") -> str:
126123
Document(
127124
page_content=_extract_text_from_page(page=page)
128125
+ self._extract_images_from_page(page),
129-
metadata={"source": blob.source, "page": page_number}, # type: ignore[attr-defined]
126+
metadata={"source": blob.source, "page": page_number},
127+
# type: ignore[attr-defined]
130128
)
131129
for page_number, page in enumerate(pdf_reader.pages)
132130
]
133131

134-
def _extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
132+
def _extract_images_from_page(self, page: pypdf.PageObject) -> str:
135133
"""Extract images from page and get the text with RapidOCR."""
136134
if not self.extract_images or "/XObject" not in page["/Resources"].keys(): # type: ignore[attr-defined]
137135
return ""
@@ -307,9 +305,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
307305
for page in doc
308306
]
309307

310-
def _get_page_content(
311-
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
312-
) -> str:
308+
def _get_page_content(self, doc: fitz.Document, page: fitz.Page, blob: Blob) -> str:
313309
"""
314310
Get the text of the page using PyMuPDF and RapidOCR and issue a warning
315311
if it is empty.
@@ -327,7 +323,7 @@ def _get_page_content(
327323
return content
328324

329325
def _extract_metadata(
330-
self, doc: fitz.fitz.Document, page: fitz.fitz.Page, blob: Blob
326+
self, doc: fitz.Document, page: fitz.Page, blob: Blob
331327
) -> dict:
332328
"""Extract metadata from the document and page."""
333329
return dict(
@@ -344,9 +340,7 @@ def _extract_metadata(
344340
},
345341
)
346342

347-
def _extract_images_from_page(
348-
self, doc: fitz.fitz.Document, page: fitz.fitz.Page
349-
) -> str:
343+
def _extract_images_from_page(self, doc: fitz.Document, page: fitz.Page) -> str:
350344
"""Extract images from page and get the text with RapidOCR."""
351345
if not self.extract_images:
352346
return ""
@@ -558,7 +552,7 @@ def __init__(
558552
textract_features: Optional[Sequence[int]] = None,
559553
client: Optional[Any] = None,
560554
*,
561-
linearization_config: Optional["TextLinearizationConfig"] = None,
555+
linearization_config: Optional[TextLinearizationConfig] = None,
562556
) -> None:
563557
"""Initializes the parser.
564558

0 commit comments

Comments
 (0)