Skip to content

Commit 1c78325

Browse files
committed
Bugfix
1 parent 4a62529 commit 1c78325

File tree

3 files changed: +19 -7 lines changed

3 files changed: +19 -7 lines changed

libs/community/extended_testing_deps.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,7 @@ oracle-ads>=2.9.1,<3
6060
oracledb>=2.2.0,<3
6161
pandas>=2.0.1,<3
6262
pdfminer-six>=20221105,<20240706
63+
pdfplumber>=0.11
6364
pgvector>=0.1.6,<0.2
6465
playwright>=1.48.0,<2
6566
praw>=7.7.1,<8

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -50,13 +50,15 @@ def __init__(
5050
self.format = format
5151

5252
@abstractmethod
53-
def _analyze_image(self, img: "Image") -> str:
53+
def _analyze_image(self, img: "Image", format: str) -> str:
5454
"""
5555
Abstract method to analyze an image and extract textual content.
5656
5757
Args:
5858
img (Image):
5959
The image to be analyzed.
60+
format (str):
61+
The format to use if it's possible
6062
6163
Returns:
6264
str:
@@ -84,7 +86,12 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
8486
img = Img.fromarray(numpy.load(buf))
8587
else:
8688
img = Img.open(buf)
87-
content = self._analyze_image(img)
89+
format = (
90+
"text"
91+
if self.format in ("markdown-link", "html-img")
92+
else self.format
93+
)
94+
content = self._analyze_image(img, format)
8895
if content:
8996
source = blob.source or "#"
9097
if self.format == "markdown-link":
@@ -143,13 +150,15 @@ def __init__(
143150
super().__init__(format=format)
144151
self.ocr = None
145152

146-
def _analyze_image(self, img: "Image") -> str:
153+
def _analyze_image(self, img: "Image", format: str) -> str:
147154
"""
148155
Analyzes an image and extracts text using RapidOCR.
149156
150157
Args:
151158
img (Image):
152159
The image to be analyzed.
160+
format (str):
161+
The format to use if it's possible
153162
154163
Returns:
155164
str:
@@ -211,13 +220,15 @@ def __init__(
211220
super().__init__(format=format)
212221
self.langs = list(langs)
213222

214-
def _analyze_image(self, img: "Image") -> str:
223+
def _analyze_image(self, img: "Image", format: str) -> str:
215224
"""
216225
Analyzes an image and extracts text using Tesseract OCR.
217226
218227
Args:
219228
img (Image):
220229
The image to be analyzed.
230+
format (str):
231+
The format to use if it's possible
221232
222233
Returns:
223234
str: The extracted text content.
@@ -287,7 +298,7 @@ def __init__(
287298
self.model = model
288299
self.prompt = prompt
289300

290-
def _analyze_image(self, img: "Image") -> str:
301+
def _analyze_image(self, img: "Image", format: str) -> str:
291302
"""
292303
Analyzes an image using the provided language model.
293304
@@ -308,7 +319,7 @@ def _analyze_image(self, img: "Image") -> str:
308319
content=[
309320
{
310321
"type": "text",
311-
"text": self.prompt.format(format=self.format),
322+
"text": self.prompt.format(format=format),
312323
},
313324
{
314325
"type": "image_url",

libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -199,7 +199,7 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
199199
parser.password = old_password
200200

201201
class EmptyImageBlobParser(BaseImageBlobParser):
202-
def _analyze_image(self, img: Image) -> str:
202+
def _analyze_image(self, img: Image, format: str) -> str:
203203
return "![image](#)"
204204

205205
parser_class = getattr(pdf_parsers, parser_factory)

0 commit comments

Comments
 (0)