Skip to content

Commit 0be6c88

Browse files
committed
Change the strategy for images_inner_format.
1 parent 3d15d39 commit 0be6c88

File tree

4 files changed

+68
-146
lines changed

4 files changed

+68
-146
lines changed

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 13 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import base64
2-
import html
32
import io
43
import logging
54
from abc import abstractmethod
@@ -10,7 +9,6 @@
109
from langchain_core.documents import Document
1110
from langchain_core.language_models import BaseChatModel
1211
from langchain_core.messages import HumanMessage
13-
from langchain_core.prompts import BasePromptTemplate, PromptTemplate
1412

1513
from langchain_community.document_loaders.base import BaseBlobParser
1614
from langchain_community.document_loaders.blob_loaders import Blob
@@ -44,12 +42,11 @@ def __init__(
4442
self.format = format
4543

4644
@abstractmethod
47-
def _analyze_image(self, img: "Image", format: str) -> str:
45+
def _analyze_image(self, img: "Image") -> str:
4846
"""Abstract method to analyze an image and extract textual content.
4947
5048
Args:
5149
img: The image to be analyzed.
52-
format: The format to use if it's possible
5350
5451
Returns:
5552
The extracted text content.
@@ -73,22 +70,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
7370
img = Img.fromarray(numpy.load(buf))
7471
else:
7572
img = Img.open(buf)
76-
format = (
77-
"text"
78-
if self.format in ("markdown-img", "html-img")
79-
else self.format
80-
)
81-
content = self._analyze_image(img, format)
82-
if content:
83-
source = blob.source or "#"
84-
if self.format == "markdown-img":
85-
content = content.replace("]", r"\\]")
86-
content = f"![{content}]({source})"
87-
elif self.format == "html-img":
88-
content = (
89-
f'<img alt="{html.escape(content, quote=True)} '
90-
f'src="{source}" />'
91-
)
73+
content = self._analyze_image(img)
9274
logger.debug("Image text: %s", content.replace("\n", "\\n"))
9375
yield Document(
9476
page_content=content,
@@ -107,44 +89,24 @@ class RapidOCRBlobParser(BaseImageBlobParser):
10789
Attributes:
10890
ocr:
10991
The RapidOCR instance for performing OCR.
110-
format (Literal["text", "markdown-img", "html-img"]):
111-
The format for the parsed output.
112-
- "text" = return the content as is
113-
- "markdown-img" = wrap the content into an image markdown link, w/ link
114-
pointing to (`![body)(#)`]
115-
- "html-img" = wrap the content as the `alt` text of an tag and link to
116-
(`<img alt="{body}" src="#"/>`)
11792
"""
11893

11994
def __init__(
12095
self,
121-
*,
122-
format: Literal["text", "markdown-img", "html-img"] = "text",
123-
):
96+
) -> None:
12497
"""
12598
Initializes the RapidOCRBlobParser.
126-
127-
Args:
128-
format (Literal["text", "markdown-img", "html-img"]):
129-
The format for the parsed output.
130-
- "text" = return the content as is
131-
- "markdown-img" = wrap the content into an image markdown link, w/ link
132-
pointing to (`![body)(#)`]
133-
- "html-img" = wrap the content as the `alt` text of an tag and link to
134-
(`<img alt="{body}" src="#"/>`)
13599
"""
136-
super().__init__(format=format)
100+
super().__init__()
137101
self.ocr = None
138102

139-
def _analyze_image(self, img: "Image", format: str) -> str:
103+
def _analyze_image(self, img: "Image") -> str:
140104
"""
141105
Analyzes an image and extracts text using RapidOCR.
142106
143107
Args:
144108
img (Image):
145109
The image to be analyzed.
146-
format (str):
147-
The format to use if it's possible
148110
149111
Returns:
150112
str:
@@ -168,48 +130,27 @@ def _analyze_image(self, img: "Image", format: str) -> str:
168130

169131

170132
class TesseractBlobParser(BaseImageBlobParser):
171-
"""Parse for extracting text from images using the Tesseract OCR library.
172-
173-
Attributes:
174-
format (Literal["text", "markdown-img", "html-img"]):
175-
The format for the parsed output.
176-
- "text" = return the content as is
177-
- "markdown-img" = wrap the content into an image markdown link, w/ link
178-
pointing to (`![body)(#)`]
179-
- "html-img" = wrap the content as the `alt` text of an tag and link to
180-
(`<img alt="{body}" src="#"/>`)
181-
langs (list[str]):
182-
The languages to use for OCR.
183-
"""
133+
"""Parser for extracting text from images using the Tesseract OCR library."""
184134

185135
def __init__(
186136
self,
187137
*,
188-
format: Literal["text", "markdown-img", "html-img"] = "text",
189138
langs: Iterable[str] = ("eng",),
190139
):
191140
"""Initialize the TesseractBlobParser.
192141
193142
Args:
194-
format (Literal["text", "markdown-img", "html-img"]):
195-
The format for the parsed output.
196-
- "text" = return the content as is
197-
- "markdown-img" = wrap the content into an image markdown link, w/ link
198-
pointing to (`![body)(#)`]
199-
- "html-img" = wrap the content as the `alt` text of an tag and link to
200-
(`<img alt="{body}" src="#"/>`)
201143
langs (list[str]):
202144
The languages to use for OCR.
203145
"""
204-
super().__init__(format=format)
146+
super().__init__()
205147
self.langs = list(langs)
206148

207-
def _analyze_image(self, img: "Image", format: str) -> str:
149+
def _analyze_image(self, img: "Image") -> str:
208150
"""Analyze an image and extracts text using Tesseract OCR.
209151
210152
Args:
211153
img: The image to be analyzed.
212-
format: The format to use if it's possible
213154
214155
Returns:
215156
str: The extracted text content.
@@ -224,31 +165,21 @@ def _analyze_image(self, img: "Image", format: str) -> str:
224165
return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
225166

226167

227-
_PROMPT_IMAGES_TO_DESCRIPTION: BasePromptTemplate = PromptTemplate.from_template(
168+
_PROMPT_IMAGES_TO_DESCRIPTION: str = (
228169
"You are an assistant tasked with summarizing images for retrieval. "
229170
"1. These summaries will be embedded and used to retrieve the raw image. "
230171
"Give a concise summary of the image that is well optimized for retrieval\n"
231172
"2. extract all the text from the image. "
232173
"Do not exclude any content from the page.\n"
233-
"Format answer in {format} without explanatory text "
174+
"Format answer in markdown without explanatory text "
234175
"and without markdown delimiter ``` at the beginning. "
235-
"Respects the start of the format."
236176
)
237177

238178

239179
class LLMImageBlobParser(BaseImageBlobParser):
240180
"""Parser for analyzing images using a language model (LLM).
241181
242182
Attributes:
243-
format (Literal["text", "markdown-img", "html-img"]):
244-
The format for the parsed output.
245-
- "text" = return the content as is
246-
- "markdown-img" = wrap the content into an image markdown link, w/ link
247-
pointing to (`![body)(#)`]
248-
- "html-img" = wrap the content as the `alt` text of an tag and link to
249-
(`<img alt="{body}" src="#"/>`)
250-
- "markdown" = return markdown content
251-
- "html" = return html content
252183
model (BaseChatModel):
253184
The language model to use for analysis.
254185
prompt (str):
@@ -258,27 +189,22 @@ class LLMImageBlobParser(BaseImageBlobParser):
258189
def __init__(
259190
self,
260191
*,
261-
format: Literal[
262-
"text", "markdown-img", "html-img", "markdown", "html"
263-
] = "text",
264192
model: BaseChatModel,
265-
prompt: BasePromptTemplate = _PROMPT_IMAGES_TO_DESCRIPTION,
193+
prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
266194
):
267195
"""Initializes the LLMImageBlobParser.
268196
269197
Args:
270-
format (Literal["text", "markdown", "html"]):
271-
The format for the parsed output.
272198
model (BaseChatModel):
273199
The language model to use for analysis.
274200
prompt (str):
275201
The prompt to provide to the language model.
276202
"""
277-
super().__init__(format=format)
203+
super().__init__()
278204
self.model = model
279205
self.prompt = prompt
280206

281-
def _analyze_image(self, img: "Image", format: str) -> str:
207+
def _analyze_image(self, img: "Image") -> str:
282208
"""Analyze an image using the provided language model.
283209
284210
Args:

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from __future__ import annotations
44

5+
import html
56
import io
67
import logging
78
import threading
@@ -98,6 +99,30 @@ def extract_from_images_with_rapidocr(
9899
_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}
99100

100101

102+
def _format_inner_image(blob: Blob, content: str, format: str) -> str:
103+
"""Format the content of the image with the source of the blob.
104+
105+
blob: The blob containing the image.
106+
format:
107+
The format for the parsed output.
108+
- "text" = return the content as is
109+
- "markdown-img" = wrap the content into an image markdown link, w/ link
110+
pointing to (`![body](#)`)
111+
- "html-img" = wrap the content as the `alt` text of an <img> tag and link to
112+
(`<img alt="{body}" src="#"/>`)
113+
"""
114+
if content:
115+
source = blob.source or "#"
116+
if format == "markdown-img":
117+
content = content.replace("]", r"\\]")
118+
content = f"![{content}]({source})"
119+
elif format == "html-img":
120+
content = (
121+
f'<img alt="{html.escape(content, quote=True)} ' f'src="{source}" />'
122+
)
123+
return content
124+
125+
101126
def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
102127
"""Validate that the metadata has all the standard keys and the page is an integer.
103128
@@ -475,6 +500,7 @@ def __init__(
475500
mode: Literal["single", "page"] = "page",
476501
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
477502
images_parser: Optional[BaseImageBlobParser] = None,
503+
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
478504
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
479505
extract_tables_settings: Optional[dict[str, Any]] = None,
480506
) -> None:
@@ -488,6 +514,12 @@ def __init__(
488514
extraction.
489515
extract_images: Whether to extract images from the PDF.
490516
images_parser: Optional image blob parser.
517+
images_inner_format: The format for the parsed output.
518+
- "text" = return the content as is
519+
- "markdown-img" = wrap the content into an image markdown link, w/ link
520+
pointing to (`![body](#)`)
521+
- "html-img" = wrap the content as the `alt` text of an <img> tag and link to
522+
(`<img alt="{body}" src="#"/>`)
491523
extract_tables: Whether to extract tables in a specific format, such as
492524
"csv", "markdown", or "html".
493525
extract_tables_settings: Optional dictionary of settings for customizing
@@ -515,6 +547,7 @@ def __init__(
515547
if extract_images and not images_parser:
516548
images_parser = RapidOCRBlobParser()
517549
self.extract_images = extract_images
550+
self.images_inner_format = images_inner_format
518551
self.images_parser = images_parser
519552
self.extract_tables = extract_tables
520553
self.extract_tables_settings = extract_tables_settings
@@ -704,7 +737,11 @@ def _extract_images_from_page(
704737
blob = Blob.from_data(
705738
image_bytes.getvalue(), mime_type="application/x-npy"
706739
)
707-
images.append(next(self.images_parser.lazy_parse(blob)).page_content)
740+
image_text = next(self.images_parser.lazy_parse(blob)).page_content
741+
742+
images.append(
743+
_format_inner_image(blob, image_text, self.images_inner_format)
744+
)
708745
return _FORMAT_IMAGE_STR.format(
709746
image_text=_JOIN_IMAGES.join(filter(None, images))
710747
)

libs/community/tests/integration_tests/document_loaders/parsers/test_images.py

Lines changed: 4 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from langchain_core.documents.base import Blob
77
from langchain_core.language_models import FakeMessagesListChatModel
88
from langchain_core.messages import ChatMessage
9-
from langchain_openai import ChatOpenAI
109

1110
from langchain_community.document_loaders.parsers.images import (
1211
LLMImageBlobParser,
@@ -24,15 +23,7 @@
2423
"blob,body",
2524
[
2625
(building_image, ""),
27-
(text_image, r".*\bMAKE *TEXT\b.*\bSTAND\b.*\bOUT *FROM\b.*\bBACKGROUNDS\b.*"),
28-
],
29-
)
30-
@pytest.mark.parametrize(
31-
"format,pattern",
32-
[
33-
("text", r"(?ism)^{body}$"),
34-
("markdown-img", r"(?ism)^!\[{body}]\(.*\)|$"),
35-
("html-img", r'(?ism)^(<img alt="{body}" src=".*" />|)'),
26+
(text_image, r"(?ms).*MAKE.*TEXT.*STAND.*OUT.*FROM.*BACKGROUNDS.*"),
3627
],
3728
)
3829
@pytest.mark.parametrize(
@@ -56,55 +47,14 @@
5647
),
5748
],
5849
)
59-
def test_image_parser_with_differents_format_and_files(
50+
def test_image_parser_with_differents_files(
6051
blob_loader: Type,
6152
kw: dict[str, Any],
62-
format: str,
63-
pattern: str,
6453
blob: Blob,
6554
body: str,
6655
) -> None:
6756
if blob_loader == LLMImageBlobParser and "building" in str(blob.path):
6857
body = ".*building.*"
69-
documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
70-
assert len(documents) == 1
71-
assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
72-
73-
74-
@pytest.mark.parametrize(
75-
"blob,body",
76-
[
77-
(page_image, r".*Layout Detection Models.*"),
78-
],
79-
)
80-
@pytest.mark.parametrize(
81-
"format,pattern",
82-
[
83-
("html", r"^<.*>"),
84-
("markdown", r"^\*\*.*\*\*"),
85-
],
86-
)
87-
@pytest.mark.parametrize(
88-
"blob_loader,kw",
89-
[
90-
(
91-
LLMImageBlobParser,
92-
{
93-
"model": ChatOpenAI(
94-
model="gpt-4o",
95-
)
96-
},
97-
),
98-
],
99-
)
100-
def test_image_parser_with_extra_format(
101-
blob_loader: Type,
102-
kw: dict[str, Any],
103-
format: str,
104-
pattern: str,
105-
blob: Blob,
106-
body: str,
107-
) -> None:
108-
documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
58+
documents = list(blob_loader(**kw).lazy_parse(blob))
10959
assert len(documents) == 1
110-
assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
60+
assert re.compile(body).match(documents[0].page_content)

0 commit comments

Comments
 (0)