Commit 0f654a1

Add format "html" and "markdown" for LLMImageBlobParser
1 parent 4762fab commit 0f654a1

File tree

3 files changed: +60 additions, -9 deletions

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 19 additions & 9 deletions
@@ -10,6 +10,7 @@
 from langchain_core.documents import Document
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import HumanMessage
+from langchain_core.prompts import BasePromptTemplate, PromptTemplate

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -231,12 +232,14 @@ def _analyze_image(self, img: "Image") -> str:
         return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()


-_PROMPT_IMAGES_TO_DESCRIPTION = (
-    "You are an assistant tasked with summarizing "
-    "images for retrieval. "
-    "These summaries will be embedded and used to retrieve the raw image. "
-    "Give a concise summary of the image that is well optimized for retrieval "
-    "and extract all the text from the image."
+_PROMPT_IMAGES_TO_DESCRIPTION: BasePromptTemplate = PromptTemplate.from_template(
+    "You are an assistant tasked with summarizing images for retrieval. "
+    "1. These summaries will be embedded and used to retrieve the raw image. "
+    "Give a concise summary of the image that is well optimized for retrieval\n"
+    "2. extract all the text from the image. "
+    "Do not exclude any content from the page.\n"
+    "Format response in {format} format without explanatory text "
+    "and without markdown delimiter ``` at the beginning.\n"
 )

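
The prompt constant is now a PromptTemplate rather than a plain string, so the requested output format can be substituted at call time. A minimal sketch of that substitution, assuming only langchain-core is installed (the template text below is shortened for illustration, not the commit's wording):

    from langchain_core.prompts import PromptTemplate

    # Shortened, illustrative template; the real one is defined in the diff above.
    prompt = PromptTemplate.from_template(
        "Summarize the image and format the response in {format} format."
    )

    # The parser performs an equivalent substitution before calling the model.
    print(prompt.format(format="markdown"))
    # -> Summarize the image and format the response in markdown format.
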

@@ -252,6 +255,8 @@ class LLMImageBlobParser(BaseImageBlobParser):
             pointing to (`![body)(#)`]
           - "html-img" = wrap the content as the `alt` text of an tag and link to
             (`<img alt="{body}" src="#"/>`)
+          - "markdown" = return markdown content
+          - "html" = return html content
         model (BaseChatModel):
             The language model to use for analysis.
         prompt (str):
@@ -261,9 +266,11 @@ class LLMImageBlobParser(BaseImageBlobParser):
     def __init__(
         self,
         *,
-        format: Literal["text", "markdown-link", "html-img"] = "text",
+        format: Literal[
+            "text", "markdown-link", "html-img", "markdown", "html"
+        ] = "text",
         model: BaseChatModel,
-        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
+        prompt: BasePromptTemplate = _PROMPT_IMAGES_TO_DESCRIPTION,
     ):
         """
         Initializes the LLMImageBlobParser.
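
Because `prompt` now expects a BasePromptTemplate, a caller can supply a custom template, provided it exposes the {format} variable the parser fills in. A hedged sketch of that call; the model choice and template wording are illustrative, not part of the commit:

    from langchain_core.prompts import PromptTemplate
    from langchain_openai import ChatOpenAI
    from langchain_community.document_loaders.parsers.images import LLMImageBlobParser

    # Illustrative custom prompt; it must accept the {format} variable.
    custom_prompt = PromptTemplate.from_template(
        "Describe the image for retrieval and answer in {format} format."
    )
    parser = LLMImageBlobParser(
        model=ChatOpenAI(model="gpt-4o"),  # illustrative model
        format="html",
        prompt=custom_prompt,
    )
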
@@ -299,7 +306,10 @@ def _analyze_image(self, img: "Image") -> str:
             [
                 HumanMessage(
                     content=[
-                        {"type": "text", "text": self.prompt},
+                        {
+                            "type": "text",
+                            "text": self.prompt.format(format=self.format),
+                        },
                         {
                             "type": "image_url",
                             "image_url": {

libs/community/tests/integration_tests/document_loaders/parsers/test_images.py

Lines changed: 41 additions & 0 deletions
@@ -6,6 +6,7 @@
 from langchain_core.documents.base import Blob
 from langchain_core.language_models import FakeMessagesListChatModel
 from langchain_core.messages import ChatMessage
+from langchain_openai import ChatOpenAI

 from langchain_community.document_loaders.parsers.images import (
     LLMImageBlobParser,
@@ -16,6 +17,7 @@
 path_base = Path(__file__).parent.parent.parent
 building_image = Blob.from_path(path_base / "examples/building.jpg")
 text_image = Blob.from_path(path_base / "examples/text.png")
+page_image = Blob.from_path(path_base / "examples/page.png")


 @pytest.mark.parametrize(
@@ -67,3 +69,42 @@ def test_image_parser_with_differents_format_and_files(
     documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
     assert len(documents) == 1
     assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
+
+
+@pytest.mark.parametrize(
+    "blob,body",
+    [
+        (page_image, r".*Layout Detection Models.*"),
+    ],
+)
+@pytest.mark.parametrize(
+    "format,pattern",
+    [
+        ("html", r"^<html"),
+        ("markdown", r"^\*\*.*\*\*"),
+    ],
+)
+@pytest.mark.parametrize(
+    "blob_loader,kw",
+    [
+        (
+            LLMImageBlobParser,
+            {
+                "model": ChatOpenAI(
+                    model="gpt-4o",
+                )
+            },
+        ),
+    ],
+)
+def test_image_parser_with_extra_format(
+    blob_loader: Type,
+    kw: dict[str, Any],
+    format: str,
+    pattern: str,
+    blob: Blob,
+    body: str,
+) -> None:
+    documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
+    assert len(documents) == 1
+    assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
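
The integration test above requires a real OpenAI model. For a credential-free check, the FakeMessagesListChatModel already imported in this module can stand in for the model. A hedged sketch; the canned response and image path are illustrative, not part of the commit:

    from langchain_core.documents.base import Blob
    from langchain_core.language_models import FakeMessagesListChatModel
    from langchain_core.messages import ChatMessage
    from langchain_community.document_loaders.parsers.images import LLMImageBlobParser

    # The fake model returns the canned message instead of calling an API.
    fake_model = FakeMessagesListChatModel(
        responses=[ChatMessage(role="assistant", content="**Layout Detection Models**")]
    )
    parser = LLMImageBlobParser(model=fake_model, format="markdown")
    documents = list(parser.lazy_parse(Blob.from_path("examples/page.png")))  # illustrative path
    assert documents[0].page_content.startswith("**")
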
Binary file added (280 KB, not rendered) — presumably the examples/page.png fixture referenced by the new test.
