Commit 0f654a1

Add format "html" and "markdown" for LLMImageBlobParser
1 parent 4762fab commit 0f654a1

File tree

3 files changed: +60 additions, -9 deletions

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 19 additions & 9 deletions
@@ -10,6 +10,7 @@
 from langchain_core.documents import Document
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import HumanMessage
+from langchain_core.prompts import BasePromptTemplate, PromptTemplate

 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -231,12 +232,14 @@ def _analyze_image(self, img: "Image") -> str:
         return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()


-_PROMPT_IMAGES_TO_DESCRIPTION = (
-    "You are an assistant tasked with summarizing "
-    "images for retrieval. "
-    "These summaries will be embedded and used to retrieve the raw image. "
-    "Give a concise summary of the image that is well optimized for retrieval "
-    "and extract all the text from the image."
+_PROMPT_IMAGES_TO_DESCRIPTION: BasePromptTemplate = PromptTemplate.from_template(
+    "You are an assistant tasked with summarizing images for retrieval. "
+    "1. These summaries will be embedded and used to retrieve the raw image. "
+    "Give a concise summary of the image that is well optimized for retrieval\n"
+    "2. extract all the text from the image. "
+    "Do not exclude any content from the page.\n"
+    "Format response in {format} format without explanatory text "
+    "and without markdown delimiter ``` at the beginning.\n"
 )

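
The prompt constant is now a PromptTemplate rather than a plain string, so the requested output format can be substituted at call time. A minimal sketch of that substitution, assuming only langchain-core is installed (the template text below is shortened for illustration, not the commit's wording):

    from langchain_core.prompts import PromptTemplate

    # Shortened, illustrative template; the real one is defined in the diff above.
    prompt = PromptTemplate.from_template(
        "Summarize the image and format the response in {format} format."
    )

    # The parser performs an equivalent substitution before calling the model.
    print(prompt.format(format="markdown"))
    # -> Summarize the image and format the response in markdown format.
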

@@ -252,6 +255,8 @@ class LLMImageBlobParser(BaseImageBlobParser):
             pointing to (`![body)(#)`]
           - "html-img" = wrap the content as the `alt` text of an tag and link to
             (`<img alt="{body}" src="#"/>`)
+          - "markdown" = return markdown content
+          - "html" = return html content
         model (BaseChatModel):
             The language model to use for analysis.
         prompt (str):
@@ -261,9 +266,11 @@ class LLMImageBlobParser(BaseImageBlobParser):
     def __init__(
         self,
         *,
-        format: Literal["text", "markdown-link", "html-img"] = "text",
+        format: Literal[
+            "text", "markdown-link", "html-img", "markdown", "html"
+        ] = "text",
         model: BaseChatModel,
-        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
+        prompt: BasePromptTemplate = _PROMPT_IMAGES_TO_DESCRIPTION,
     ):
         """
         Initializes the LLMImageBlobParser.
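
Because `prompt` now expects a BasePromptTemplate, a caller can supply a custom template, provided it exposes the {format} variable the parser fills in. A hedged sketch of that call; the model choice and template wording are illustrative, not part of the commit:

    from langchain_core.prompts import PromptTemplate
    from langchain_openai import ChatOpenAI
    from langchain_community.document_loaders.parsers.images import LLMImageBlobParser

    # Illustrative custom prompt; it must accept the {format} variable.
    custom_prompt = PromptTemplate.from_template(
        "Describe the image for retrieval and answer in {format} format."
    )
    parser = LLMImageBlobParser(
        model=ChatOpenAI(model="gpt-4o"),  # illustrative model
        format="html",
        prompt=custom_prompt,
    )
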
@@ -299,7 +306,10 @@ def _analyze_image(self, img: "Image") -> str:
             [
                 HumanMessage(
                     content=[
-                        {"type": "text", "text": self.prompt},
+                        {
+                            "type": "text",
+                            "text": self.prompt.format(format=self.format),
+                        },
                         {
                             "type": "image_url",
                             "image_url": {

libs/community/tests/integration_tests/document_loaders/parsers/test_images.py

Lines changed: 41 additions & 0 deletions
@@ -6,6 +6,7 @@
 from langchain_core.documents.base import Blob
 from langchain_core.language_models import FakeMessagesListChatModel
 from langchain_core.messages import ChatMessage
+from langchain_openai import ChatOpenAI

 from langchain_community.document_loaders.parsers.images import (
     LLMImageBlobParser,
@@ -16,6 +17,7 @@
 path_base = Path(__file__).parent.parent.parent
 building_image = Blob.from_path(path_base / "examples/building.jpg")
 text_image = Blob.from_path(path_base / "examples/text.png")
+page_image = Blob.from_path(path_base / "examples/page.png")


 @pytest.mark.parametrize(
@@ -67,3 +69,42 @@ def test_image_parser_with_differents_format_and_files(
     documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
     assert len(documents) == 1
     assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
+
+
+@pytest.mark.parametrize(
+    "blob,body",
+    [
+        (page_image, r".*Layout Detection Models.*"),
+    ],
+)
+@pytest.mark.parametrize(
+    "format,pattern",
+    [
+        ("html", r"^<html"),
+        ("markdown", r"^\*\*.*\*\*"),
+    ],
+)
+@pytest.mark.parametrize(
+    "blob_loader,kw",
+    [
+        (
+            LLMImageBlobParser,
+            {
+                "model": ChatOpenAI(
+                    model="gpt-4o",
+                )
+            },
+        ),
+    ],
+)
+def test_image_parser_with_extra_format(
+    blob_loader: Type,
+    kw: dict[str, Any],
+    format: str,
+    pattern: str,
+    blob: Blob,
+    body: str,
+) -> None:
+    documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
+    assert len(documents) == 1
+    assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
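
The integration test above requires a real OpenAI model. For a credential-free check, the FakeMessagesListChatModel already imported in this module can stand in for the model. A hedged sketch; the canned response and image path are illustrative, not part of the commit:

    from langchain_core.documents.base import Blob
    from langchain_core.language_models import FakeMessagesListChatModel
    from langchain_core.messages import ChatMessage
    from langchain_community.document_loaders.parsers.images import LLMImageBlobParser

    # The fake model returns the canned message instead of calling an API.
    fake_model = FakeMessagesListChatModel(
        responses=[ChatMessage(role="assistant", content="**Layout Detection Models**")]
    )
    parser = LLMImageBlobParser(model=fake_model, format="markdown")
    documents = list(parser.lazy_parse(Blob.from_path("examples/page.png")))  # illustrative path
    assert documents[0].page_content.startswith("**")
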
Binary file added (280 KB, not rendered) — presumably the examples/page.png fixture referenced by the new test.
