Change the strategy for images_inner_format.

pprados · pprados · commit 0be6c885b56f · 2025-01-17T09:30:32.000+01:00
diff --git a/libs/community/langchain_community/document_loaders/parsers/images.py b/libs/community/langchain_community/document_loaders/parsers/images.py
@@ -1,5 +1,4 @@
 import base64
-import html
 import io
 import logging
 from abc import abstractmethod
@@ -10,7 +9,6 @@
 from langchain_core.documents import Document
 from langchain_core.language_models import BaseChatModel
 from langchain_core.messages import HumanMessage
-from langchain_core.prompts import BasePromptTemplate, PromptTemplate
 
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
@@ -44,12 +42,11 @@ def __init__(
         self.format = format
 
     @abstractmethod
-    def _analyze_image(self, img: "Image", format: str) -> str:
+    def _analyze_image(self, img: "Image") -> str:
         """Abstract method to analyze an image and extract textual content.
 
         Args:
             img: The image to be analyzed.
-            format: The format to use if it's possible
 
         Returns:
           The extracted text content.
@@ -73,22 +70,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                     img = Img.fromarray(numpy.load(buf))
                 else:
                     img = Img.open(buf)
-                format = (
-                    "text"
-                    if self.format in ("markdown-img", "html-img")
-                    else self.format
-                )
-                content = self._analyze_image(img, format)
-                if content:
-                    source = blob.source or "#"
-                    if self.format == "markdown-img":
-                        content = content.replace("]", r"\\]")
-                        content = f"![{content}]({source})"
-                    elif self.format == "html-img":
-                        content = (
-                            f'<img alt="{html.escape(content, quote=True)} '
-                            f'src="{source}" />'
-                        )
+                content = self._analyze_image(img)
                 logger.debug("Image text: %s", content.replace("\n", "\\n"))
                 yield Document(
                     page_content=content,
@@ -107,44 +89,24 @@ class RapidOCRBlobParser(BaseImageBlobParser):
     Attributes:
         ocr:
           The RapidOCR instance for performing OCR.
-        format (Literal["text", "markdown-img", "html-img"]):
-          The format for the parsed output.
-          - "text" = return the content as is
-          - "markdown-img" = wrap the content into an image markdown link, w/ link
-          pointing to (`![body)(#)`]
-          - "html-img" = wrap the content as the `alt` text of an tag and link to
-          (`<img alt="{body}" src="#"/>`)
     """
 
     def __init__(
         self,
-        *,
-        format: Literal["text", "markdown-img", "html-img"] = "text",
-    ):
+    ) -> None:
         """
         Initializes the RapidOCRBlobParser.
-
-        Args:
-            format (Literal["text", "markdown-img", "html-img"]):
-              The format for the parsed output.
-              - "text" = return the content as is
-              - "markdown-img" = wrap the content into an image markdown link, w/ link
-              pointing to (`![body)(#)`]
-              - "html-img" = wrap the content as the `alt` text of an tag and link to
-              (`<img alt="{body}" src="#"/>`)
         """
-        super().__init__(format=format)
+        super().__init__()
         self.ocr = None
 
-    def _analyze_image(self, img: "Image", format: str) -> str:
+    def _analyze_image(self, img: "Image") -> str:
         """
         Analyzes an image and extracts text using RapidOCR.
 
         Args:
             img (Image):
               The image to be analyzed.
-            format (str):
-              The format to use if it's possible
 
         Returns:
             str:
@@ -168,48 +130,27 @@ def _analyze_image(self, img: "Image", format: str) -> str:
 
 
 class TesseractBlobParser(BaseImageBlobParser):
-    """Parse for extracting text from images using the Tesseract OCR library.
-
-    Attributes:
-        format (Literal["text", "markdown-img", "html-img"]):
-          The format for the parsed output.
-          - "text" = return the content as is
-          - "markdown-img" = wrap the content into an image markdown link, w/ link
-          pointing to (`![body)(#)`]
-          - "html-img" = wrap the content as the `alt` text of an tag and link to
-          (`<img alt="{body}" src="#"/>`)
-        langs (list[str]):
-          The languages to use for OCR.
-    """
+    """Parser for extracting text from images using the Tesseract OCR library."""
 
     def __init__(
         self,
         *,
-        format: Literal["text", "markdown-img", "html-img"] = "text",
         langs: Iterable[str] = ("eng",),
     ):
         """Initialize the TesseractBlobParser.
 
         Args:
-            format (Literal["text", "markdown-img", "html-img"]):
-              The format for the parsed output.
-              - "text" = return the content as is
-              - "markdown-img" = wrap the content into an image markdown link, w/ link
-              pointing to (`![body)(#)`]
-              - "html-img" = wrap the content as the `alt` text of an tag and link to
-              (`<img alt="{body}" src="#"/>`)
             langs (list[str]):
               The languages to use for OCR.
         """
-        super().__init__(format=format)
+        super().__init__()
         self.langs = list(langs)
 
-    def _analyze_image(self, img: "Image", format: str) -> str:
+    def _analyze_image(self, img: "Image") -> str:
         """Analyze an image and extracts text using Tesseract OCR.
 
         Args:
             img: The image to be analyzed.
-            format: The format to use if it's possible
 
         Returns:
             str: The extracted text content.
@@ -224,31 +165,21 @@ def _analyze_image(self, img: "Image", format: str) -> str:
         return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
 
 
-_PROMPT_IMAGES_TO_DESCRIPTION: BasePromptTemplate = PromptTemplate.from_template(
+_PROMPT_IMAGES_TO_DESCRIPTION: str = (
     "You are an assistant tasked with summarizing images for retrieval. "
     "1. These summaries will be embedded and used to retrieve the raw image. "
     "Give a concise summary of the image that is well optimized for retrieval\n"
     "2. extract all the text from the image. "
     "Do not exclude any content from the page.\n"
-    "Format answer in {format} without explanatory text "
+    "Format answer in markdown without explanatory text "
     "and without markdown delimiter ``` at the beginning. "
-    "Respects the start of the format."
 )
 
 
 class LLMImageBlobParser(BaseImageBlobParser):
     """Parser for analyzing images using a language model (LLM).
 
     Attributes:
-        format (Literal["text", "markdown-img", "html-img"]):
-          The format for the parsed output.
-          - "text" = return the content as is
-          - "markdown-img" = wrap the content into an image markdown link, w/ link
-          pointing to (`![body)(#)`]
-          - "html-img" = wrap the content as the `alt` text of an tag and link to
-          (`<img alt="{body}" src="#"/>`)
-          - "markdown" = return markdown content
-          - "html" = return html content
         model (BaseChatModel):
           The language model to use for analysis.
         prompt (str):
@@ -258,27 +189,22 @@ class LLMImageBlobParser(BaseImageBlobParser):
     def __init__(
         self,
         *,
-        format: Literal[
-            "text", "markdown-img", "html-img", "markdown", "html"
-        ] = "text",
         model: BaseChatModel,
-        prompt: BasePromptTemplate = _PROMPT_IMAGES_TO_DESCRIPTION,
+        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
     ):
         """Initializes the LLMImageBlobParser.
 
         Args:
-            format (Literal["text", "markdown", "html"]):
-              The format for the parsed output.
             model (BaseChatModel):
               The language model to use for analysis.
             prompt (str):
               The prompt to provide to the language model.
         """
-        super().__init__(format=format)
+        super().__init__()
         self.model = model
         self.prompt = prompt
 
-    def _analyze_image(self, img: "Image", format: str) -> str:
+    def _analyze_image(self, img: "Image") -> str:
         """Analyze an image using the provided language model.
 
         Args:
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import html
 import io
 import logging
 import threading
@@ -98,6 +99,30 @@ def extract_from_images_with_rapidocr(
 _STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}
 
 
+def _format_inner_image(blob: Blob, content: str, format: str) -> str:
+    """Wrap the text extracted from an image according to `format`.
+
+    Args:
+        blob: The blob containing the image; `blob.source` (or "#") is the link.
+        content: The text extracted from the image; returned as is when empty.
+        format: "text" returns the content unchanged, "markdown-img" yields
+            `![content](source)`, "html-img" `<img alt="content" src="source" />`.
+
+    Returns:
+        The formatted content.
+    """
+    if content:
+        source = blob.source or "#"
+        if format == "markdown-img":
+            # A single backslash escapes "]" so it cannot close the alt text.
+            content = content.replace("]", r"\]")
+            content = f"![{content}]({source})"
+        elif format == "html-img":
+            alt = html.escape(content, quote=True)
+            content = f'<img alt="{alt}" src="{source}" />'
+    return content
+
+
 def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
     """Validate that the metadata has all the standard keys and the page is an integer.
 
@@ -475,6 +500,7 @@ def __init__(
         mode: Literal["single", "page"] = "page",
         pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
         images_parser: Optional[BaseImageBlobParser] = None,
+        images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
     ) -> None:
@@ -488,6 +514,12 @@ def __init__(
                 extraction.
             extract_images: Whether to extract images from the PDF.
             images_parser: Optional image blob parser.
+            images_inner_format: The format for the parsed output.
+                - "text" = return the content as is
+                - "markdown-img" = wrap the content into an image markdown link, w/ link
+                pointing to the source (`![body](#)`)
+                - "html-img" = wrap the content as the `alt` text of an `<img>` tag
+                linking to the source (`<img alt="{body}" src="#"/>`)
             extract_tables: Whether to extract tables in a specific format, such as
                 "csv", "markdown", or "html".
             extract_tables_settings: Optional dictionary of settings for customizing
@@ -515,6 +547,7 @@ def __init__(
         if extract_images and not images_parser:
             images_parser = RapidOCRBlobParser()
         self.extract_images = extract_images
+        self.images_inner_format = images_inner_format
         self.images_parser = images_parser
         self.extract_tables = extract_tables
         self.extract_tables_settings = extract_tables_settings
@@ -704,7 +737,11 @@ def _extract_images_from_page(
                 blob = Blob.from_data(
                     image_bytes.getvalue(), mime_type="application/x-npy"
                 )
-                images.append(next(self.images_parser.lazy_parse(blob)).page_content)
+                image_text = next(self.images_parser.lazy_parse(blob)).page_content
+
+                images.append(
+                    _format_inner_image(blob, image_text, self.images_inner_format)
+                )
         return _FORMAT_IMAGE_STR.format(
             image_text=_JOIN_IMAGES.join(filter(None, images))
         )
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_images.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_images.py
@@ -6,7 +6,6 @@
 from langchain_core.documents.base import Blob
 from langchain_core.language_models import FakeMessagesListChatModel
 from langchain_core.messages import ChatMessage
-from langchain_openai import ChatOpenAI
 
 from langchain_community.document_loaders.parsers.images import (
     LLMImageBlobParser,
@@ -24,15 +23,7 @@
     "blob,body",
     [
         (building_image, ""),
-        (text_image, r".*\bMAKE *TEXT\b.*\bSTAND\b.*\bOUT *FROM\b.*\bBACKGROUNDS\b.*"),
-    ],
-)
-@pytest.mark.parametrize(
-    "format,pattern",
-    [
-        ("text", r"(?ism)^{body}$"),
-        ("markdown-img", r"(?ism)^!\[{body}]\(.*\)|$"),
-        ("html-img", r'(?ism)^(<img alt="{body}" src=".*" />|)'),
+        (text_image, r"(?ms).*MAKE.*TEXT.*STAND.*OUT.*FROM.*BACKGROUNDS.*"),
     ],
 )
 @pytest.mark.parametrize(
@@ -56,55 +47,14 @@
         ),
     ],
 )
-def test_image_parser_with_differents_format_and_files(
+def test_image_parser_with_differents_files(
     blob_loader: Type,
     kw: dict[str, Any],
-    format: str,
-    pattern: str,
     blob: Blob,
     body: str,
 ) -> None:
     if blob_loader == LLMImageBlobParser and "building" in str(blob.path):
         body = ".*building.*"
-    documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
-    assert len(documents) == 1
-    assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
-
-
-@pytest.mark.parametrize(
-    "blob,body",
-    [
-        (page_image, r".*Layout Detection Models.*"),
-    ],
-)
-@pytest.mark.parametrize(
-    "format,pattern",
-    [
-        ("html", r"^<.*>"),
-        ("markdown", r"^\*\*.*\*\*"),
-    ],
-)
-@pytest.mark.parametrize(
-    "blob_loader,kw",
-    [
-        (
-            LLMImageBlobParser,
-            {
-                "model": ChatOpenAI(
-                    model="gpt-4o",
-                )
-            },
-        ),
-    ],
-)
-def test_image_parser_with_extra_format(
-    blob_loader: Type,
-    kw: dict[str, Any],
-    format: str,
-    pattern: str,
-    blob: Blob,
-    body: str,
-) -> None:
-    documents = list(blob_loader(format=format, **kw).lazy_parse(blob))
+    documents = list(blob_loader(**kw).lazy_parse(blob))
     assert len(documents) == 1
-    assert re.compile(pattern.format(body=body)).match(documents[0].page_content)
+    assert re.compile(body).match(documents[0].page_content)
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py