Bugfix: pass the effective output format to `_analyze_image`

pprados · pprados · commit 1c78325d073e · 2025-01-15T13:23:53.000+01:00
diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
@@ -60,6 +60,7 @@ oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
 pdfminer-six>=20221105,<20240706
+pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
 praw>=7.7.1,<8
diff --git a/libs/community/langchain_community/document_loaders/parsers/images.py b/libs/community/langchain_community/document_loaders/parsers/images.py
@@ -50,13 +50,15 @@ def __init__(
         self.format = format
 
     @abstractmethod
-    def _analyze_image(self, img: "Image") -> str:
+    def _analyze_image(self, img: "Image", format: str) -> str:
         """
         Abstract method to analyze an image and extract textual content.
 
         Args:
             img (Image):
               The image to be analyzed.
+            format (str):
+              The requested output format, honored when the parser supports it.
 
         Returns:
             str:
@@ -84,7 +86,12 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                     img = Img.fromarray(numpy.load(buf))
                 else:
                     img = Img.open(buf)
-                content = self._analyze_image(img)
+                format = (
+                    "text"
+                    if self.format in ("markdown-link", "html-img")
+                    else self.format
+                )
+                content = self._analyze_image(img, format)
                 if content:
                     source = blob.source or "#"
                     if self.format == "markdown-link":
@@ -143,13 +150,15 @@ def __init__(
         super().__init__(format=format)
         self.ocr = None
 
-    def _analyze_image(self, img: "Image") -> str:
+    def _analyze_image(self, img: "Image", format: str) -> str:
         """
         Analyzes an image and extracts text using RapidOCR.
 
         Args:
             img (Image):
               The image to be analyzed.
+            format (str):
+              The requested output format, honored when the parser supports it.
 
         Returns:
             str:
@@ -211,13 +220,15 @@ def __init__(
         super().__init__(format=format)
         self.langs = list(langs)
 
-    def _analyze_image(self, img: "Image") -> str:
+    def _analyze_image(self, img: "Image", format: str) -> str:
         """
         Analyzes an image and extracts text using Tesseract OCR.
 
         Args:
             img (Image):
               The image to be analyzed.
+            format (str):
+              The requested output format, honored when the parser supports it.
 
         Returns:
             str: The extracted text content.
@@ -287,7 +298,7 @@ def __init__(
         self.model = model
         self.prompt = prompt
 
-    def _analyze_image(self, img: "Image") -> str:
+    def _analyze_image(self, img: "Image", format: str) -> str:
         """
         Analyzes an image using the provided language model.
 
@@ -308,7 +319,7 @@ def _analyze_image(self, img: "Image") -> str:
                     content=[
                         {
                             "type": "text",
-                            "text": self.prompt.format(format=self.format),
+                            "text": self.prompt.format(format=format),
                         },
                         {
                             "type": "image_url",
diff --git a/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py b/libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py
@@ -199,7 +199,7 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
             parser.password = old_password
 
     class EmptyImageBlobParser(BaseImageBlobParser):
-        def _analyze_image(self, img: Image) -> str:
+        def _analyze_image(self, img: Image, format: str) -> str:
             return "![image](#)"
 
     parser_class = getattr(pdf_parsers, parser_factory)