langchain-ai
diff --git a/docs/docs/integrations/document_loaders/pymupdf.ipynb b/docs/docs/integrations/document_loaders/pymupdf.ipynb
Lines changed: 74 additions & 78 deletions
diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py
Lines changed: 6 additions & 3 deletions
diff --git a/libs/community/langchain_community/document_loaders/parsers/images.py b/libs/community/langchain_community/document_loaders/parsers/images.py
Lines changed: 142 additions & 12 deletions
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
Lines changed: 8 additions & 14 deletions
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,8 @@
         BS4HTMLParser,
     )
     from langchain_community.document_loaders.parsers.images import (
-        MultimodalBlobParser,
+        BaseImageBlobParser,
+        LLMImageBlobParser,
         RapidOCRBlobParser,
         TesseractBlobParser,
     )
@@ -40,10 +41,11 @@
 _module_lookup = {
     "AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence",  # noqa: E501
     "BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
+    "BaseImageBlobParser": "langchain_community.document_loaders.parsers.images",
     "DocAIParser": "langchain_community.document_loaders.parsers.docai",
     "GrobidParser": "langchain_community.document_loaders.parsers.grobid",
     "LanguageParser": "langchain_community.document_loaders.parsers.language",
-    "MultimodalBlobParser": "langchain_community.document_loaders.parsers.images",
+    "LLMImageBlobParser": "langchain_community.document_loaders.parsers.images",
     "OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
     "PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
     "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
@@ -65,11 +67,12 @@ def __getattr__(name: str) -> Any:
 
 __all__ = [
     "AzureAIDocumentIntelligenceParser",
+    "BaseImageBlobParser",
     "BS4HTMLParser",
     "DocAIParser",
     "GrobidParser",
     "LanguageParser",
-    "MultimodalBlobParser",
+    "LLMImageBlobParser",
     "OpenAIWhisperParser",
     "PDFMinerParser",
     "PDFPlumberParser",
 
@@ -5,6 +5,7 @@
 from abc import abstractmethod
 from typing import TYPE_CHECKING, Iterator, Literal
 
+import numpy
 import numpy as np
 from langchain_core.documents import Document
 from langchain_core.language_models import BaseChatModel
@@ -19,53 +20,123 @@
 logger = logging.getLogger(__name__)
 
 
-class ImageBlobParser(BaseBlobParser):
+class BaseImageBlobParser(BaseBlobParser):
+    """
+    Abstract base class for parsing image blobs into text.
+
+    Attributes:
+        format (Literal["text", "markdown", "html"]):
+          Output format of the parsed text.
+    """
+
     def __init__(
         self,
         *,
         format: Literal["text", "markdown", "html"] = "text",
     ):
+        """
+        Initializes the BaseImageBlobParser.
+
+        Args:
+            format (Literal["text", "markdown", "html"]):
+              The format for the parsed output.
+        """
         self.format = format
 
     @abstractmethod
     def _analyze_image(self, img: "Image") -> str:
+        """
+        Abstract method to analyze an image and extract textual content.
+
+        Args:
+            img (Image):
+              The image to be analyzed.
+
+        Returns:
+            str:
+              The extracted text content.
+        """
         pass
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """
+        Lazily parses a blob and yields Document objects containing the parsed content.
+
+        Args:
+            blob (Blob):
+              The image blob; ``application/x-npy`` data is loaded as a numpy array.
+
+        Yields:
+            Document:
+              The text rendered per ``self.format``, plus the blob's metadata.
+        """
         try:
             from PIL import Image as Img
 
             with blob.as_bytes_io() as buf:
-                img = Img.open(buf)
+                if blob.mimetype == "application/x-npy":
+                    img = Img.fromarray(numpy.load(buf))
+                else:
+                    img = Img.open(buf)
                 content = self._analyze_image(img)
                 if content:
+                    source = blob.source or "#"
                     if self.format == "markdown":
                         content = content.replace("]", r"\\]")
-                        content = f"![{content}](.)"
+                        content = f"![{content}]({source})"
                     elif self.format == "html":
-                        content = f'<img alt="{html.escape(content, quote=True)}" />'
+                        content = (
+                            f'<img alt="{html.escape(content, quote=True)}" '
+                            f'src="{html.escape(str(source), quote=True)}" />'
+                        )
                 logger.debug("Image text: %s", content.replace("\n", "\\n"))
                 yield Document(
                     page_content=content,
-                    metadata={"source": blob.source},
+                    metadata={**blob.metadata, "source": blob.source},
                 )
         except ImportError:
             raise ImportError(
-                "`rapidocr-onnxruntime` package not found, please install it with "
+                "`Pillow` package not found, please install it with "
                 "`pip install Pillow`"
             )
 
 
-class RapidOCRBlobParser(ImageBlobParser):
+class RapidOCRBlobParser(BaseImageBlobParser):
+    """
+    Parser for extracting text from images using the RapidOCR library.
+
+    Attributes:
+        ocr:
+          The RapidOCR instance for performing OCR.
+    """
+
     def __init__(
         self,
         *,
         format: Literal["text", "markdown", "html"] = "text",
     ):
+        """
+        Initializes the RapidOCRBlobParser.
+
+        Args:
+            format (Literal["text", "markdown", "html"]):
+              The format for the parsed output.
+        """
         super().__init__(format=format)
         self.ocr = None
 
     def _analyze_image(self, img: "Image") -> str:
+        """
+        Analyzes an image and extracts text using RapidOCR.
+
+        Args:
+            img (Image):
+              The image to be analyzed.
+
+        Returns:
+            str:
+              The extracted text content.
+        """
         if not self.ocr:
             try:
                 from rapidocr_onnxruntime import RapidOCR
@@ -83,17 +154,44 @@ def _analyze_image(self, img: "Image") -> str:
         return content
 
 
-class TesseractBlobParser(ImageBlobParser):
+class TesseractBlobParser(BaseImageBlobParser):
+    """
+    Parser for extracting text from images using the Tesseract OCR library.
+
+    Attributes:
+        langs (list[str]):
+          The languages to use for OCR.
+    """
+
     def __init__(
         self,
         *,
         format: Literal["text", "markdown", "html"] = "text",
-        langs: list[str] = ["eng"],
+        langs: list[str] = ("eng",),
     ):
+        """
+        Initializes the TesseractBlobParser.
+
+        Args:
+            format (Literal["text", "markdown", "html"]):
+              The format for the parsed output.
+            langs (list[str]):
+              The languages to use for OCR.
+        """
         super().__init__(format=format)
         self.langs = langs
 
     def _analyze_image(self, img: "Image") -> str:
+        """
+        Analyzes an image and extracts text using Tesseract OCR.
+
+        Args:
+            img (Image):
+              The image to be analyzed.
+
+        Returns:
+            str: The extracted text content.
+        """
         try:
             import pytesseract
         except ImportError:
@@ -104,7 +202,7 @@ def _analyze_image(self, img: "Image") -> str:
         return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
 
 
-_prompt_images_to_description = (
+_PROMPT_IMAGES_TO_DESCRIPTION = (
     "You are an assistant tasked with summarizing "
     "images for retrieval. "
     "These summaries will be embedded and used to retrieve the raw image. "
@@ -113,19 +211,51 @@ def _analyze_image(self, img: "Image") -> str:
 )
 
 
-class MultimodalBlobParser(ImageBlobParser):
+class LLMImageBlobParser(BaseImageBlobParser):
+    """
+    Parser for analyzing images using a language model (LLM).
+
+    Attributes:
+        model (BaseChatModel):
+          The language model to use for analysis.
+        prompt (str):
+          The prompt to provide to the language model.
+    """
+
     def __init__(
         self,
         *,
         format: Literal["text", "markdown", "html"] = "text",
         model: BaseChatModel,
-        prompt: str = _prompt_images_to_description,
+        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
     ):
+        """
+        Initializes the LLMImageBlobParser.
+
+        Args:
+            format (Literal["text", "markdown", "html"]):
+              The format for the parsed output.
+            model (BaseChatModel):
+              The language model to use for analysis.
+            prompt (str):
+              The prompt to provide to the language model.
+        """
         super().__init__(format=format)
         self.model = model
         self.prompt = prompt
 
     def _analyze_image(self, img: "Image") -> str:
+        """
+        Analyzes an image using the provided language model.
+
+        Args:
+            img (Image):
+              The image to be analyzed.
+
+        Returns:
+            str:
+              The extracted textual content.
+        """
         image_bytes = io.BytesIO()
         img.save(image_bytes, format="PNG")
         img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
 
@@ -20,13 +20,14 @@
 )
 from urllib.parse import urlparse
 
+import numpy
 import numpy as np
 from langchain_core.documents import Document
 
 from langchain_community.document_loaders.base import BaseBlobParser
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.parsers.images import (
-    ImageBlobParser,
+    BaseImageBlobParser,
     RapidOCRBlobParser,
 )
 
@@ -216,7 +217,7 @@ class ImagesPdfParser(BaseBlobParser):
     def __init__(
         self,
         extract_images: bool,
-        images_parser: Optional[ImageBlobParser],
+        images_parser: Optional[BaseImageBlobParser],
     ):
         """Extract text from images.
 
@@ -485,7 +486,7 @@ def __init__(
         password: Optional[str] = None,
         mode: Literal["single", "page"] = "page",
         pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
-        images_parser: Optional[ImageBlobParser] = RapidOCRBlobParser(),
+        images_parser: Optional[BaseImageBlobParser] = RapidOCRBlobParser(),
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,
     ) -> None:
@@ -637,14 +638,6 @@ def _get_page_content(
             extras.append(tables_from_page)
         all_text = _merge_text_and_extras(extras, text_from_page)
 
-        if not all_text:
-            # logger.warning(
-            #     "Warning: Empty content on page %s of document %s",
-            #     page.number,
-            #     blob.source,
-            # )
-            pass
-
         return all_text
 
     def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict:
@@ -687,7 +680,6 @@ def _extract_images_from_page(
         if not self.extract_images:
             return ""
         import pymupdf
-        from PIL import Image
 
         img_list = page.get_images()
         images = []
@@ -699,8 +691,10 @@ def _extract_images_from_page(
                     pix.height, pix.width, -1
                 )
                 image_bytes = io.BytesIO()
-                Image.fromarray(image).save(image_bytes, format="PNG")
-                blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
+                numpy.save(image_bytes, image)
+                blob = Blob.from_data(
+                    image_bytes.getvalue(), mime_type="application/x-npy"
+                )
                 images.append(next(self.images_parser.lazy_parse(blob)).page_content)
         return _FORMAT_IMAGE_STR.format(
             image_text=_JOIN_IMAGES.join(filter(None, images))
 
@@ -29,7 +29,7 @@
 from langchain_community.document_loaders.blob_loaders import Blob
 from langchain_community.document_loaders.dedoc import DedocBaseLoader
 from langchain_community.document_loaders.parsers.images import (
-    ImageBlobParser,
+    BaseImageBlobParser,
     RapidOCRBlobParser,
 )
 from langchain_community.document_loaders.parsers.pdf import (
@@ -480,7 +480,7 @@ def __init__(
         mode: Literal["single", "page"] = "page",
         pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
         extract_images: bool = False,
-        images_parser: Optional[ImageBlobParser] = RapidOCRBlobParser(),
+        images_parser: Optional[BaseImageBlobParser] = RapidOCRBlobParser(),
         extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
         headers: Optional[dict] = None,
         extract_tables_settings: Optional[dict[str, Any]] = None,