Skip to content

Commit 80ee3f7

Browse files
committed
Fix Images
1 parent 91234f0 commit 80ee3f7

File tree

9 files changed

+248
-124
lines changed

9 files changed

+248
-124
lines changed

docs/docs/integrations/document_loaders/pymupdf.ipynb

Lines changed: 74 additions & 78 deletions
Large diffs are not rendered by default.

libs/community/langchain_community/document_loaders/parsers/__init__.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
BS4HTMLParser,
1919
)
2020
from langchain_community.document_loaders.parsers.images import (
21-
MultimodalBlobParser,
21+
BaseImageBlobParser,
22+
LLMImageBlobParser,
2223
RapidOCRBlobParser,
2324
TesseractBlobParser,
2425
)
@@ -40,10 +41,11 @@
4041
_module_lookup = {
4142
"AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence", # noqa: E501
4243
"BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
44+
"BaseImageBlobParser": "langchain_community.document_loaders.parsers.images",
4345
"DocAIParser": "langchain_community.document_loaders.parsers.docai",
4446
"GrobidParser": "langchain_community.document_loaders.parsers.grobid",
4547
"LanguageParser": "langchain_community.document_loaders.parsers.language",
46-
"MultimodalBlobParser": "langchain_community.document_loaders.parsers.images",
48+
"LLMImageBlobParser": "langchain_community.document_loaders.parsers.images",
4749
"OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
4850
"PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
4951
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
@@ -65,11 +67,12 @@ def __getattr__(name: str) -> Any:
6567

6668
__all__ = [
6769
"AzureAIDocumentIntelligenceParser",
70+
"BaseImageBlobParser",
6871
"BS4HTMLParser",
6972
"DocAIParser",
7073
"GrobidParser",
7174
"LanguageParser",
72-
"MultimodalBlobParser",
75+
"LLMImageBlobParser",
7376
"OpenAIWhisperParser",
7477
"PDFMinerParser",
7578
"PDFPlumberParser",

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 142 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from abc import abstractmethod
66
from typing import TYPE_CHECKING, Iterator, Literal
77

8+
import numpy
89
import numpy as np
910
from langchain_core.documents import Document
1011
from langchain_core.language_models import BaseChatModel
@@ -19,53 +20,123 @@
1920
logger = logging.getLogger(__name__)
2021

2122

22-
class ImageBlobParser(BaseBlobParser):
23+
class BaseImageBlobParser(BaseBlobParser):
24+
"""
25+
Abstract base class for parsing image blobs into text.
26+
27+
Attributes:
28+
format (Literal["text", "markdown", "html"]):
29+
Output format of the parsed text.
30+
"""
31+
2332
def __init__(
2433
self,
2534
*,
2635
format: Literal["text", "markdown", "html"] = "text",
2736
):
37+
"""
38+
Initializes the BaseImageBlobParser.
39+
40+
Args:
41+
format (Literal["text", "markdown", "html"]):
42+
The format for the parsed output.
43+
"""
2844
self.format = format
2945

3046
@abstractmethod
3147
def _analyze_image(self, img: "Image") -> str:
48+
"""
49+
Abstract method to analyze an image and extract textual content.
50+
51+
Args:
52+
img (Image):
53+
The image to be analyzed.
54+
55+
Returns:
56+
str:
57+
The extracted text content.
58+
"""
3259
pass
3360

3461
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
62+
"""
63+
Lazily parses a blob and yields Document objects containing the parsed content.
64+
65+
Args:
66+
blob (Blob):
67+
The blob to be parsed.
68+
69+
Yields:
70+
Document:
71+
A document containing the parsed content and metadata.
72+
"""
3573
try:
3674
from PIL import Image as Img
3775

3876
with blob.as_bytes_io() as buf:
39-
img = Img.open(buf)
77+
if blob.mimetype == "application/x-npy":
78+
img = Img.fromarray(numpy.load(buf))
79+
else:
80+
img = Img.open(buf)
4081
content = self._analyze_image(img)
4182
if content:
83+
source = blob.source or "#"
4284
if self.format == "markdown":
4385
content = content.replace("]", r"\\]")
44-
content = f"![{content}](.)"
86+
content = f"![{content}]({source})"
4587
elif self.format == "html":
46-
content = f'<img alt="{html.escape(content, quote=True)}" />'
88+
content = (
89+
f'<img alt="{html.escape(content, quote=True)} '
90+
f'src="{source}" />'
91+
)
4792
logger.debug("Image text: %s", content.replace("\n", "\\n"))
4893
yield Document(
4994
page_content=content,
50-
metadata={"source": blob.source},
95+
metadata={**blob.metadata, **{"source": blob.source}},
5196
)
5297
except ImportError:
5398
raise ImportError(
54-
"`rapidocr-onnxruntime` package not found, please install it with "
99+
"`Pillow` package not found, please install it with "
55100
"`pip install Pillow`"
56101
)
57102

58103

59-
class RapidOCRBlobParser(ImageBlobParser):
104+
class RapidOCRBlobParser(BaseImageBlobParser):
105+
"""
106+
Parser for extracting text from images using the RapidOCR library.
107+
108+
Attributes:
109+
ocr:
110+
The RapidOCR instance for performing OCR.
111+
"""
112+
60113
def __init__(
61114
self,
62115
*,
63116
format: Literal["text", "markdown", "html"] = "text",
64117
):
118+
"""
119+
Initializes the RapidOCRBlobParser.
120+
121+
Args:
122+
format (Literal["text", "markdown", "html"]):
123+
The format for the parsed output.
124+
"""
65125
super().__init__(format=format)
66126
self.ocr = None
67127

68128
def _analyze_image(self, img: "Image") -> str:
129+
"""
130+
Analyzes an image and extracts text using RapidOCR.
131+
132+
Args:
133+
img (Image):
134+
The image to be analyzed.
135+
136+
Returns:
137+
str:
138+
The extracted text content.
139+
"""
69140
if not self.ocr:
70141
try:
71142
from rapidocr_onnxruntime import RapidOCR
@@ -83,17 +154,44 @@ def _analyze_image(self, img: "Image") -> str:
83154
return content
84155

85156

86-
class TesseractBlobParser(ImageBlobParser):
157+
class TesseractBlobParser(BaseImageBlobParser):
158+
"""
159+
Parser for extracting text from images using the Tesseract OCR library.
160+
161+
Attributes:
162+
langs (list[str]):
163+
The languages to use for OCR.
164+
"""
165+
87166
def __init__(
88167
self,
89168
*,
90169
format: Literal["text", "markdown", "html"] = "text",
91-
langs: list[str] = ["eng"],
170+
langs: list[str] = ("eng",),
92171
):
172+
"""
173+
Initializes the TesseractBlobParser.
174+
175+
Args:
176+
format (Literal["text", "markdown", "html"]):
177+
The format for the parsed output.
178+
langs (list[str]):
179+
The languages to use for OCR.
180+
"""
93181
super().__init__(format=format)
94182
self.langs = langs
95183

96184
def _analyze_image(self, img: "Image") -> str:
185+
"""
186+
Analyzes an image and extracts text using Tesseract OCR.
187+
188+
Args:
189+
img (Image):
190+
The image to be analyzed.
191+
192+
Returns:
193+
str: The extracted text content.
194+
"""
97195
try:
98196
import pytesseract
99197
except ImportError:
@@ -104,7 +202,7 @@ def _analyze_image(self, img: "Image") -> str:
104202
return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
105203

106204

107-
_prompt_images_to_description = (
205+
_PROMPT_IMAGES_TO_DESCRIPTION = (
108206
"You are an assistant tasked with summarizing "
109207
"images for retrieval. "
110208
"These summaries will be embedded and used to retrieve the raw image. "
@@ -113,19 +211,51 @@ def _analyze_image(self, img: "Image") -> str:
113211
)
114212

115213

116-
class MultimodalBlobParser(ImageBlobParser):
214+
class LLMImageBlobParser(BaseImageBlobParser):
215+
"""
216+
Parser for analyzing images using a language model (LLM).
217+
218+
Attributes:
219+
model (BaseChatModel):
220+
The language model to use for analysis.
221+
prompt (str):
222+
The prompt to provide to the language model.
223+
"""
224+
117225
def __init__(
118226
self,
119227
*,
120228
format: Literal["text", "markdown", "html"] = "text",
121229
model: BaseChatModel,
122-
prompt: str = _prompt_images_to_description,
230+
prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
123231
):
232+
"""
233+
Initializes the LLMImageBlobParser.
234+
235+
Args:
236+
format (Literal["text", "markdown", "html"]):
237+
The format for the parsed output.
238+
model (BaseChatModel):
239+
The language model to use for analysis.
240+
prompt (str):
241+
The prompt to provide to the language model.
242+
"""
124243
super().__init__(format=format)
125244
self.model = model
126245
self.prompt = prompt
127246

128247
def _analyze_image(self, img: "Image") -> str:
248+
"""
249+
Analyzes an image using the provided language model.
250+
251+
Args:
252+
img (Image):
253+
The image to be analyzed.
254+
255+
Returns:
256+
str:
257+
The extracted textual content.
258+
"""
129259
image_bytes = io.BytesIO()
130260
img.save(image_bytes, format="PNG")
131261
img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,14 @@
2020
)
2121
from urllib.parse import urlparse
2222

23+
import numpy
2324
import numpy as np
2425
from langchain_core.documents import Document
2526

2627
from langchain_community.document_loaders.base import BaseBlobParser
2728
from langchain_community.document_loaders.blob_loaders import Blob
2829
from langchain_community.document_loaders.parsers.images import (
29-
ImageBlobParser,
30+
BaseImageBlobParser,
3031
RapidOCRBlobParser,
3132
)
3233

@@ -216,7 +217,7 @@ class ImagesPdfParser(BaseBlobParser):
216217
def __init__(
217218
self,
218219
extract_images: bool,
219-
images_parser: Optional[ImageBlobParser],
220+
images_parser: Optional[BaseImageBlobParser],
220221
):
221222
"""Extract text from images.
222223
@@ -485,7 +486,7 @@ def __init__(
485486
password: Optional[str] = None,
486487
mode: Literal["single", "page"] = "page",
487488
pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
488-
images_parser: Optional[ImageBlobParser] = RapidOCRBlobParser(),
489+
images_parser: Optional[BaseImageBlobParser] = RapidOCRBlobParser(),
489490
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
490491
extract_tables_settings: Optional[dict[str, Any]] = None,
491492
) -> None:
@@ -637,14 +638,6 @@ def _get_page_content(
637638
extras.append(tables_from_page)
638639
all_text = _merge_text_and_extras(extras, text_from_page)
639640

640-
if not all_text:
641-
# logger.warning(
642-
# "Warning: Empty content on page %s of document %s",
643-
# page.number,
644-
# blob.source,
645-
# )
646-
pass
647-
648641
return all_text
649642

650643
def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict:
@@ -687,7 +680,6 @@ def _extract_images_from_page(
687680
if not self.extract_images:
688681
return ""
689682
import pymupdf
690-
from PIL import Image
691683

692684
img_list = page.get_images()
693685
images = []
@@ -699,8 +691,10 @@ def _extract_images_from_page(
699691
pix.height, pix.width, -1
700692
)
701693
image_bytes = io.BytesIO()
702-
Image.fromarray(image).save(image_bytes, format="PNG")
703-
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
694+
numpy.save(image_bytes, image)
695+
blob = Blob.from_data(
696+
image_bytes.getvalue(), mime_type="application/x-npy"
697+
)
704698
images.append(next(self.images_parser.lazy_parse(blob)).page_content)
705699
return _FORMAT_IMAGE_STR.format(
706700
image_text=_JOIN_IMAGES.join(filter(None, images))

libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from langchain_community.document_loaders.blob_loaders import Blob
3030
from langchain_community.document_loaders.dedoc import DedocBaseLoader
3131
from langchain_community.document_loaders.parsers.images import (
32-
ImageBlobParser,
32+
BaseImageBlobParser,
3333
RapidOCRBlobParser,
3434
)
3535
from langchain_community.document_loaders.parsers.pdf import (
@@ -480,7 +480,7 @@ def __init__(
480480
mode: Literal["single", "page"] = "page",
481481
pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
482482
extract_images: bool = False,
483-
images_parser: Optional[ImageBlobParser] = RapidOCRBlobParser(),
483+
images_parser: Optional[BaseImageBlobParser] = RapidOCRBlobParser(),
484484
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
485485
headers: Optional[dict] = None,
486486
extract_tables_settings: Optional[dict[str, Any]] = None,

0 commit comments

Comments
 (0)