Skip to content

Commit b623750

Browse files
committed
Fix all remarks
1 parent 743a83e commit b623750

File tree

10 files changed

+359
-330
lines changed

10 files changed

+359
-330
lines changed

docs/docs/integrations/document_loaders/pymupdf.ipynb

Lines changed: 101 additions & 82 deletions
Large diffs are not rendered by default.

libs/community/langchain_community/document_loaders/parsers/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@
3030
from langchain_community.document_loaders.parsers.vsdx import (
3131
VsdxParser,
3232
)
33+
from langchain_community.document_loaders.parsers.images import (
34+
MultimodalBlobParser,
35+
RapidOCRBlobParser,
36+
TesseractBlobParser,
37+
)
3338

3439

3540
_module_lookup = {
@@ -38,12 +43,15 @@
3843
"DocAIParser": "langchain_community.document_loaders.parsers.docai",
3944
"GrobidParser": "langchain_community.document_loaders.parsers.grobid",
4045
"LanguageParser": "langchain_community.document_loaders.parsers.language",
46+
"MultimodalBlobParser": "langchain_community.document_loaders.parsers.images",
4147
"OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
4248
"PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
4349
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
4450
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
4551
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
4652
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
53+
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
54+
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
4755
"VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
4856
}
4957

@@ -61,11 +69,14 @@ def __getattr__(name: str) -> Any:
6169
"DocAIParser",
6270
"GrobidParser",
6371
"LanguageParser",
72+
"MultimodalBlobParser",
6473
"OpenAIWhisperParser",
6574
"PDFMinerParser",
6675
"PDFPlumberParser",
6776
"PyMuPDFParser",
6877
"PyPDFParser",
6978
"PyPDFium2Parser",
79+
"RapidOCRBlobParser",
80+
"TesseractBlobParser",
7081
"VsdxParser",
7182
]
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
import base64
import html
import io
import logging
from abc import abstractmethod
from typing import Iterable, Iterator, Literal

from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage
from PIL import Image

from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
15+
16+
logger = logging.getLogger(__name__)
17+
18+
19+
class ImageBlobParser(BaseBlobParser):
    """Abstract base for parsers that turn one image blob into one document.

    Subclasses implement :meth:`_analyze_image`. This class opens the blob
    with PIL, wraps the extracted text according to ``format`` and yields a
    single ``Document`` whose metadata carries the blob's ``source``.
    """

    # Characters that must be backslash-escaped inside Markdown alt text.
    # The backslash itself is included so that a pre-existing "\" in front of
    # a bracket cannot neutralise the escape we add.
    _MARKDOWN_ALT_ESCAPES = str.maketrans(
        {"\\": "\\\\", "[": "\\[", "]": "\\]"}
    )

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ) -> None:
        """Store the output format.

        Args:
            format: ``"text"`` leaves the extracted text as is, ``"markdown"``
                wraps it as the alt text of ``![...](.)`` and ``"html"`` wraps
                it as the ``alt`` attribute of an ``<img>`` tag.
        """
        self.format = format

    @abstractmethod
    def _analyze_image(self, img: Image.Image) -> str:
        """Return the text extracted from (or describing) ``img``."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Analyze the image held by ``blob`` and yield a single document.

        Args:
            blob: Blob whose bytes PIL can open as an image.

        Yields:
            One ``Document`` with the formatted text as ``page_content`` and
            ``{"source": blob.source}`` as metadata. It is yielded even when
            the analysis returns an empty string.
        """
        # Finish all image work before yielding so the byte buffer is not
        # kept open while the consumer holds the generator suspended.
        with blob.as_bytes_io() as buf:
            img = Image.open(buf)
            content = self._analyze_image(img)

        if content:
            if self.format == "markdown":
                alt_text = content.translate(self._MARKDOWN_ALT_ESCAPES)
                content = f"![{alt_text}](.)"
            elif self.format == "html":
                content = f'<img alt="{html.escape(content, quote=True)}" />'

        logger.debug("Image text: %s", content.replace("\n", "\\n"))
        yield Document(
            page_content=content,
            metadata={"source": blob.source},
        )
46+
47+
48+
class RapidOCRBlobParser(ImageBlobParser):
    """Image parser that extracts text with ``rapidocr-onnxruntime``.

    The OCR engine is imported and instantiated on the first call to
    :meth:`_analyze_image` and reused afterwards.
    """

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ) -> None:
        """Store the output format; the OCR engine is created lazily.

        Args:
            format: Output wrapping, forwarded to ``ImageBlobParser``.
        """
        super().__init__(format=format)
        self.ocr = None

    def _analyze_image(self, img: Image.Image) -> str:
        """Run OCR on ``img`` and return the recognised lines joined by newlines.

        Args:
            img: Image to analyze.

        Returns:
            The recognised text, stripped; ``""`` when the OCR result is empty.

        Raises:
            ImportError: If ``rapidocr-onnxruntime`` is not installed.
        """
        if self.ocr is None:
            try:
                from rapidocr_onnxruntime import RapidOCR
            except ImportError as err:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                ) from err
            self.ocr = RapidOCR()

        # NOTE(review): the image is passed through unchanged, as before;
        # confirm the installed rapidocr release accepts PIL images directly.
        ocr_result, _ = self.ocr(img)
        if not ocr_result:
            return ""
        # Index 1 of each result entry is used as the recognised text.
        return "\n".join(entry[1] for entry in ocr_result).strip()
72+
73+
74+
class TesseractBlobParser(ImageBlobParser):
    """Image parser that extracts text with ``pytesseract``."""

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        langs: Iterable[str] = ("eng",),
    ) -> None:
        """Store the output format and the OCR languages.

        Args:
            format: Output wrapping, forwarded to ``ImageBlobParser``.
            langs: Language codes; they are joined with ``"+"`` and passed as
                the ``lang`` argument of ``pytesseract.image_to_string``.
        """
        super().__init__(format=format)
        # Copy into a fresh list: the default is immutable and a caller's
        # list is never aliased, so instances cannot affect each other.
        self.langs = list(langs)

    def _analyze_image(self, img: Image.Image) -> str:
        """Run Tesseract on ``img`` and return the stripped text.

        Args:
            img: Image to analyze.

        Returns:
            The string returned by ``pytesseract.image_to_string``, stripped.

        Raises:
            ImportError: If ``pytesseract`` is not installed.
        """
        try:
            import pytesseract
        except ImportError as err:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            ) from err
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
95+
96+
97+
# Default instruction sent together with the image by MultimodalBlobParser.
_prompt_images_to_description = (
    "You are an assistant tasked with summarizing images for retrieval. "
    "These summaries will be embedded and used to retrieve the raw image. "
    "Give a concise summary of the image that is well optimized for "
    "retrieval and extract all the text from the image."
)
103+
104+
105+
class MultimodalBlobParser(ImageBlobParser):
    """Image parser that asks a chat model to describe the image."""

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
        model: BaseChatModel,
        prompt: str = _prompt_images_to_description,
    ) -> None:
        """Store the output format, the chat model and the prompt.

        Args:
            format: Output wrapping, forwarded to ``ImageBlobParser``.
            model: Chat model invoked with the prompt and the image.
            prompt: Text instruction sent alongside the image.
        """
        super().__init__(format=format)
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: Image.Image) -> str:
        """Send ``img`` to the chat model and return its textual answer.

        The image is re-encoded as PNG and embedded in the message as a
        base64 ``data:`` URL.

        Args:
            img: Image to describe.

        Returns:
            The ``content`` of the model's reply.

        Raises:
            TypeError: If the reply's ``content`` is not a plain string.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {"type": "text", "text": self.prompt},
                        {
                            "type": "image_url",
                            # The media type must match the PNG encoding above.
                            "image_url": {
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        # Explicit check instead of `assert`, which is stripped under `-O`.
        if not isinstance(result, str):
            raise TypeError(
                "Expected the model to return text content, got "
                f"{type(result).__name__}"
            )
        return result

0 commit comments

Comments
 (0)