Skip to content

Commit 4efc509

Browse files
pprados and eyurtsev
authored
community[minor]: Refactoring PyMuPDF parser, loader and add image blob parsers (#29063)
* Adds BlobParsers for images. These implementations can take an image and produce one or more documents per image. This interface can be used for exposing OCR capabilities. * Update PyMuPDFParser and Loader to standardize metadata, handle images, improve table extraction etc. - **Twitter handle:** pprados This is one part of a larger Pull Request (PR) that is too large to be submitted all at once. This specific part focuses on preparing the update of all parsers. For more details, see [PR 28970](#28970). --------- Co-authored-by: Eugene Yurtsev <[email protected]>
1 parent f175319 commit 4efc509

File tree

16 files changed

+2385
-186
lines changed

16 files changed

+2385
-186
lines changed

docs/docs/integrations/document_loaders/pymupdf.ipynb

Lines changed: 1154 additions & 34 deletions
Large diffs are not rendered by default.

libs/community/extended_testing_deps.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,14 @@ oracle-ads>=2.9.1,<3
6060
oracledb>=2.2.0,<3
6161
pandas>=2.0.1,<3
6262
pdfminer-six>=20221105,<20240706
63+
pdfplumber>=0.11
6364
pgvector>=0.1.6,<0.2
6465
playwright>=1.48.0,<2
6566
praw>=7.7.1,<8
6667
premai>=0.3.25,<0.4
6768
psychicapi>=0.8.0,<0.9
6869
pydantic>=2.7.4,<3
70+
pytesseract>=0.3.13
6971
py-trello>=0.19.0,<0.20
7072
pyjwt>=2.8.0,<3
7173
pymupdf>=1.22.3,<2

libs/community/langchain_community/document_loaders/parsers/__init__.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
from langchain_community.document_loaders.parsers.html import (
1818
BS4HTMLParser,
1919
)
20+
from langchain_community.document_loaders.parsers.images import (
21+
BaseImageBlobParser,
22+
LLMImageBlobParser,
23+
RapidOCRBlobParser,
24+
TesseractBlobParser,
25+
)
2026
from langchain_community.document_loaders.parsers.language import (
2127
LanguageParser,
2228
)
@@ -35,15 +41,19 @@
3541
_module_lookup = {
3642
"AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence", # noqa: E501
3743
"BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
44+
"BaseImageBlobParser": "langchain_community.document_loaders.parsers.images",
3845
"DocAIParser": "langchain_community.document_loaders.parsers.docai",
3946
"GrobidParser": "langchain_community.document_loaders.parsers.grobid",
4047
"LanguageParser": "langchain_community.document_loaders.parsers.language",
48+
"LLMImageBlobParser": "langchain_community.document_loaders.parsers.images",
4149
"OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
4250
"PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
4351
"PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
4452
"PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
4553
"PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
4654
"PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
55+
"RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
56+
"TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
4757
"VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
4858
}
4959

@@ -57,15 +67,19 @@ def __getattr__(name: str) -> Any:
5767

5868
__all__ = [
5969
"AzureAIDocumentIntelligenceParser",
70+
"BaseImageBlobParser",
6071
"BS4HTMLParser",
6172
"DocAIParser",
6273
"GrobidParser",
6374
"LanguageParser",
75+
"LLMImageBlobParser",
6476
"OpenAIWhisperParser",
6577
"PDFMinerParser",
6678
"PDFPlumberParser",
6779
"PyMuPDFParser",
6880
"PyPDFParser",
6981
"PyPDFium2Parser",
82+
"RapidOCRBlobParser",
83+
"TesseractBlobParser",
7084
"VsdxParser",
7185
]
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
import base64
2+
import io
3+
import logging
4+
from abc import abstractmethod
5+
from typing import TYPE_CHECKING, Iterable, Iterator
6+
7+
import numpy
8+
import numpy as np
9+
from langchain_core.documents import Document
10+
from langchain_core.language_models import BaseChatModel
11+
from langchain_core.messages import HumanMessage
12+
13+
from langchain_community.document_loaders.base import BaseBlobParser
14+
from langchain_community.document_loaders.blob_loaders import Blob
15+
16+
if TYPE_CHECKING:
17+
from PIL.Image import Image
18+
19+
logger = logging.getLogger(__name__)
20+
21+
22+
class BaseImageBlobParser(BaseBlobParser):
    """Abstract base class for parsing image blobs into text.

    Subclasses implement ``_analyze_image``; ``lazy_parse`` decodes the blob
    into a PIL image, delegates to ``_analyze_image`` and wraps the returned
    text in a ``Document``.
    """

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract its textual content.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted text content.
        """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse an image blob and yield one Document for it.

        Args:
            blob: The blob to be parsed. A blob whose mimetype is
                ``application/x-npy`` is loaded with ``numpy.load`` and
                converted with ``PIL.Image.fromarray``; any other blob is
                opened with ``PIL.Image.open``.

        Yields:
            A document whose ``page_content`` is the text returned by
            ``_analyze_image`` and whose metadata is the blob metadata with
            ``"source"`` set to ``blob.source``.

        Raises:
            ImportError: If the ``Pillow`` package is not installed.
        """
        # Guard ONLY the Pillow import. An ImportError raised further down by
        # `_analyze_image` (e.g. a missing OCR backend) must propagate with
        # its own message instead of being reported as a missing Pillow.
        try:
            from PIL import Image as Img
        except ImportError as e:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from e

        with blob.as_bytes_io() as buf:
            if blob.mimetype == "application/x-npy":
                img = Img.fromarray(numpy.load(buf))
            else:
                img = Img.open(buf)
            # Analysis happens while the buffer is still open: the image
            # object was created from `buf` and may still read from it.
            content = self._analyze_image(img)
            logger.debug("Image text: %s", content.replace("\n", "\\n"))
            yield Document(
                page_content=content,
                metadata={**blob.metadata, "source": blob.source},
            )
65+
66+
67+
class RapidOCRBlobParser(BaseImageBlobParser):
    """Image blob parser that extracts text with the RapidOCR library.

    Attributes:
        ocr:
            The RapidOCR engine. It starts as ``None`` and is created on the
            first call to ``_analyze_image``.
    """

    def __init__(
        self,
    ) -> None:
        """Create the parser without instantiating the OCR engine yet."""
        super().__init__()
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run RapidOCR on an image and return the recognized text.

        Args:
            img (Image):
                The image to be analyzed.

        Returns:
            str:
                The second field of every OCR result entry (presumably the
                recognized text), joined with newlines and stripped; an
                empty string when the engine returns no result.

        Raises:
            ImportError: If ``rapidocr-onnxruntime`` cannot be imported.
        """
        if not self.ocr:
            # The engine is built lazily so that constructing the parser
            # does not require the optional dependency.
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )
        detections, _ = self.ocr(np.array(img))  # type: ignore
        if not detections:
            return ""
        return "\n".join([entry[1] for entry in detections]).strip()
111+
112+
113+
class TesseractBlobParser(BaseImageBlobParser):
    """Parser for extracting text from images using the Tesseract OCR library.

    Attributes:
        langs: The Tesseract language codes used for OCR, stored as a list.
    """

    def __init__(
        self,
        *,
        langs: Iterable[str] = ("eng",),
    ):
        """Initialize the TesseractBlobParser.

        Args:
            langs: The languages to use for OCR. Any iterable of language
                codes is accepted; a single plain string (e.g. ``"eng"``) is
                treated as one language specification.
        """
        super().__init__()
        # A bare string is itself an iterable of characters: without this
        # guard ``langs="eng"`` would be stored as ``["e", "n", "g"]`` and
        # sent to Tesseract as ``"e+n+g"``.
        self.langs = [langs] if isinstance(langs, str) else list(langs)

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract text using Tesseract OCR.

        Args:
            img: The image to be analyzed.

        Returns:
            str: The text returned by ``pytesseract.image_to_string``,
            stripped of leading and trailing whitespace.

        Raises:
            ImportError: If the ``pytesseract`` package is not installed.
        """
        try:
            import pytesseract
        except ImportError:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            )
        # The configured languages are passed as one "+"-joined string.
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
147+
148+
149+
# Default instruction sent with every image by LLMImageBlobParser. It is
# assembled from its three newline-separated lines: the summary request, the
# text-extraction request and the output-format request.
_PROMPT_IMAGES_TO_DESCRIPTION: str = "\n".join(
    (
        "You are an assistant tasked with summarizing images for retrieval. "
        "1. These summaries will be embedded and used to retrieve the raw image. "
        "Give a concise summary of the image that is well optimized for retrieval",
        "2. extract all the text from the image. "
        "Do not exclude any content from the page.",
        "Format answer in markdown without explanatory text "
        "and without markdown delimiter ``` at the beginning. ",
    )
)
158+
159+
160+
class LLMImageBlobParser(BaseImageBlobParser):
    """Parser for analyzing images using a language model (LLM).

    Attributes:
        model (BaseChatModel):
            The language model to use for analysis.
        prompt (str):
            The prompt to provide to the language model.
    """

    def __init__(
        self,
        *,
        model: BaseChatModel,
        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
    ):
        """Initialize the LLMImageBlobParser.

        Args:
            model (BaseChatModel):
                The language model to use for analysis.
            prompt (str):
                The prompt to provide to the language model. It is passed
                through ``str.format`` before being sent, so literal braces
                in a custom prompt must be doubled (``{{`` / ``}}``).
        """
        super().__init__()
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image using the provided language model.

        The image is re-encoded as PNG, base64-encoded and sent to the model
        as a data URL in a single human message, next to the text prompt.

        Args:
            img: The image to be analyzed.

        Returns:
            The textual content of the model's reply.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            # NOTE(review): `format` here is the Python
                            # builtin, not an output-format setting; a
                            # `{format}` placeholder in the prompt would be
                            # rendered as the builtin's repr. Kept as-is for
                            # backward compatibility - confirm the intent.
                            "text": self.prompt.format(format=format),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # The payload is PNG (see `img.save` above),
                                # so the declared media type must be PNG too.
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        # Narrows the type for the checker; the declared return type is str.
        assert isinstance(result, str)
        return result

0 commit comments

Comments
 (0)