Skip to content

Commit 91234f0

Browse files
committed
Fix remarques
1 parent 20f5a41 commit 91234f0

File tree

10 files changed

+227
-152
lines changed

10 files changed

+227
-152
lines changed

libs/community/extended_testing_deps.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ praw>=7.7.1,<8
6666
premai>=0.3.25,<0.4
6767
psychicapi>=0.8.0,<0.9
6868
pydantic>=2.7.4,<3
69+
pytesseract>=0.3.13
6970
py-trello>=0.19.0,<0.20
7071
pyjwt>=2.8.0,<3
7172
pymupdf>=1.22.3,<2

libs/community/langchain_community/document_loaders/parsers/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717
from langchain_community.document_loaders.parsers.html import (
1818
BS4HTMLParser,
1919
)
20+
from langchain_community.document_loaders.parsers.images import (
21+
MultimodalBlobParser,
22+
RapidOCRBlobParser,
23+
TesseractBlobParser,
24+
)
2025
from langchain_community.document_loaders.parsers.language import (
2126
LanguageParser,
2227
)
@@ -30,11 +35,6 @@
3035
from langchain_community.document_loaders.parsers.vsdx import (
3136
VsdxParser,
3237
)
33-
from langchain_community.document_loaders.parsers.images import (
34-
MultimodalBlobParser,
35-
RapidOCRBlobParser,
36-
TesseractBlobParser,
37-
)
3838

3939

4040
_module_lookup = {

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 53 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,87 +3,97 @@
33
import io
44
import logging
55
from abc import abstractmethod
6+
from typing import TYPE_CHECKING, Iterator, Literal
67

7-
from PIL import Image
8-
from typing import Iterator, Literal
9-
10-
from langchain_community.document_loaders.base import BaseBlobParser
11-
from langchain_community.document_loaders.blob_loaders import Blob
8+
import numpy as np
129
from langchain_core.documents import Document
1310
from langchain_core.language_models import BaseChatModel
1411
from langchain_core.messages import HumanMessage
1512

13+
if TYPE_CHECKING:
14+
from PIL.Image import Image
15+
16+
from langchain_community.document_loaders.base import BaseBlobParser
17+
from langchain_community.document_loaders.blob_loaders import Blob
18+
1619
logger = logging.getLogger(__name__)
1720

1821

1922
class ImageBlobParser(BaseBlobParser):
    """Base class for blob parsers that turn an image into text.

    Subclasses implement ``_analyze_image``. ``lazy_parse`` opens the blob
    with Pillow, delegates to that hook, and wraps the returned text
    according to ``format``.
    """

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        """Store the output format.

        Args:
            format: ``"text"`` leaves the extracted text untouched,
                ``"markdown"`` wraps it as the alt text of an image link,
                ``"html"`` wraps it as the ``alt`` attribute of an ``<img>``.
        """
        self.format = format

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Return the text extracted from ``img``.

        Args:
            img: The image opened from the blob by ``lazy_parse``.

        Returns:
            The extracted text; an empty string makes ``lazy_parse`` yield
            nothing.
        """
        pass

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Analyze the image held by ``blob`` and yield its text.

        Args:
            blob: Blob whose bytes are an image Pillow can open.

        Yields:
            A single ``Document`` holding the formatted text, with the blob
            source in its metadata. Nothing is yielded when the analysis
            returns an empty string.

        Raises:
            ImportError: If Pillow is not installed. Import errors raised by
                ``_analyze_image`` propagate unchanged.
        """
        # Only the Pillow import is guarded: wrapping the whole body would
        # catch the ImportError a subclass raises for its own backend and
        # replace it with a misleading "install Pillow" message.
        try:
            from PIL import Image as Img
        except ImportError as err:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from err

        # The image is read while the buffer is open; the result is a plain
        # string, so formatting can happen after the buffer is closed.
        with blob.as_bytes_io() as buf:
            img = Img.open(buf)
            content = self._analyze_image(img)

        if not content:
            return

        if self.format == "markdown":
            # NOTE(review): the replacement is two backslashes followed by
            # "]"; a single-backslash escape may have been intended. Kept
            # as-is to preserve the current output - confirm.
            content = content.replace("]", r"\\]")
            content = f"![{content}](.)"
        elif self.format == "html":
            content = f'<img alt="{html.escape(content, quote=True)}" />'
        logger.debug("Image text: %s", content.replace("\n", "\\n"))
        yield Document(
            page_content=content,
            metadata={"source": blob.source},
        )
4657

4758

4859
class RapidOCRBlobParser(ImageBlobParser):
    """Image parser that extracts text with RapidOCR.

    The OCR engine is created on first use and then kept on the instance.
    """

    def __init__(
        self,
        *,
        format: Literal["text", "markdown", "html"] = "text",
    ):
        """Initialize the parser.

        Args:
            format: Output format forwarded to ``ImageBlobParser``.
        """
        super().__init__(format=format)
        # Engine is built lazily in ``_analyze_image``.
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run OCR on ``img`` and return the recognized text.

        Args:
            img: Image to analyze; it is converted to a NumPy array before
                being handed to the engine.

        Returns:
            The second element of every OCR result entry, joined by newlines
            and stripped; an empty string when the engine reports no result.

        Raises:
            ImportError: If `rapidocr-onnxruntime` cannot be imported.
        """
        if not self.ocr:
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )
        detections, _ = self.ocr(np.array(img))  # type: ignore
        if not detections:
            return ""
        recognized = [entry[1] for entry in detections]
        return "\n".join(recognized).strip()
7284

7385

7486
class TesseractBlobParser(ImageBlobParser):
75-
7687
def __init__(
77-
self,
78-
*,
79-
format: Literal["text", "markdown", "html"] = "text",
80-
langs: list[str] = ["eng"],
81-
88+
self,
89+
*,
90+
format: Literal["text", "markdown", "html"] = "text",
91+
langs: list[str] = ["eng"],
8292
):
8393
super().__init__(format=format)
8494
self.langs = langs
8595

86-
def _analyze_image(self, img: Image) -> str:
96+
def _analyze_image(self, img: "Image") -> str:
8797
try:
8898
import pytesseract
8999
except ImportError:
@@ -99,24 +109,23 @@ def _analyze_image(self, img: Image) -> str:
99109
"images for retrieval. "
100110
"These summaries will be embedded and used to retrieve the raw image. "
101111
"Give a concise summary of the image that is well optimized for retrieval "
102-
"and extract all the text from the image.")
112+
"and extract all the text from the image."
113+
)
103114

104115

105116
class MultimodalBlobParser(ImageBlobParser):
106-
107117
def __init__(
108-
self,
109-
*,
110-
format: Literal["text", "markdown", "html"] = "text",
111-
model: BaseChatModel,
112-
prompt: str = _prompt_images_to_description,
113-
118+
self,
119+
*,
120+
format: Literal["text", "markdown", "html"] = "text",
121+
model: BaseChatModel,
122+
prompt: str = _prompt_images_to_description,
114123
):
115124
super().__init__(format=format)
116125
self.model = model
117126
self.prompt = prompt
118127

119-
def _analyze_image(self, img: Image) -> str:
128+
def _analyze_image(self, img: "Image") -> str:
120129
image_bytes = io.BytesIO()
121130
img.save(image_bytes, format="PNG")
122131
img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 63 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,33 @@
22

33
from __future__ import annotations
44

5-
import html
65
import io
76
import logging
87
import threading
98
import warnings
109
from datetime import datetime
11-
from urllib.parse import urlparse
12-
13-
import numpy as np
1410
from typing import (
1511
TYPE_CHECKING,
1612
Any,
13+
Iterable,
1714
Iterator,
1815
Literal,
1916
Mapping,
2017
Optional,
2118
Sequence,
2219
Union,
2320
)
21+
from urllib.parse import urlparse
22+
23+
import numpy as np
24+
from langchain_core.documents import Document
2425

2526
from langchain_community.document_loaders.base import BaseBlobParser
2627
from langchain_community.document_loaders.blob_loaders import Blob
27-
from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
28-
RapidOCRBlobParser
29-
from langchain_core.documents import Document
28+
from langchain_community.document_loaders.parsers.images import (
29+
ImageBlobParser,
30+
RapidOCRBlobParser,
31+
)
3032

3133
if TYPE_CHECKING:
3234
import pdfminer
@@ -53,16 +55,49 @@
5355
"JBIG2Decode",
5456
]
5557

58+
59+
def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Each image is passed to a single RapidOCR engine. For every image that
    produces a result, the second element of each result entry is taken as
    the recognized text and the entries are joined with newlines. The
    per-image blocks are then concatenated without any separator.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images; an empty string when no image yields a
        result.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError:
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        )
    engine = RapidOCR()
    blocks = []
    for image in images:
        detections, _ = engine(image)
        if not detections:
            continue
        blocks.append("\n".join([entry[1] for entry in detections]))
    # Blocks are glued together directly, matching the original accumulation.
    return "".join(blocks)
88+
89+
5690
logger = logging.getLogger(__name__)
5791

5892
_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
5993
_JOIN_IMAGES = "\n"
6094
_JOIN_TABLES = "\n"
6195
_DEFAULT_PAGE_DELIMITOR = "\n\f"
6296

63-
_STD_METADATA_KEYS={"source", "total_pages", "creationdate", "creator", "producer"}
97+
_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}
98+
6499

65-
def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
100+
def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
66101
"""Validates the presence of at least the following keys:
67102
- source
68103
- page (if mode='page')
@@ -73,7 +108,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
73108
"""
74109
if not _STD_METADATA_KEYS.issubset(metadata.keys()):
75110
raise ValueError("The PDF parser must valorize the standard metadata.")
76-
if not isinstance(metadata.get("page",0), int):
111+
if not isinstance(metadata.get("page", 0), int):
77112
raise ValueError("The PDF metadata page must be a integer.")
78113
return metadata
79114

@@ -116,7 +151,10 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
116151
return new_metadata
117152

118153

119-
_PARAGRAPH_DELIMITOR = ["\n\n\n", "\n\n"] # To insert images or table in the middle of the page.
154+
_PARAGRAPH_DELIMITOR = [
155+
"\n\n\n",
156+
"\n\n",
157+
] # To insert images or table in the middle of the page.
120158

121159

122160
def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
@@ -132,7 +170,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
132170
"""
133171

134172
def _recurs_merge_text_and_extras(
135-
extras: list[str], text_from_page: str, recurs: bool
173+
extras: list[str], text_from_page: str, recurs: bool
136174
) -> Optional[str]:
137175
if extras:
138176
for delim in _PARAGRAPH_DELIMITOR:
@@ -151,8 +189,9 @@ def _recurs_merge_text_and_extras(
151189
str_extras = "\n\n".join(filter(lambda x: x, extras))
152190
if str_extras:
153191
all_extras = delim + str_extras
154-
all_text = text_from_page[:pos] + all_extras + text_from_page[
155-
pos:]
192+
all_text = (
193+
text_from_page[:pos] + all_extras + text_from_page[pos:]
194+
)
156195
break
157196
else:
158197
all_text = None
@@ -171,7 +210,6 @@ def _recurs_merge_text_and_extras(
171210
return all_text
172211

173212

174-
175213
class ImagesPdfParser(BaseBlobParser):
176214
"""Abstract interface for blob parsers with images_to_text."""
177215

@@ -218,8 +256,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
218256
)
219257

220258
def _extract_text_from_page(page: pypdf.PageObject) -> str:
221-
"""Extract text from image given the version of pypdf.
222-
"""
259+
"""Extract text from image given the version of pypdf."""
223260
if pypdf.__version__.startswith("3"):
224261
return page.extract_text()
225262
else:
@@ -561,11 +598,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
561598
for page in doc:
562599
all_text = self._get_page_content(doc, page, blob).strip()
563600
if self.mode == "page":
564-
565601
yield Document(
566602
page_content=all_text,
567-
metadata=_validate_metadata(doc_metadata |
568-
{"page": page.number}),
603+
metadata=_validate_metadata(
604+
doc_metadata | {"page": page.number}
605+
),
569606
)
570607
else:
571608
full_content.append(all_text)
@@ -658,17 +695,16 @@ def _extract_images_from_page(
658695
if self.images_parser:
659696
xref = img[0]
660697
pix = pymupdf.Pixmap(doc, xref)
661-
image=np.frombuffer(pix.samples, dtype=np.uint8).reshape(
662-
pix.height, pix.width, -1
663-
)
698+
image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
699+
pix.height, pix.width, -1
700+
)
664701
image_bytes = io.BytesIO()
665702
Image.fromarray(image).save(image_bytes, format="PNG")
666-
blob=Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
703+
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
667704
images.append(next(self.images_parser.lazy_parse(blob)).page_content)
668705
return _FORMAT_IMAGE_STR.format(
669-
image_text=_JOIN_IMAGES.join(filter(None,images))
670-
)
671-
706+
image_text=_JOIN_IMAGES.join(filter(None, images))
707+
)
672708

673709
def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
674710
"""Extract tables from a PDF page.

0 commit comments

Comments
 (0)