Skip to content

Commit 0d99673

Browse files
committed
Fix remarques
1 parent 20f5a41 commit 0d99673

File tree

7 files changed

+151
-106
lines changed

7 files changed

+151
-106
lines changed

libs/community/langchain_community/document_loaders/parsers/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@
1717
from langchain_community.document_loaders.parsers.html import (
1818
BS4HTMLParser,
1919
)
20+
from langchain_community.document_loaders.parsers.images import (
21+
MultimodalBlobParser,
22+
RapidOCRBlobParser,
23+
TesseractBlobParser,
24+
)
2025
from langchain_community.document_loaders.parsers.language import (
2126
LanguageParser,
2227
)
@@ -30,11 +35,6 @@
3035
from langchain_community.document_loaders.parsers.vsdx import (
3136
VsdxParser,
3237
)
33-
from langchain_community.document_loaders.parsers.images import (
34-
MultimodalBlobParser,
35-
RapidOCRBlobParser,
36-
TesseractBlobParser,
37-
)
3838

3939

4040
_module_lookup = {

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,24 @@
33
import io
44
import logging
55
from abc import abstractmethod
6-
7-
from PIL import Image
86
from typing import Iterator, Literal
97

10-
from langchain_community.document_loaders.base import BaseBlobParser
11-
from langchain_community.document_loaders.blob_loaders import Blob
128
from langchain_core.documents import Document
139
from langchain_core.language_models import BaseChatModel
1410
from langchain_core.messages import HumanMessage
11+
from PIL import Image
12+
13+
from langchain_community.document_loaders.base import BaseBlobParser
14+
from langchain_community.document_loaders.blob_loaders import Blob
1515

1616
logger = logging.getLogger(__name__)
1717

1818

1919
class ImageBlobParser(BaseBlobParser):
2020
def __init__(
21-
self,
22-
*,
23-
format: Literal["text", "markdown", "html"] = "text",
21+
self,
22+
*,
23+
format: Literal["text", "markdown", "html"] = "text",
2424
):
2525
self.format = format
2626

@@ -47,9 +47,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
4747

4848
class RapidOCRBlobParser(ImageBlobParser):
4949
def __init__(
50-
self,
51-
*,
52-
format: Literal["text", "markdown", "html"] = "text",
50+
self,
51+
*,
52+
format: Literal["text", "markdown", "html"] = "text",
5353
):
5454
super().__init__(format=format)
5555
self.ocr = None
@@ -72,13 +72,11 @@ def _analyze_image(self, img: Image) -> str:
7272

7373

7474
class TesseractBlobParser(ImageBlobParser):
75-
7675
def __init__(
77-
self,
78-
*,
79-
format: Literal["text", "markdown", "html"] = "text",
80-
langs: list[str] = ["eng"],
81-
76+
self,
77+
*,
78+
format: Literal["text", "markdown", "html"] = "text",
79+
langs: list[str] = ["eng"],
8280
):
8381
super().__init__(format=format)
8482
self.langs = langs
@@ -99,18 +97,17 @@ def _analyze_image(self, img: Image) -> str:
9997
"images for retrieval. "
10098
"These summaries will be embedded and used to retrieve the raw image. "
10199
"Give a concise summary of the image that is well optimized for retrieval "
102-
"and extract all the text from the image.")
100+
"and extract all the text from the image."
101+
)
103102

104103

105104
class MultimodalBlobParser(ImageBlobParser):
106-
107105
def __init__(
108-
self,
109-
*,
110-
format: Literal["text", "markdown", "html"] = "text",
111-
model: BaseChatModel,
112-
prompt: str = _prompt_images_to_description,
113-
106+
self,
107+
*,
108+
format: Literal["text", "markdown", "html"] = "text",
109+
model: BaseChatModel,
110+
prompt: str = _prompt_images_to_description,
114111
):
115112
super().__init__(format=format)
116113
self.model = model

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 63 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,33 @@
22

33
from __future__ import annotations
44

5-
import html
65
import io
76
import logging
87
import threading
98
import warnings
109
from datetime import datetime
11-
from urllib.parse import urlparse
12-
13-
import numpy as np
1410
from typing import (
1511
TYPE_CHECKING,
1612
Any,
13+
Iterable,
1714
Iterator,
1815
Literal,
1916
Mapping,
2017
Optional,
2118
Sequence,
2219
Union,
2320
)
21+
from urllib.parse import urlparse
22+
23+
import numpy as np
24+
from langchain_core.documents import Document
2425

2526
from langchain_community.document_loaders.base import BaseBlobParser
2627
from langchain_community.document_loaders.blob_loaders import Blob
27-
from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
28-
RapidOCRBlobParser
29-
from langchain_core.documents import Document
28+
from langchain_community.document_loaders.parsers.images import (
29+
ImageBlobParser,
30+
RapidOCRBlobParser,
31+
)
3032

3133
if TYPE_CHECKING:
3234
import pdfminer
@@ -53,16 +55,49 @@
5355
"JBIG2Decode",
5456
]
5557

58+
59+
def extract_from_images_with_rapidocr(
    images: Sequence[Union[Iterable[np.ndarray], bytes]],
) -> str:
    """Extract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images. The entries recognized in one image are
        joined with newlines; the blocks of successive images are concatenated
        without any separator. Images for which the OCR engine returns an
        empty result contribute nothing.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    """
    try:
        from rapidocr_onnxruntime import RapidOCR
    except ImportError as err:
        # Chain explicitly so the underlying import failure stays attached to
        # the user-facing installation hint.
        raise ImportError(
            "`rapidocr-onnxruntime` package not found, please install it with "
            "`pip install rapidocr-onnxruntime`"
        ) from err
    ocr = RapidOCR()
    # Collect one chunk per image and join once at the end instead of growing
    # a string with repeated `+=`.
    chunks: list[str] = []
    for img in images:
        result, _ = ocr(img)
        if result:
            # Index 1 of each result entry is used as the recognized text
            # (presumably entries are (box, text, score) -- confirm against
            # the RapidOCR version in use).
            chunks.append("\n".join(entry[1] for entry in result))
    return "".join(chunks)
88+
89+
5690
logger = logging.getLogger(__name__)
5791

5892
_FORMAT_IMAGE_STR = "\n\n{image_text}\n\n"
5993
_JOIN_IMAGES = "\n"
6094
_JOIN_TABLES = "\n"
6195
_DEFAULT_PAGE_DELIMITOR = "\n\f"
6296

63-
_STD_METADATA_KEYS={"source", "total_pages", "creationdate", "creator", "producer"}
97+
_STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"}
98+
6499

65-
def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
100+
def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
66101
"""Validates the presence of at least the following keys:
67102
- source
68103
- page (if mode='page')
@@ -73,7 +108,7 @@ def _validate_metadata(metadata: dict[str, Any]) -> dict[str,Any]:
73108
"""
74109
if not _STD_METADATA_KEYS.issubset(metadata.keys()):
75110
raise ValueError("The PDF parser must valorize the standard metadata.")
76-
if not isinstance(metadata.get("page",0), int):
111+
if not isinstance(metadata.get("page", 0), int):
77112
raise ValueError("The PDF metadata page must be a integer.")
78113
return metadata
79114

@@ -116,7 +151,10 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]:
116151
return new_metadata
117152

118153

119-
_PARAGRAPH_DELIMITOR = ["\n\n\n", "\n\n"] # To insert images or table in the middle of the page.
154+
_PARAGRAPH_DELIMITOR = [
155+
"\n\n\n",
156+
"\n\n",
157+
] # To insert images or table in the middle of the page.
120158

121159

122160
def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
@@ -132,7 +170,7 @@ def _merge_text_and_extras(extras: list[str], text_from_page: str) -> str:
132170
"""
133171

134172
def _recurs_merge_text_and_extras(
135-
extras: list[str], text_from_page: str, recurs: bool
173+
extras: list[str], text_from_page: str, recurs: bool
136174
) -> Optional[str]:
137175
if extras:
138176
for delim in _PARAGRAPH_DELIMITOR:
@@ -151,8 +189,9 @@ def _recurs_merge_text_and_extras(
151189
str_extras = "\n\n".join(filter(lambda x: x, extras))
152190
if str_extras:
153191
all_extras = delim + str_extras
154-
all_text = text_from_page[:pos] + all_extras + text_from_page[
155-
pos:]
192+
all_text = (
193+
text_from_page[:pos] + all_extras + text_from_page[pos:]
194+
)
156195
break
157196
else:
158197
all_text = None
@@ -171,7 +210,6 @@ def _recurs_merge_text_and_extras(
171210
return all_text
172211

173212

174-
175213
class ImagesPdfParser(BaseBlobParser):
176214
"""Abstract interface for blob parsers with images_to_text."""
177215

@@ -218,8 +256,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
218256
)
219257

220258
def _extract_text_from_page(page: pypdf.PageObject) -> str:
221-
"""Extract text from image given the version of pypdf.
222-
"""
259+
"""Extract text from image given the version of pypdf."""
223260
if pypdf.__version__.startswith("3"):
224261
return page.extract_text()
225262
else:
@@ -561,11 +598,11 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
561598
for page in doc:
562599
all_text = self._get_page_content(doc, page, blob).strip()
563600
if self.mode == "page":
564-
565601
yield Document(
566602
page_content=all_text,
567-
metadata=_validate_metadata(doc_metadata |
568-
{"page": page.number}),
603+
metadata=_validate_metadata(
604+
doc_metadata | {"page": page.number}
605+
),
569606
)
570607
else:
571608
full_content.append(all_text)
@@ -658,17 +695,16 @@ def _extract_images_from_page(
658695
if self.images_parser:
659696
xref = img[0]
660697
pix = pymupdf.Pixmap(doc, xref)
661-
image=np.frombuffer(pix.samples, dtype=np.uint8).reshape(
662-
pix.height, pix.width, -1
663-
)
698+
image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
699+
pix.height, pix.width, -1
700+
)
664701
image_bytes = io.BytesIO()
665702
Image.fromarray(image).save(image_bytes, format="PNG")
666-
blob=Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
703+
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
667704
images.append(next(self.images_parser.lazy_parse(blob)).page_content)
668705
return _FORMAT_IMAGE_STR.format(
669-
image_text=_JOIN_IMAGES.join(filter(None,images))
670-
)
671-
706+
image_text=_JOIN_IMAGES.join(filter(None, images))
707+
)
672708

673709
def _extract_tables_from_page(self, page: pymupdf.Page) -> str:
674710
"""Extract tables from a PDF page.

libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,6 @@
77
from abc import ABC
88
from io import StringIO
99
from pathlib import Path, PurePath
10-
from urllib.parse import urlparse
11-
12-
import requests
1310
from typing import (
1411
TYPE_CHECKING,
1512
Any,
@@ -22,25 +19,30 @@
2219
Union,
2320
cast,
2421
)
22+
from urllib.parse import urlparse
23+
24+
import requests
25+
from langchain_core.documents import Document
26+
from langchain_core.utils import get_from_dict_or_env
2527

2628
from langchain_community.document_loaders.base import BaseLoader
2729
from langchain_community.document_loaders.blob_loaders import Blob
2830
from langchain_community.document_loaders.dedoc import DedocBaseLoader
29-
from langchain_community.document_loaders.parsers.images import ImageBlobParser, \
30-
RapidOCRBlobParser
31+
from langchain_community.document_loaders.parsers.images import (
32+
ImageBlobParser,
33+
RapidOCRBlobParser,
34+
)
3135
from langchain_community.document_loaders.parsers.pdf import (
36+
_DEFAULT_PAGE_DELIMITOR,
3237
AmazonTextractPDFParser,
3338
DocumentIntelligenceParser,
3439
PDFMinerParser,
3540
PDFPlumberParser,
3641
PyMuPDFParser,
3742
PyPDFium2Parser,
3843
PyPDFParser,
39-
_DEFAULT_PAGE_DELIMITOR,
4044
)
4145
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
42-
from langchain_core.documents import Document
43-
from langchain_core.utils import get_from_dict_or_env
4446

4547
if TYPE_CHECKING:
4648
from textractor.data.text_linearization_config import TextLinearizationConfig

0 commit comments

Comments
 (0)