Skip to content

Commit 88a9f5b

Browse files
committed
Fix review
1 parent 97996ea commit 88a9f5b

File tree

7 files changed

+267
-344
lines changed

7 files changed

+267
-344
lines changed

docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb

Lines changed: 176 additions & 248 deletions
Large diffs are not rendered by default.

libs/community/langchain_community/document_loaders/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,6 @@
360360
PyPDFium2Loader,
361361
PyPDFLoader,
362362
UnstructuredPDFLoader,
363-
ZeroxPDFLoader,
364363
)
365364
from langchain_community.document_loaders.pebblo import (
366365
PebbloSafeLoader,
@@ -733,7 +732,6 @@
733732
"YoutubeAudioLoader": "langchain_community.document_loaders.blob_loaders",
734733
"YoutubeLoader": "langchain_community.document_loaders.youtube",
735734
"YuqueLoader": "langchain_community.document_loaders.yuque",
736-
"ZeroxPDFLoader": "langchain_community.document_loaders.pdf",
737735
}
738736

739737

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import warnings
1111
from asyncio import AbstractEventLoop
1212
from datetime import datetime
13+
from multiprocessing.pool import ThreadPool
1314
from pathlib import Path
1415
from tempfile import NamedTemporaryFile, TemporaryDirectory
1516
from typing import (
@@ -1471,6 +1472,7 @@ def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
14711472

14721473
return extract_from_images_with_rapidocr(images)
14731474

1475+
14741476
_map_extract_tables: Dict[Literal["markdown", "html", None], str] = {
14751477
"markdown": "",
14761478
"html": "But, use html syntax for convert all tables. ",
@@ -1483,6 +1485,7 @@ def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
14831485
"describe it. ",
14841486
}
14851487

1488+
14861489
class ZeroxPDFParser(BaseBlobParser):
14871490
"""Parse a blob from a PDF using `py-zerox` library.
14881491
@@ -1532,6 +1535,9 @@ class ZeroxPDFParser(BaseBlobParser):
15321535
print(docs[0].page_content[:100])
15331536
print(docs[0].metadata)
15341537
"""
1538+
1539+
_pool = ThreadPool()
1540+
15351541
_warn_images_to_text = False
15361542
_warn_creator = False
15371543
_prompt = (
@@ -1545,8 +1551,9 @@ class ZeroxPDFParser(BaseBlobParser):
15451551

15461552
@staticmethod
15471553
def _run_async_from_thread(coro, loop):
1548-
future = asyncio.run_coroutine_threadsafe(coro,
1549-
loop) # Lancer la coroutine dans la boucle existante
1554+
future = asyncio.run_coroutine_threadsafe(
1555+
coro, loop
1556+
) # Schedule the coroutine on the existing event loop
15501557
return future.result() # Block until the result is available
15511558

15521559
def __init__(
@@ -1686,11 +1693,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
16861693
zerox_prompt = PromptTemplate.from_template(
16871694
self.custom_system_prompt
16881695
).format(prompt_tables=prompt_tables, prompt_images=prompt_images)
1689-
# async def toto():
1690-
# await asyncio.sleep(0)
1691-
# return "hello"
1692-
# coro=toto()
1693-
coro=zerox(
1696+
coro = zerox(
16941697
file_path=str(file_path),
16951698
model=self.model,
16961699
cleanup=self.cleanup,
@@ -1703,10 +1706,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
17031706
try:
17041707
loop = asyncio.get_running_loop()
17051708

1706-
from multiprocessing.pool import ThreadPool
1707-
pool = ThreadPool(processes=1)
1708-
zerox_output = pool.apply_async(
1709-
lambda : loop.run_until_complete(coro)).get() # tuple of args for foo
1709+
zerox_output = ZeroxPDFParser._pool.apply_async(
1710+
lambda: loop.run_until_complete(coro)
1711+
).get() # Block until the pool worker returns the result
17101712

17111713
except RuntimeError:
17121714
zerox_output = asyncio.run(coro)

libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py

Lines changed: 51 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,48 +2,43 @@
22

33
import re
44
from pathlib import Path
5+
from typing import TYPE_CHECKING, Iterator, Type
56

67
import pytest
7-
from typing import TYPE_CHECKING, Iterator
88

9-
from langchain_community.document_loaders import PDFMinerLoader, PDFPlumberLoader, \
10-
PyMuPDFLoader, PyPDFium2Loader, PyPDFLoader
119
from langchain_community.document_loaders.base import BaseBlobParser
1210
from langchain_community.document_loaders.blob_loaders import Blob
1311
from langchain_community.document_loaders.parsers import (
1412
BaseImageBlobParser,
1513
PDFPlumberParser,
1614
)
17-
from langchain_community.document_loaders.parsers.pdf import ZeroxPDFParser, \
18-
PyMuPDFParser, PDFMinerParser, PyPDFium2Parser, PyPDFParser
15+
from langchain_community.document_loaders.parsers.pdf import (
16+
PDFMinerParser,
17+
PyMuPDFParser,
18+
PyPDFium2Parser,
19+
PyPDFParser,
20+
ZeroxPDFParser,
21+
)
1922

2023
if TYPE_CHECKING:
2124
from PIL.Image import Image
2225

23-
_map_parser = {
24-
'PDFMinerParser': PDFMinerParser,
25-
'PDFPlumberParser': PDFPlumberParser,
26-
'PyMuPDFParser': PyMuPDFParser,
27-
'PyPDFium2Parser': PyPDFium2Parser,
28-
'PyPDFParser': PyPDFParser,
29-
'ZeroxPDFParser': ZeroxPDFParser,
30-
}
3126

3227
# PDFs to test parsers on.
3328
HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
3429

3530
LAYOUT_PARSER_PAPER_PDF = (
36-
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
31+
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
3732
)
3833

3934
LAYOUT_PARSER_PAPER_PASSWORD_PDF = (
40-
Path(__file__).parent.parent.parent
41-
/ "examples"
42-
/ "layout-parser-paper-password.pdf"
35+
Path(__file__).parent.parent.parent
36+
/ "examples"
37+
/ "layout-parser-paper-password.pdf"
4338
)
4439

4540
DUPLICATE_CHARS = (
46-
Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
41+
Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
4742
)
4843

4944

@@ -123,30 +118,30 @@ def _analyze_image(self, img: "Image") -> str:
123118
[("single", EmptyImageBlobParser()), ("page", None)],
124119
)
125120
@pytest.mark.parametrize(
126-
"parser_factory,params",
121+
"parser_class,params",
127122
[
128-
("PDFMinerParser", {}),
129-
("PyMuPDFParser", {}),
130-
("PyPDFium2Parser", {}),
131-
("PyPDFParser", {"extraction_mode": "plain"}),
132-
("PyPDFParser", {"extraction_mode": "layout"}),
133-
("ZeroxPDFParser", {}),
123+
(PDFMinerParser, {}),
124+
(PyMuPDFParser, {}),
125+
(PyPDFium2Parser, {}),
126+
(PyPDFParser, {"extraction_mode": "plain"}),
127+
(PyPDFParser, {"extraction_mode": "layout"}),
128+
(ZeroxPDFParser, {}),
134129
],
135130
)
136131
@pytest.mark.requires("pillow")
137132
def test_mode_and_extract_images_variations(
138-
parser_factory: str,
139-
params: dict,
140-
mode: str,
141-
image_parser: BaseImageBlobParser,
133+
parser_class: Type,
134+
params: dict,
135+
mode: str,
136+
image_parser: BaseImageBlobParser,
142137
) -> None:
143-
if parser_factory == "ZeroxPDFParser":
138+
if parser_class == ZeroxPDFParser:
144139
try:
145140
import pyzerox # noqa: F401
146141
except ImportError:
147142
pytest.skip("py-zerox is valid only with Python +3.11")
148143
_test_matrix(
149-
parser_factory,
144+
parser_class,
150145
params,
151146
mode,
152147
image_parser,
@@ -159,23 +154,23 @@ def test_mode_and_extract_images_variations(
159154
["text", "markdown-img", "html-img"],
160155
)
161156
@pytest.mark.parametrize(
162-
"parser_factory,params",
157+
"parser_class,params",
163158
[
164-
("PDFMinerParser", {}),
165-
("PyMuPDFParser", {}),
166-
("PyPDFium2Parser", {}),
167-
("PyPDFParser", {"extraction_mode": "plain"}),
168-
("PyPDFParser", {"extraction_mode": "layout"}),
169-
("ZeroxPDFParser", {}),
159+
(PDFMinerParser, {}),
160+
(PyMuPDFParser, {}),
161+
(PyPDFium2Parser, {}),
162+
(PyPDFParser, {"extraction_mode": "plain"}),
163+
(PyPDFParser, {"extraction_mode": "layout"}),
164+
(ZeroxPDFParser, {}),
170165
],
171166
)
172167
@pytest.mark.requires("pillow")
173168
def test_mode_and_image_formats_variations(
174-
parser_factory: str,
175-
params: dict,
176-
images_inner_format: str,
169+
parser_class: str,
170+
params: dict,
171+
images_inner_format: str,
177172
) -> None:
178-
if parser_factory == "ZeroxPDFParser":
173+
if parser_class == ZeroxPDFParser:
179174
try:
180175
import pyzerox # noqa: F401
181176
except ImportError:
@@ -184,7 +179,7 @@ def test_mode_and_image_formats_variations(
184179
image_parser = EmptyImageBlobParser()
185180

186181
_test_matrix(
187-
parser_factory,
182+
parser_class,
188183
params,
189184
mode,
190185
image_parser,
@@ -193,11 +188,11 @@ def test_mode_and_image_formats_variations(
193188

194189

195190
def _test_matrix(
196-
parser_factory: str,
197-
params: dict,
198-
mode: str,
199-
image_parser: BaseImageBlobParser,
200-
images_inner_format: str,
191+
parser_class: Type,
192+
params: dict,
193+
mode: str,
194+
image_parser: BaseImageBlobParser,
195+
images_inner_format: str,
201196
) -> None:
202197
"""Apply the same test for all *standard* PDF parsers.
203198
@@ -245,8 +240,6 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
245240
assert len(docs)
246241
parser.password = old_password
247242

248-
parser_class = _map_parser[parser_factory]
249-
250243
parser = parser_class(
251244
mode=mode,
252245
images_parser=image_parser,
@@ -266,19 +259,19 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
266259
["markdown", "html", "csv", None],
267260
)
268261
@pytest.mark.parametrize(
269-
"parser_factory,params",
262+
"parser_class,params",
270263
[
271-
("PyMuPDFParser", {}),
272-
("ZeroxPDFParser", {"model": "gpt-4o-mini"}),
264+
(PyMuPDFParser, {}),
265+
(ZeroxPDFParser, {"model": "gpt-4o-mini"}),
273266
],
274267
)
275268
def test_parser_with_table(
276-
parser_factory: str,
277-
params: dict,
278-
mode: str,
279-
extract_tables: str,
269+
parser_class: Type,
270+
params: dict,
271+
mode: str,
272+
extract_tables: str,
280273
) -> None:
281-
if parser_factory == "ZeroxPDFParser":
274+
if parser_class == ZeroxPDFParser:
282275
try:
283276
import pyzerox # noqa: F401
284277
except ImportError:
@@ -333,8 +326,6 @@ class EmptyImageBlobParser(BaseImageBlobParser):
333326
def _analyze_image(self, img: Image) -> str:
334327
return "![image](.)"
335328

336-
parser_class = _map_parser[parser_factory]
337-
338329
parser = parser_class(
339330
mode=mode,
340331
extract_tables=extract_tables,

libs/community/tests/integration_tests/document_loaders/test_pdf.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
import os
22
from pathlib import Path
3-
from typing import Sequence, Union
3+
from typing import Sequence, Type, Union
44

55
import pytest
66

7-
import langchain_community.document_loaders as pdf_loaders
8-
from langchain_community.document_loaders import (
7+
from langchain_community.document_loaders.pdf import (
98
AmazonTextractPDFLoader,
109
MathpixPDFLoader,
10+
PDFMinerLoader,
1111
PDFMinerPDFasHTMLLoader,
12+
PyMuPDFLoader,
13+
PyPDFium2Loader,
14+
PyPDFLoader,
1215
UnstructuredPDFLoader,
16+
ZeroxPDFLoader,
1317
)
1418

1519

@@ -164,25 +168,24 @@ def test_amazontextract_loader_failures() -> None:
164168

165169

166170
@pytest.mark.parametrize(
167-
"parser_factory,params",
171+
"loader_class,params",
168172
[
169-
("PDFMinerLoader", {}),
170-
("PyMuPDFLoader", {}),
171-
("PyPDFium2Loader", {}),
172-
("PyPDFLoader", {}),
173-
("ZeroxPDFLoader", {}),
173+
(PDFMinerLoader, {}),
174+
(PyMuPDFLoader, {}),
175+
(PyPDFium2Loader, {}),
176+
(PyPDFLoader, {}),
177+
(ZeroxPDFLoader, {}),
174178
],
175179
)
176180
def test_standard_parameters(
177-
parser_factory: str,
181+
loader_class: Type,
178182
params: dict,
179183
) -> None:
180-
if parser_factory == "ZeroxPDFLoader":
184+
if loader_class == ZeroxPDFLoader:
181185
try:
182186
import pyzerox # noqa: F401
183187
except ImportError:
184188
pytest.skip("pyzerox is valid only with Python +3.11")
185-
loader_class = getattr(pdf_loaders, parser_factory)
186189

187190
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
188191
loader = loader_class(file_path)

0 commit comments

Comments
 (0)