Skip to content

Commit 0d701f0

Browse files
committed
Fix review
1 parent 97996ea commit 0d701f0

File tree

7 files changed

+265
-352
lines changed

7 files changed

+265
-352
lines changed

docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb

Lines changed: 176 additions & 248 deletions
Large diffs are not rendered by default.

libs/community/langchain_community/document_loaders/__init__.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,6 @@
360360
PyPDFium2Loader,
361361
PyPDFLoader,
362362
UnstructuredPDFLoader,
363-
ZeroxPDFLoader,
364363
)
365364
from langchain_community.document_loaders.pebblo import (
366365
PebbloSafeLoader,
@@ -733,7 +732,6 @@
733732
"YoutubeAudioLoader": "langchain_community.document_loaders.blob_loaders",
734733
"YoutubeLoader": "langchain_community.document_loaders.youtube",
735734
"YuqueLoader": "langchain_community.document_loaders.yuque",
736-
"ZeroxPDFLoader": "langchain_community.document_loaders.pdf",
737735
}
738736

739737

@@ -942,5 +940,4 @@ def __getattr__(name: str) -> Any:
942940
"YoutubeAudioLoader",
943941
"YoutubeLoader",
944942
"YuqueLoader",
945-
"ZeroxPDFLoader",
946943
]

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@
88
import logging
99
import threading
1010
import warnings
11-
from asyncio import AbstractEventLoop
1211
from datetime import datetime
12+
from multiprocessing.pool import ThreadPool
1313
from pathlib import Path
14-
from tempfile import NamedTemporaryFile, TemporaryDirectory
14+
from tempfile import TemporaryDirectory
1515
from typing import (
1616
TYPE_CHECKING,
1717
Any,
@@ -1471,6 +1471,7 @@ def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
14711471

14721472
return extract_from_images_with_rapidocr(images)
14731473

1474+
14741475
_map_extract_tables: Dict[Literal["markdown", "html", None], str] = {
14751476
"markdown": "",
14761477
"html": "But, use html syntax for convert all tables. ",
@@ -1483,6 +1484,7 @@ def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str:
14831484
"describe it. ",
14841485
}
14851486

1487+
14861488
class ZeroxPDFParser(BaseBlobParser):
14871489
"""Parse a blob from a PDF using `py-zerox` library.
14881490
@@ -1532,6 +1534,9 @@ class ZeroxPDFParser(BaseBlobParser):
15321534
print(docs[0].page_content[:100])
15331535
print(docs[0].metadata)
15341536
"""
1537+
1538+
_pool = ThreadPool()
1539+
15351540
_warn_images_to_text = False
15361541
_warn_creator = False
15371542
_prompt = (
@@ -1543,12 +1548,6 @@ class ZeroxPDFParser(BaseBlobParser):
15431548
"Do not exclude any content from the page. "
15441549
)
15451550

1546-
@staticmethod
1547-
def _run_async_from_thread(coro, loop):
1548-
future = asyncio.run_coroutine_threadsafe(coro,
1549-
loop) # Lancer la coroutine dans la boucle existante
1550-
return future.result() # Bloque en attendant le résultat
1551-
15521551
def __init__(
15531552
self,
15541553
mode: Literal["single", "page"] = "page",
@@ -1686,11 +1685,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
16861685
zerox_prompt = PromptTemplate.from_template(
16871686
self.custom_system_prompt
16881687
).format(prompt_tables=prompt_tables, prompt_images=prompt_images)
1689-
# async def toto():
1690-
# await asyncio.sleep(0)
1691-
# return "hello"
1692-
# coro=toto()
1693-
coro=zerox(
1688+
coro = zerox(
16941689
file_path=str(file_path),
16951690
model=self.model,
16961691
cleanup=self.cleanup,
@@ -1703,10 +1698,9 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-ty
17031698
try:
17041699
loop = asyncio.get_running_loop()
17051700

1706-
from multiprocessing.pool import ThreadPool
1707-
pool = ThreadPool(processes=1)
1708-
zerox_output = pool.apply_async(
1709-
lambda : loop.run_until_complete(coro)).get() # tuple of args for foo
1701+
zerox_output = ZeroxPDFParser._pool.apply_async(
1702+
lambda: loop.run_until_complete(coro)
1703+
).get() # block until the pool thread returns the zerox output
17101704

17111705
except RuntimeError:
17121706
zerox_output = asyncio.run(coro)

libs/community/tests/integration_tests/document_loaders/parsers/test_pdf_parsers.py

Lines changed: 51 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -2,48 +2,43 @@
22

33
import re
44
from pathlib import Path
5+
from typing import TYPE_CHECKING, Iterator, Type
56

67
import pytest
7-
from typing import TYPE_CHECKING, Iterator
88

9-
from langchain_community.document_loaders import PDFMinerLoader, PDFPlumberLoader, \
10-
PyMuPDFLoader, PyPDFium2Loader, PyPDFLoader
119
from langchain_community.document_loaders.base import BaseBlobParser
1210
from langchain_community.document_loaders.blob_loaders import Blob
1311
from langchain_community.document_loaders.parsers import (
1412
BaseImageBlobParser,
1513
PDFPlumberParser,
1614
)
17-
from langchain_community.document_loaders.parsers.pdf import ZeroxPDFParser, \
18-
PyMuPDFParser, PDFMinerParser, PyPDFium2Parser, PyPDFParser
15+
from langchain_community.document_loaders.parsers.pdf import (
16+
PDFMinerParser,
17+
PyMuPDFParser,
18+
PyPDFium2Parser,
19+
PyPDFParser,
20+
ZeroxPDFParser,
21+
)
1922

2023
if TYPE_CHECKING:
2124
from PIL.Image import Image
2225

23-
_map_parser = {
24-
'PDFMinerParser': PDFMinerParser,
25-
'PDFPlumberParser': PDFPlumberParser,
26-
'PyMuPDFParser': PyMuPDFParser,
27-
'PyPDFium2Parser': PyPDFium2Parser,
28-
'PyPDFParser': PyPDFParser,
29-
'ZeroxPDFParser': ZeroxPDFParser,
30-
}
3126

3227
# PDFs to test parsers on.
3328
HELLO_PDF = Path(__file__).parent.parent.parent / "examples" / "hello.pdf"
3429

3530
LAYOUT_PARSER_PAPER_PDF = (
36-
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
31+
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
3732
)
3833

3934
LAYOUT_PARSER_PAPER_PASSWORD_PDF = (
40-
Path(__file__).parent.parent.parent
41-
/ "examples"
42-
/ "layout-parser-paper-password.pdf"
35+
Path(__file__).parent.parent.parent
36+
/ "examples"
37+
/ "layout-parser-paper-password.pdf"
4338
)
4439

4540
DUPLICATE_CHARS = (
46-
Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
41+
Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
4742
)
4843

4944

@@ -123,30 +118,30 @@ def _analyze_image(self, img: "Image") -> str:
123118
[("single", EmptyImageBlobParser()), ("page", None)],
124119
)
125120
@pytest.mark.parametrize(
126-
"parser_factory,params",
121+
"parser_class,params",
127122
[
128-
("PDFMinerParser", {}),
129-
("PyMuPDFParser", {}),
130-
("PyPDFium2Parser", {}),
131-
("PyPDFParser", {"extraction_mode": "plain"}),
132-
("PyPDFParser", {"extraction_mode": "layout"}),
133-
("ZeroxPDFParser", {}),
123+
(PDFMinerParser, {}),
124+
(PyMuPDFParser, {}),
125+
(PyPDFium2Parser, {}),
126+
(PyPDFParser, {"extraction_mode": "plain"}),
127+
(PyPDFParser, {"extraction_mode": "layout"}),
128+
(ZeroxPDFParser, {}),
134129
],
135130
)
136131
@pytest.mark.requires("pillow")
137132
def test_mode_and_extract_images_variations(
138-
parser_factory: str,
139-
params: dict,
140-
mode: str,
141-
image_parser: BaseImageBlobParser,
133+
parser_class: Type,
134+
params: dict,
135+
mode: str,
136+
image_parser: BaseImageBlobParser,
142137
) -> None:
143-
if parser_factory == "ZeroxPDFParser":
138+
if parser_class == ZeroxPDFParser:
144139
try:
145140
import pyzerox # noqa: F401
146141
except ImportError:
147142
pytest.skip("py-zerox is valid only with Python +3.11")
148143
_test_matrix(
149-
parser_factory,
144+
parser_class,
150145
params,
151146
mode,
152147
image_parser,
@@ -159,23 +154,23 @@ def test_mode_and_extract_images_variations(
159154
["text", "markdown-img", "html-img"],
160155
)
161156
@pytest.mark.parametrize(
162-
"parser_factory,params",
157+
"parser_class,params",
163158
[
164-
("PDFMinerParser", {}),
165-
("PyMuPDFParser", {}),
166-
("PyPDFium2Parser", {}),
167-
("PyPDFParser", {"extraction_mode": "plain"}),
168-
("PyPDFParser", {"extraction_mode": "layout"}),
169-
("ZeroxPDFParser", {}),
159+
(PDFMinerParser, {}),
160+
(PyMuPDFParser, {}),
161+
(PyPDFium2Parser, {}),
162+
(PyPDFParser, {"extraction_mode": "plain"}),
163+
(PyPDFParser, {"extraction_mode": "layout"}),
164+
(ZeroxPDFParser, {}),
170165
],
171166
)
172167
@pytest.mark.requires("pillow")
173168
def test_mode_and_image_formats_variations(
174-
parser_factory: str,
175-
params: dict,
176-
images_inner_format: str,
169+
parser_class: Type,
170+
params: dict,
171+
images_inner_format: str,
177172
) -> None:
178-
if parser_factory == "ZeroxPDFParser":
173+
if parser_class == ZeroxPDFParser:
179174
try:
180175
import pyzerox # noqa: F401
181176
except ImportError:
@@ -184,7 +179,7 @@ def test_mode_and_image_formats_variations(
184179
image_parser = EmptyImageBlobParser()
185180

186181
_test_matrix(
187-
parser_factory,
182+
parser_class,
188183
params,
189184
mode,
190185
image_parser,
@@ -193,11 +188,11 @@ def test_mode_and_image_formats_variations(
193188

194189

195190
def _test_matrix(
196-
parser_factory: str,
197-
params: dict,
198-
mode: str,
199-
image_parser: BaseImageBlobParser,
200-
images_inner_format: str,
191+
parser_class: Type,
192+
params: dict,
193+
mode: str,
194+
image_parser: BaseImageBlobParser,
195+
images_inner_format: str,
201196
) -> None:
202197
"""Apply the same test for all *standard* PDF parsers.
203198
@@ -245,8 +240,6 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
245240
assert len(docs)
246241
parser.password = old_password
247242

248-
parser_class = _map_parser[parser_factory]
249-
250243
parser = parser_class(
251244
mode=mode,
252245
images_parser=image_parser,
@@ -266,19 +259,19 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
266259
["markdown", "html", "csv", None],
267260
)
268261
@pytest.mark.parametrize(
269-
"parser_factory,params",
262+
"parser_class,params",
270263
[
271-
("PyMuPDFParser", {}),
272-
("ZeroxPDFParser", {"model": "gpt-4o-mini"}),
264+
(PyMuPDFParser, {}),
265+
(ZeroxPDFParser, {"model": "gpt-4o-mini"}),
273266
],
274267
)
275268
def test_parser_with_table(
276-
parser_factory: str,
277-
params: dict,
278-
mode: str,
279-
extract_tables: str,
269+
parser_class: Type,
270+
params: dict,
271+
mode: str,
272+
extract_tables: str,
280273
) -> None:
281-
if parser_factory == "ZeroxPDFParser":
274+
if parser_class == ZeroxPDFParser:
282275
try:
283276
import pyzerox # noqa: F401
284277
except ImportError:
@@ -333,8 +326,6 @@ class EmptyImageBlobParser(BaseImageBlobParser):
333326
def _analyze_image(self, img: Image) -> str:
334327
return "![image](.)"
335328

336-
parser_class = _map_parser[parser_factory]
337-
338329
parser = parser_class(
339330
mode=mode,
340331
extract_tables=extract_tables,

libs/community/tests/integration_tests/document_loaders/test_pdf.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
import os
22
from pathlib import Path
3-
from typing import Sequence, Union
3+
from typing import Sequence, Type, Union
44

55
import pytest
66

7-
import langchain_community.document_loaders as pdf_loaders
8-
from langchain_community.document_loaders import (
7+
from langchain_community.document_loaders.pdf import (
98
AmazonTextractPDFLoader,
109
MathpixPDFLoader,
10+
PDFMinerLoader,
1111
PDFMinerPDFasHTMLLoader,
12+
PyMuPDFLoader,
13+
PyPDFium2Loader,
14+
PyPDFLoader,
1215
UnstructuredPDFLoader,
16+
ZeroxPDFLoader,
1317
)
1418

1519

@@ -164,25 +168,24 @@ def test_amazontextract_loader_failures() -> None:
164168

165169

166170
@pytest.mark.parametrize(
167-
"parser_factory,params",
171+
"loader_class,params",
168172
[
169-
("PDFMinerLoader", {}),
170-
("PyMuPDFLoader", {}),
171-
("PyPDFium2Loader", {}),
172-
("PyPDFLoader", {}),
173-
("ZeroxPDFLoader", {}),
173+
(PDFMinerLoader, {}),
174+
(PyMuPDFLoader, {}),
175+
(PyPDFium2Loader, {}),
176+
(PyPDFLoader, {}),
177+
(ZeroxPDFLoader, {}),
174178
],
175179
)
176180
def test_standard_parameters(
177-
parser_factory: str,
181+
loader_class: Type,
178182
params: dict,
179183
) -> None:
180-
if parser_factory == "ZeroxPDFLoader":
184+
if loader_class == ZeroxPDFLoader:
181185
try:
182186
import pyzerox # noqa: F401
183187
except ImportError:
184188
pytest.skip("pyzerox is valid only with Python +3.11")
185-
loader_class = getattr(pdf_loaders, parser_factory)
186189

187190
file_path = Path(__file__).parent.parent / "examples/hello.pdf"
188191
loader = loader_class(file_path)

0 commit comments

Comments
 (0)