22
33import re
44from pathlib import Path
5+ from typing import TYPE_CHECKING , Iterator , Type
56
67import pytest
7- from typing import TYPE_CHECKING , Iterator
88
9- from langchain_community .document_loaders import PDFMinerLoader , PDFPlumberLoader , \
10- PyMuPDFLoader , PyPDFium2Loader , PyPDFLoader
119from langchain_community .document_loaders .base import BaseBlobParser
1210from langchain_community .document_loaders .blob_loaders import Blob
1311from langchain_community .document_loaders .parsers import (
1412 BaseImageBlobParser ,
1513 PDFPlumberParser ,
1614)
17- from langchain_community .document_loaders .parsers .pdf import ZeroxPDFParser , \
18- PyMuPDFParser , PDFMinerParser , PyPDFium2Parser , PyPDFParser
15+ from langchain_community .document_loaders .parsers .pdf import (
16+ PDFMinerParser ,
17+ PyMuPDFParser ,
18+ PyPDFium2Parser ,
19+ PyPDFParser ,
20+ ZeroxPDFParser ,
21+ )
1922
2023if TYPE_CHECKING :
2124 from PIL .Image import Image
2225
23- _map_parser = {
24- 'PDFMinerParser' : PDFMinerParser ,
25- 'PDFPlumberParser' : PDFPlumberParser ,
26- 'PyMuPDFParser' : PyMuPDFParser ,
27- 'PyPDFium2Parser' : PyPDFium2Parser ,
28- 'PyPDFParser' : PyPDFParser ,
29- 'ZeroxPDFParser' : ZeroxPDFParser ,
30- }
3126
3227# PDFs to test parsers on.
3328HELLO_PDF = Path (__file__ ).parent .parent .parent / "examples" / "hello.pdf"
3429
3530LAYOUT_PARSER_PAPER_PDF = (
36- Path (__file__ ).parent .parent .parent / "examples" / "layout-parser-paper.pdf"
31+ Path (__file__ ).parent .parent .parent / "examples" / "layout-parser-paper.pdf"
3732)
3833
3934LAYOUT_PARSER_PAPER_PASSWORD_PDF = (
40- Path (__file__ ).parent .parent .parent
41- / "examples"
42- / "layout-parser-paper-password.pdf"
35+ Path (__file__ ).parent .parent .parent
36+ / "examples"
37+ / "layout-parser-paper-password.pdf"
4338)
4439
4540DUPLICATE_CHARS = (
46- Path (__file__ ).parent .parent .parent / "examples" / "duplicate-chars.pdf"
41+ Path (__file__ ).parent .parent .parent / "examples" / "duplicate-chars.pdf"
4742)
4843
4944
@@ -123,30 +118,30 @@ def _analyze_image(self, img: "Image") -> str:
123118 [("single" , EmptyImageBlobParser ()), ("page" , None )],
124119)
125120@pytest .mark .parametrize (
126- "parser_factory ,params" ,
121+ "parser_class ,params" ,
127122 [
128- (" PDFMinerParser" , {}),
129- (" PyMuPDFParser" , {}),
130- (" PyPDFium2Parser" , {}),
131- (" PyPDFParser" , {"extraction_mode" : "plain" }),
132- (" PyPDFParser" , {"extraction_mode" : "layout" }),
133- (" ZeroxPDFParser" , {}),
123+ (PDFMinerParser , {}),
124+ (PyMuPDFParser , {}),
125+ (PyPDFium2Parser , {}),
126+ (PyPDFParser , {"extraction_mode" : "plain" }),
127+ (PyPDFParser , {"extraction_mode" : "layout" }),
128+ (ZeroxPDFParser , {}),
134129 ],
135130)
136131@pytest .mark .requires ("pillow" )
137132def test_mode_and_extract_images_variations (
138- parser_factory : str ,
139- params : dict ,
140- mode : str ,
141- image_parser : BaseImageBlobParser ,
133+ parser_class : Type ,
134+ params : dict ,
135+ mode : str ,
136+ image_parser : BaseImageBlobParser ,
142137) -> None :
143- if parser_factory == " ZeroxPDFParser" :
138+ if parser_class == ZeroxPDFParser :
144139 try :
145140 import pyzerox # noqa: F401
146141 except ImportError :
147142 pytest .skip ("py-zerox is valid only with Python +3.11" )
148143 _test_matrix (
149- parser_factory ,
144+ parser_class ,
150145 params ,
151146 mode ,
152147 image_parser ,
@@ -159,23 +154,23 @@ def test_mode_and_extract_images_variations(
159154 ["text" , "markdown-img" , "html-img" ],
160155)
161156@pytest .mark .parametrize (
162- "parser_factory ,params" ,
157+ "parser_class ,params" ,
163158 [
164- (" PDFMinerParser" , {}),
165- (" PyMuPDFParser" , {}),
166- (" PyPDFium2Parser" , {}),
167- (" PyPDFParser" , {"extraction_mode" : "plain" }),
168- (" PyPDFParser" , {"extraction_mode" : "layout" }),
169- (" ZeroxPDFParser" , {}),
159+ (PDFMinerParser , {}),
160+ (PyMuPDFParser , {}),
161+ (PyPDFium2Parser , {}),
162+ (PyPDFParser , {"extraction_mode" : "plain" }),
163+ (PyPDFParser , {"extraction_mode" : "layout" }),
164+ (ZeroxPDFParser , {}),
170165 ],
171166)
172167@pytest .mark .requires ("pillow" )
173168def test_mode_and_image_formats_variations (
174- parser_factory : str ,
175- params : dict ,
176- images_inner_format : str ,
169+ parser_class : Type ,
170+ params : dict ,
171+ images_inner_format : str ,
177172) -> None :
178- if parser_factory == " ZeroxPDFParser" :
173+ if parser_class == ZeroxPDFParser :
179174 try :
180175 import pyzerox # noqa: F401
181176 except ImportError :
@@ -184,7 +179,7 @@ def test_mode_and_image_formats_variations(
184179 image_parser = EmptyImageBlobParser ()
185180
186181 _test_matrix (
187- parser_factory ,
182+ parser_class ,
188183 params ,
189184 mode ,
190185 image_parser ,
@@ -193,11 +188,11 @@ def test_mode_and_image_formats_variations(
193188
194189
195190def _test_matrix (
196- parser_factory : str ,
197- params : dict ,
198- mode : str ,
199- image_parser : BaseImageBlobParser ,
200- images_inner_format : str ,
191+ parser_class : Type ,
192+ params : dict ,
193+ mode : str ,
194+ image_parser : BaseImageBlobParser ,
195+ images_inner_format : str ,
201196) -> None :
202197 """Apply the same test for all *standard* PDF parsers.
203198
@@ -245,8 +240,6 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
245240 assert len (docs )
246241 parser .password = old_password
247242
248- parser_class = _map_parser [parser_factory ]
249-
250243 parser = parser_class (
251244 mode = mode ,
252245 images_parser = image_parser ,
@@ -266,19 +259,19 @@ def _std_assert_with_parser(parser: BaseBlobParser) -> None:
266259 ["markdown" , "html" , "csv" , None ],
267260)
268261@pytest .mark .parametrize (
269- "parser_factory ,params" ,
262+ "parser_class ,params" ,
270263 [
271- (" PyMuPDFParser" , {}),
272- (" ZeroxPDFParser" , {"model" : "gpt-4o-mini" }),
264+ (PyMuPDFParser , {}),
265+ (ZeroxPDFParser , {"model" : "gpt-4o-mini" }),
273266 ],
274267)
275268def test_parser_with_table (
276- parser_factory : str ,
277- params : dict ,
278- mode : str ,
279- extract_tables : str ,
269+ parser_class : Type ,
270+ params : dict ,
271+ mode : str ,
272+ extract_tables : str ,
280273) -> None :
281- if parser_factory == " ZeroxPDFParser" :
274+ if parser_class == ZeroxPDFParser :
282275 try :
283276 import pyzerox # noqa: F401
284277 except ImportError :
@@ -333,8 +326,6 @@ class EmptyImageBlobParser(BaseImageBlobParser):
333326 def _analyze_image (self , img : Image ) -> str :
334327 return ""
335328
336- parser_class = _map_parser [parser_factory ]
337-
338329 parser = parser_class (
339330 mode = mode ,
340331 extract_tables = extract_tables ,
0 commit comments