@@ -36,10 +36,7 @@ class PDFParserFn(Protocol):
36
36
"""Protocol for parsing a PDF."""
37
37
38
38
def __call__ (
39
- self ,
40
- path : str | os .PathLike ,
41
- page_size_limit : int | None = None ,
42
- use_block_parsing : bool = False ,
39
+ self , path : str | os .PathLike , page_size_limit : int | None = None , ** kwargs
43
40
) -> ParsedText : ...
44
41
45
42
@@ -50,6 +47,7 @@ def parse_pdf_to_pages(
50
47
path : str | os .PathLike ,
51
48
page_size_limit : int | None = None ,
52
49
use_block_parsing : bool = False ,
50
+ ** _ ,
53
51
) -> ParsedText :
54
52
55
53
with pymupdf .open (path ) as file :
@@ -151,6 +149,7 @@ def parse_text(
151
149
split_lines : bool = False ,
152
150
use_tiktoken : bool = True ,
153
151
page_size_limit : int | None = None ,
152
+ ** _ ,
154
153
) -> ParsedText :
155
154
"""Simple text splitter, can optionally use tiktoken, parse html, or split into newlines.
156
155
@@ -392,15 +391,12 @@ async def read_doc(
392
391
parsed_text : ParsedText = parse_pdf (path , ** parser_kwargs )
393
392
elif str_path .endswith (".txt" ):
394
393
# TODO: Make parse_text async
395
- parser_kwargs .pop ("use_block_parsing" , None ) # Not a parse_text kwarg
396
394
parsed_text = await asyncio .to_thread (parse_text , path , ** parser_kwargs )
397
395
elif str_path .endswith (".html" ):
398
- parser_kwargs .pop ("use_block_parsing" , None ) # Not a parse_text kwarg
399
396
parsed_text = await asyncio .to_thread (
400
397
parse_text , path , html = True , ** parser_kwargs
401
398
)
402
399
else :
403
- parser_kwargs .pop ("use_block_parsing" , None ) # Not a parse_text kwarg
404
400
parsed_text = await asyncio .to_thread (
405
401
parse_text , path , split_lines = True , use_tiktoken = False , ** parser_kwargs
406
402
)
0 commit comments