Skip to content

Commit 9d44782

Browse files
authored
Supporting more kwargs in ParsingSettings.parse_pdf (#999)
1 parent 2bea091 commit 9d44782

File tree

1 file changed: 3 additions and 7 deletions

1 file changed

+3
-7
lines changed

paperqa/readers.py

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -36,10 +36,7 @@ class PDFParserFn(Protocol):
3636
"""Protocol for parsing a PDF."""
3737

3838
def __call__(
39-
self,
40-
path: str | os.PathLike,
41-
page_size_limit: int | None = None,
42-
use_block_parsing: bool = False,
39+
self, path: str | os.PathLike, page_size_limit: int | None = None, **kwargs
4340
) -> ParsedText: ...
4441

4542

@@ -50,6 +47,7 @@ def parse_pdf_to_pages(
5047
path: str | os.PathLike,
5148
page_size_limit: int | None = None,
5249
use_block_parsing: bool = False,
50+
**_,
5351
) -> ParsedText:
5452

5553
with pymupdf.open(path) as file:
@@ -151,6 +149,7 @@ def parse_text(
151149
split_lines: bool = False,
152150
use_tiktoken: bool = True,
153151
page_size_limit: int | None = None,
152+
**_,
154153
) -> ParsedText:
155154
"""Simple text splitter, can optionally use tiktoken, parse html, or split into newlines.
156155
@@ -392,15 +391,12 @@ async def read_doc(
392391
parsed_text: ParsedText = parse_pdf(path, **parser_kwargs)
393392
elif str_path.endswith(".txt"):
394393
# TODO: Make parse_text async
395-
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
396394
parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
397395
elif str_path.endswith(".html"):
398-
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
399396
parsed_text = await asyncio.to_thread(
400397
parse_text, path, html=True, **parser_kwargs
401398
)
402399
else:
403-
parser_kwargs.pop("use_block_parsing", None) # Not a parse_text kwarg
404400
parsed_text = await asyncio.to_thread(
405401
parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
406402
)

0 commit comments

Comments (0)