Supporting more kwargs in ParsingSettings.parse_pdf (#999)

jamesbraza · web-flow · commit 9d44782af90f · 2025-07-08T18:00:03.000-07:00
diff --git a/paperqa/readers.py b/paperqa/readers.py
@@ -36,10 +36,7 @@ class PDFParserFn(Protocol):
     """Protocol for parsing a PDF."""
 
     def __call__(
-        self,
-        path: str | os.PathLike,
-        page_size_limit: int | None = None,
-        use_block_parsing: bool = False,
+        self, path: str | os.PathLike, page_size_limit: int | None = None, **kwargs
     ) -> ParsedText: ...
 
 
@@ -50,6 +47,7 @@ def parse_pdf_to_pages(
     path: str | os.PathLike,
     page_size_limit: int | None = None,
     use_block_parsing: bool = False,
+    **_,
 ) -> ParsedText:
 
     with pymupdf.open(path) as file:
@@ -151,6 +149,7 @@ def parse_text(
     split_lines: bool = False,
     use_tiktoken: bool = True,
     page_size_limit: int | None = None,
+    **_,
 ) -> ParsedText:
     """Simple text splitter, can optionally use tiktoken, parse html, or split into newlines.
 
@@ -392,15 +391,12 @@ async def read_doc(
         parsed_text: ParsedText = parse_pdf(path, **parser_kwargs)
     elif str_path.endswith(".txt"):
         # TODO: Make parse_text async
-        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(parse_text, path, **parser_kwargs)
     elif str_path.endswith(".html"):
-        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(
             parse_text, path, html=True, **parser_kwargs
         )
     else:
-        parser_kwargs.pop("use_block_parsing", None)  # Not a parse_text kwarg
         parsed_text = await asyncio.to_thread(
             parse_text, path, split_lines=True, use_tiktoken=False, **parser_kwargs
         )