docling-project
diff --git a/‎README.md‎
Lines changed: 39 additions & 0 deletions b/‎README.md‎
Lines changed: 39 additions & 0 deletions
diff --git a/‎app/pybind_parse.cpp‎
Lines changed: 132 additions & 0 deletions b/‎app/pybind_parse.cpp‎
Lines changed: 132 additions & 0 deletions
diff --git a/‎docling_parse/pdf_parser.py‎
Lines changed: 122 additions & 0 deletions b/‎docling_parse/pdf_parser.py‎
Lines changed: 122 additions & 0 deletions
@@ -86,6 +86,45 @@ for page_no, pred_page in pdf_doc.iterate_pages():
     img.show()
 ```
 
+### Parallel parsing (multi-threaded)
+
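+Parse pages from one or more PDFs in parallel using a thread pool. Backpressure is controlled by `max_concurrent_results`: workers pause once that many decoded results are buffered and waiting to be consumed: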
+
+```python
+from docling_parse.pdf_parser import (
+    DoclingThreadedPdfParser,
+    ThreadedPdfParserConfig,
+)
+from docling_parse.pdf_parsers import DecodePageConfig  # type: ignore[import]
+
+parser_config = ThreadedPdfParserConfig(
+    loglevel="fatal",
+    threads=4,                # worker threads
+    max_concurrent_results=32 # cap buffered results to limit memory
+)
+decode_config = DecodePageConfig()
+
+parser = DoclingThreadedPdfParser(
+    parser_config=parser_config,
+    decode_config=decode_config,
+)
+
+# load one or more documents
+for source in ["doc_a.pdf", "doc_b.pdf"]:
+    parser.load(source)
+
+# consume decoded pages as they become available
+while parser.has_tasks():
+    task = parser.get_task()
+
+    if task.success:
+        page_decoder, timings = task.get()
+        print(f"{task.doc_key} p{task.page_number}: "
+              f"{len(list(page_decoder.get_word_cells()))} words")
+    else:
+        print(f"error on {task.doc_key} p{task.page_number}: {task.error()}")
+```
+
 Use the CLI
 
 ```sh
 
@@ -8,6 +8,7 @@
 #include <pybind/utils/pybind11_json.h>
 
 #include <pybind/docling_parser.h>
+#include <pybind/docling_threaded_parser.h>
 
 // Include parse headers for typed bindings
 #include <parse.h>
@@ -533,4 +534,135 @@ PYBIND11_MODULE(pdf_parsers, m) {
 
     Returns:
         PdfPageDecoder: A typed page decoder object.)");
+
+  // ============= Threaded PDF Parser =============
+
+  // PageDecodeResult - outcome of a single page decode done by the threaded parser.
+  // Exposes doc_key / page_number / success as read-only attributes, plus get()
+  // for the decoded page and error() for the failure message.
+  pybind11::class_<docling::page_decode_result>(m, "PageDecodeResult",
+    R"(
+    Result of a threaded page decoding task.
+
+    Attributes:
+        doc_key (str): The document key this page belongs to.
+        page_number (int): The page number (0-indexed).
+        success (bool): Whether the decoding succeeded.
+    )")
+    .def_readonly("doc_key", &docling::page_decode_result::doc_key)
+    .def_readonly("page_number", &docling::page_decode_result::page_number)
+    .def_readonly("success", &docling::page_decode_result::success)
+    .def("get", [](docling::page_decode_result& self)
+         -> std::pair<std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>>,
+                      std::unordered_map<std::string, double>> {
+           if(!self.success)
+             {
+               throw std::runtime_error("Cannot get result from failed task: " + self.error_message);
+             }
+           // Guard the dereference below: raising here surfaces as a Python
+           // exception, whereas dereferencing a null decoder would crash the
+           // interpreter.
+           if(!self.page_decoder)
+             {
+               throw std::runtime_error("Cannot get result: task succeeded but holds no page decoder");
+             }
+           auto timings_map = self.page_decoder->get_timings().to_sum_map();
+           return std::make_pair(self.page_decoder, timings_map);
+         },
+         R"(
+    Get the page decoder and timing information.
+
+    Returns:
+        Tuple[PdfPageDecoder, Dict[str, float]]: The page decoder and timing data.
+
+    Raises:
+        RuntimeError: If the task was not successful or holds no page decoder.)")
+    .def("error", [](docling::page_decode_result& self) -> std::string {
+           // Returned as-is; the message is only meaningful when success is false.
+           return self.error_message;
+         },
+         R"(
+    Get the error message if the task failed.
+
+    Returns:
+        str: The error message.)");
+
+  // threaded_pdf_parser - multi-document parser whose pages are decoded by a
+  // thread pool; finished pages are handed out through a bounded result queue
+  using threaded_parser_type = docling::docling_threaded_parser;
+
+  pybind11::class_<threaded_parser_type>(m, "threaded_pdf_parser",
+    R"(
+    Threaded PDF parser that processes pages in parallel.
+
+    Loads multiple documents and decodes their pages using a thread pool.
+    Results are available via a bounded queue to control memory usage.
+    )")
+    // constructor: (loglevel, num_threads, max_concurrent_results, config)
+    .def(pybind11::init<const std::string&, int, int, pdflib::decode_page_config>(),
+         pybind11::arg("loglevel") = "fatal",
+         pybind11::arg("num_threads") = 4,
+         pybind11::arg("max_concurrent_results") = 32,
+         pybind11::arg("config") = pdflib::decode_page_config(),
+         R"(
+    Construct a threaded PDF parser.
+
+    Parameters:
+        loglevel (str): Logging level ('fatal', 'error', 'warning', 'info').
+        num_threads (int): Number of worker threads.
+        max_concurrent_results (int): Maximum results buffered before workers pause.
+        config (DecodePageConfig): Configuration for page decoding.)")
+
+    // register a document that lives on disk
+    .def("load_document",
+         [](threaded_parser_type& parser,
+            const std::string& doc_key,
+            const std::string& path,
+            std::optional<std::string>& pwd) -> bool
+         {
+           const bool loaded = parser.load_document(doc_key, path, pwd);
+           return loaded;
+         },
+         pybind11::arg("key"),
+         pybind11::arg("filename"),
+         pybind11::arg("password") = pybind11::none(),
+         R"(
+    Load a document by key and filename.
+
+    Parameters:
+        key (str): The unique key to identify the document.
+        filename (str): The path to the document file to load.
+        password (str, optional): Optional password for password-protected files.
+
+    Returns:
+        bool: True if the document was successfully loaded.)")
+
+    // register a document held in a Python BytesIO-like object
+    .def("load_document_from_bytesio",
+         [](threaded_parser_type& parser,
+            const std::string& doc_key,
+            pybind11::object stream,
+            std::optional<std::string>& pwd) -> bool
+         {
+           const bool loaded = parser.load_document_from_bytesio(doc_key, stream, pwd);
+           return loaded;
+         },
+         pybind11::arg("key"),
+         pybind11::arg("bytes_io"),
+         pybind11::arg("password") = pybind11::none(),
+         R"(
+    Load a document from a BytesIO-like object.
+
+    Parameters:
+        key (str): The unique key to identify the document.
+        bytes_io (Any): A BytesIO-like object containing the document data.
+        password (str, optional): Optional password for password-protected files.
+
+    Returns:
+        bool: True if the document was successfully loaded.)")
+
+    // query whether results remain to be consumed
+    .def("has_tasks",
+         [](threaded_parser_type& parser) -> bool
+         {
+           const bool pending = parser.has_tasks();
+           return pending;
+         },
+         R"(
+    Check if there are remaining tasks to consume.
+
+    On first call, builds the task queue from all loaded documents and starts worker threads.
+
+    Returns:
+        bool: True if there are remaining results to consume.)")
+
+    // fetch the next finished page
+    .def("get_task",
+         [](threaded_parser_type& parser) -> docling::page_decode_result
+         {
+           // The GIL is dropped for the duration of the call and re-acquired
+           // when `unlocked` goes out of scope.
+           pybind11::gil_scoped_release unlocked;
+           return parser.get_task();
+         },
+         R"(
+    Get the next completed page decode result.
+
+    Blocks until a result is available. Releases the GIL while waiting.
+
+    Returns:
+        PageDecodeResult: The result of a page decoding task.)");
 }
@@ -29,7 +29,9 @@
 from pydantic import BaseModel, ConfigDict
 
 from docling_parse.pdf_parsers import DecodePageConfig  # type: ignore[import]
+from docling_parse.pdf_parsers import PageDecodeResult  # type: ignore[import]
 from docling_parse.pdf_parsers import pdf_parser  # type: ignore[import]
+from docling_parse.pdf_parsers import threaded_pdf_parser  # type: ignore[import]
 from docling_parse.pdf_parsers import (  # type: ignore[import]
     TIMING_KEY_CREATE_LINE_CELLS,
     TIMING_KEY_CREATE_WORD_CELLS,
@@ -854,3 +856,123 @@ def _load_document_from_bytesio(self, key: str, data: BytesIO) -> bool:
              bool: True if the document was successfully loaded, False otherwise.)")
         """
         return self.parser.load_document_from_bytesio(key=key, bytes_io=data)
+
+
+class ThreadedPdfParserConfig(BaseModel):
+    """Settings passed to the native threaded parser by `DoclingThreadedPdfParser`.
+
+    Attributes:
+        loglevel: Logging level of the native parser; one of 'fatal', 'error',
+            'warning' or 'info'.
+        threads: Number of worker threads used for parallel page decoding.
+        max_concurrent_results: Maximum number of decoded results held in the
+            buffer; workers pause once this many are waiting.
+    """
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    loglevel: str = "fatal"  # forwarded as `loglevel`
+    threads: int = 4  # forwarded as `num_threads`
+    max_concurrent_results: int = 32  # forwarded as `max_concurrent_results`
+
+
+class DoclingThreadedPdfParser:
+    """Threaded PDF parser that decodes pages from multiple documents in parallel.
+
+    Usage::
+
+        parser_config = ThreadedPdfParserConfig(loglevel="fatal", threads=4, max_concurrent_results=32)
+        decode_config = DecodePageConfig()
+
+        parser = DoclingThreadedPdfParser(parser_config=parser_config, decode_config=decode_config)
+
+        for source in sources:
+            parser.load(source)
+
+        while parser.has_tasks():
+            task = parser.get_task()
+
+            if task.success:
+                page_decoder, timings = task.get()
+            else:
+                error_msg = task.error()
+    """
+
+    def __init__(
+        self,
+        parser_config: Optional[ThreadedPdfParserConfig] = None,
+        decode_config: Optional[DecodePageConfig] = None,
+    ):
+        if parser_config is None:
+            parser_config = ThreadedPdfParserConfig()
+        if decode_config is None:
+            decode_config = DecodePageConfig()
+
+        self._parser = threaded_pdf_parser(
+            loglevel=parser_config.loglevel,
+            num_threads=parser_config.threads,
+            max_concurrent_results=parser_config.max_concurrent_results,
+            config=decode_config,
+        )
+
+    def load(
+        self,
+        path_or_stream: Union[str, Path, BytesIO],
+        password: Optional[str] = None,
+    ) -> str:
+        """Load a document for parallel processing.
+
+        Parameters:
+            path_or_stream: File path or BytesIO object.
+            password: Optional password for protected files.
+
+        Returns:
+            str: The document key.
+        """
+        if isinstance(path_or_stream, str):
+            path_or_stream = Path(path_or_stream)
+
+        if isinstance(path_or_stream, Path):
+            key = f"key={str(path_or_stream)}"
+            success = self._parser.load_document(
+                key=key, filename=str(path_or_stream).encode("utf8"), password=password
+            )
+        elif isinstance(path_or_stream, BytesIO):
+            hasher = hashlib.sha256(usedforsecurity=False)
+            while chunk := path_or_stream.read(8192):
+                hasher.update(chunk)
+            path_or_stream.seek(0)
+            hash_val = hasher.hexdigest()
+
+            key = f"key={hash_val}"
+            success = self._parser.load_document_from_bytesio(
+                key=key, bytes_io=path_or_stream, password=password
+            )
+        else:
+            raise TypeError(
+                f"Expected str, Path, or BytesIO, got {type(path_or_stream)}"
+            )
+
+        if not success:
+            raise RuntimeError(f"Failed to load document with key {key}")
+
+        return key
+
+    def has_tasks(self) -> bool:
+        """Check if there are remaining tasks to consume.
+
+        On first call, builds the task queue and starts worker threads.
+
+        Returns:
+            bool: True if there are remaining results to consume.
+        """
+        return self._parser.has_tasks()
+
+    def get_task(self) -> "PageDecodeResult":
+        """Get the next completed page decode result.
+
+        Blocks until a result is available.
+
+        Returns:
+            PageDecodeResult: The result with doc_key, page_number, success flag.
+                Use task.get() to get (PdfPageDecoder, timings) or task.error() for error message.
+        """
+        return self._parser.get_task()