|
8 | 8 | #include <pybind/utils/pybind11_json.h> |
9 | 9 |
|
10 | 10 | #include <pybind/docling_parser.h> |
| 11 | +#include <pybind/docling_threaded_parser.h> |
11 | 12 |
|
12 | 13 | // Include parse headers for typed bindings |
13 | 14 | #include <parse.h> |
@@ -533,4 +534,135 @@ PYBIND11_MODULE(pdf_parsers, m) { |
533 | 534 |
|
534 | 535 | Returns: |
535 | 536 | PdfPageDecoder: A typed page decoder object.)"); |
| 537 | + |
| 538 | + // ============= Threaded PDF Parser ============= |
| 539 | + |
| 540 | + // PageDecodeResult - result of a threaded page decode task |
| 541 | + pybind11::class_<docling::page_decode_result>(m, "PageDecodeResult", |
| 542 | + R"( |
| 543 | + Result of a threaded page decoding task. |
| 544 | +
|
| 545 | + Attributes: |
| 546 | + doc_key (str): The document key this page belongs to. |
| 547 | + page_number (int): The page number (0-indexed). |
| 548 | + success (bool): Whether the decoding succeeded. |
| 549 | + )") |
| 550 | + .def_readonly("doc_key", &docling::page_decode_result::doc_key) |
| 551 | + .def_readonly("page_number", &docling::page_decode_result::page_number) |
| 552 | + .def_readonly("success", &docling::page_decode_result::success) |
| 553 | + .def("get", [](docling::page_decode_result& self) |
| 554 | + -> std::pair<std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>>, |
| 555 | + std::unordered_map<std::string, double>> { |
| 556 | + if(!self.success) |
| 557 | + { |
| 558 | + throw std::runtime_error("Cannot get result from failed task: " + self.error_message); |
| 559 | + } |
| 560 | + auto timings_map = self.page_decoder->get_timings().to_sum_map(); |
| 561 | + return std::make_pair(self.page_decoder, timings_map); |
| 562 | + }, |
| 563 | + R"( |
| 564 | + Get the page decoder and timing information. |
| 565 | +
|
| 566 | + Returns: |
| 567 | + Tuple[PdfPageDecoder, Dict[str, float]]: The page decoder and timing data. |
| 568 | +
|
| 569 | + Raises: |
| 570 | + RuntimeError: If the task was not successful.)") |
| 571 | + .def("error", [](docling::page_decode_result& self) -> std::string { |
| 572 | + return self.error_message; |
| 573 | + }, |
| 574 | + R"( |
| 575 | + Get the error message if the task failed. |
| 576 | +
|
| 577 | + Returns: |
| 578 | + str: The error message.)"); |
| 579 | + |
| 580 | + // threaded_pdf_parser - parallel PDF parser with bounded result queue |
| 581 | + pybind11::class_<docling::docling_threaded_parser>(m, "threaded_pdf_parser", |
| 582 | + R"( |
| 583 | + Threaded PDF parser that processes pages in parallel. |
| 584 | +
|
| 585 | + Loads multiple documents and decodes their pages using a thread pool. |
| 586 | + Results are available via a bounded queue to control memory usage. |
| 587 | + )") |
| 588 | + .def(pybind11::init<const std::string&, int, int, pdflib::decode_page_config>(), |
| 589 | + pybind11::arg("loglevel") = "fatal", |
| 590 | + pybind11::arg("num_threads") = 4, |
| 591 | + pybind11::arg("max_concurrent_results") = 32, |
| 592 | + pybind11::arg("config") = pdflib::decode_page_config(), |
| 593 | + R"( |
| 594 | + Construct a threaded PDF parser. |
| 595 | +
|
| 596 | + Parameters: |
| 597 | + loglevel (str): Logging level ('fatal', 'error', 'warning', 'info'). |
| 598 | + num_threads (int): Number of worker threads. |
| 599 | + max_concurrent_results (int): Maximum results buffered before workers pause. |
| 600 | + config (DecodePageConfig): Configuration for page decoding.)") |
| 601 | + |
| 602 | + .def("load_document", |
| 603 | + [](docling::docling_threaded_parser& self, |
| 604 | + const std::string& key, |
| 605 | + const std::string& filename, |
| 606 | + std::optional<std::string>& password) -> bool { |
| 607 | + return self.load_document(key, filename, password); |
| 608 | + }, |
| 609 | + pybind11::arg("key"), |
| 610 | + pybind11::arg("filename"), |
| 611 | + pybind11::arg("password") = pybind11::none(), |
| 612 | + R"( |
| 613 | + Load a document by key and filename. |
| 614 | +
|
| 615 | + Parameters: |
| 616 | + key (str): The unique key to identify the document. |
| 617 | + filename (str): The path to the document file to load. |
| 618 | + password (str, optional): Optional password for password-protected files. |
| 619 | +
|
| 620 | + Returns: |
| 621 | + bool: True if the document was successfully loaded.)") |
| 622 | + |
| 623 | + .def("load_document_from_bytesio", |
| 624 | + [](docling::docling_threaded_parser& self, |
| 625 | + const std::string& key, |
| 626 | + pybind11::object bytes_io, |
| 627 | + std::optional<std::string>& password) -> bool { |
| 628 | + return self.load_document_from_bytesio(key, bytes_io, password); |
| 629 | + }, |
| 630 | + pybind11::arg("key"), |
| 631 | + pybind11::arg("bytes_io"), |
| 632 | + pybind11::arg("password") = pybind11::none(), |
| 633 | + R"( |
| 634 | + Load a document from a BytesIO-like object. |
| 635 | +
|
| 636 | + Parameters: |
| 637 | + key (str): The unique key to identify the document. |
| 638 | + bytes_io (Any): A BytesIO-like object containing the document data. |
| 639 | + password (str, optional): Optional password for password-protected files. |
| 640 | +
|
| 641 | + Returns: |
| 642 | + bool: True if the document was successfully loaded.)") |
| 643 | + |
| 644 | + .def("has_tasks", |
| 645 | + [](docling::docling_threaded_parser& self) -> bool { |
| 646 | + return self.has_tasks(); |
| 647 | + }, |
| 648 | + R"( |
| 649 | + Check if there are remaining tasks to consume. |
| 650 | +
|
| 651 | + On first call, builds the task queue from all loaded documents and starts worker threads. |
| 652 | +
|
| 653 | + Returns: |
| 654 | + bool: True if there are remaining results to consume.)") |
| 655 | + |
| 656 | + .def("get_task", |
| 657 | + [](docling::docling_threaded_parser& self) -> docling::page_decode_result { |
| 658 | + pybind11::gil_scoped_release release; |
| 659 | + return self.get_task(); |
| 660 | + }, |
| 661 | + R"( |
| 662 | + Get the next completed page decode result. |
| 663 | +
|
| 664 | + Blocks until a result is available. Releases the GIL while waiting. |
| 665 | +
|
| 666 | + Returns: |
| 667 | + PageDecodeResult: The result of a page decoding task.)"); |
536 | 668 | } |
0 commit comments