Commit aed772a

cau-git and Ubuntu authored
feat: Threaded PDF pipeline (#1951)
* Initial async pdf pipeline
* UpstreamAwareQueue
* Refactoring into async pipeline primitives and graph
* Cleanups and safety improvements
* Better threaded PDF pipeline
* Pin docling-ibm-models
* Remove unused args
* Add test
* Revise pipeline
* Unload doc backend
* Revert "Unload doc backend" (reverts commit 01066f0)
* Remove redundant method
* Update threaded test
* Stop accumulating docs in test run
* Fix: don't starve on docs with > max_queue_size pages
* Fix: don't starve on docs with > max_queue_size pages
* DCO Remediation Commit: Signed-off-by added for commits fa71cde and d66da87
* Fix: python3.9 compat
* Option to enable threadpool with doc_batch_concurrency setting
* Clean up unused code
* Fix settings defaults expectations
* Use released docling-ibm-models
* Remove ignores for typing/linting

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Ubuntu <[email protected]>
Co-authored-by: Ubuntu <[email protected]>
1 parent aec29a7 commit aed772a

File tree: 9 files changed, +917 −100 lines

docling/datamodel/pipeline_options.py

Lines changed: 15 additions & 0 deletions
```diff
@@ -332,3 +332,18 @@ class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
     ASR = "asr"
+
+
+class ThreadedPdfPipelineOptions(PdfPipelineOptions):
+    """Pipeline options for the threaded PDF pipeline with batching and backpressure control"""
+
+    # Batch sizes for different stages
+    ocr_batch_size: int = 4
+    layout_batch_size: int = 4
+    table_batch_size: int = 4
+
+    # Timing control
+    batch_timeout_seconds: float = 2.0
+
+    # Backpressure and queue control
+    queue_max_size: int = 100
```
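For illustration, a minimal sketch of how these options might be wired up, assuming the threaded pipeline class added elsewhere in this PR is importable as `ThreadedStandardPdfPipeline` and using the existing `PdfFormatOption` plumbing (the input file name is hypothetical):

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline

# Batch sizes trade per-page latency for model throughput; queue_max_size
# bounds how many pages may pile up between stages (backpressure).
opts = ThreadedPdfPipelineOptions(
    layout_batch_size=8,
    table_batch_size=4,
    batch_timeout_seconds=1.0,  # flush a partially filled batch after this long
    queue_max_size=100,
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=ThreadedStandardPdfPipeline,
            pipeline_options=opts,
        )
    }
)
result = converter.convert("example.pdf")  # hypothetical input file
```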

docling/datamodel/settings.py

Lines changed: 7 additions & 12 deletions
```diff
@@ -26,18 +26,13 @@ class DocumentLimits(BaseModel):
 
 
 class BatchConcurrencySettings(BaseModel):
-    doc_batch_size: int = 2
-    doc_batch_concurrency: int = 2
-    page_batch_size: int = 4
-    page_batch_concurrency: int = 2
-    elements_batch_size: int = 16
-
-    # doc_batch_size: int = 1
-    # doc_batch_concurrency: int = 1
-    # page_batch_size: int = 1
-    # page_batch_concurrency: int = 1
-
-    # model_concurrency: int = 2
+    doc_batch_size: int = 1  # Number of documents processed in one batch. Should be >= doc_batch_concurrency
+    doc_batch_concurrency: int = 1  # Number of parallel threads processing documents. Warning: Experimental! No benefit expected without free-threaded python.
+    page_batch_size: int = 4  # Number of pages processed in one batch.
+    page_batch_concurrency: int = 1  # Currently unused.
+    elements_batch_size: int = (
+        16  # Number of elements processed in one batch, in enrichment models.
+    )
 
     # To force models into single core: export OMP_NUM_THREADS=1
```
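A short sketch of opting in to the experimental thread pool at runtime; per the guard added in document_converter.py below, both values must be greater than 1:

```python
from docling.datamodel.settings import settings

# Both must be > 1, otherwise _convert falls back to the sequential path.
# Experimental: little benefit expected without free-threaded Python.
settings.perf.doc_batch_size = 4
settings.perf.doc_batch_concurrency = 2
```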

docling/document_converter.py

Lines changed: 27 additions & 17 deletions
```diff
@@ -4,6 +4,7 @@
 import threading
 import time
 from collections.abc import Iterable, Iterator
+from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Type, Union
@@ -284,24 +285,33 @@ def _convert(
             settings.perf.doc_batch_size,  # pass format_options
         ):
             _log.info("Going to convert document batch...")
+            process_func = partial(
+                self._process_document, raises_on_error=raises_on_error
+            )
 
-            # parallel processing only within input_batch
-            # with ThreadPoolExecutor(
-            #     max_workers=settings.perf.doc_batch_concurrency
-            # ) as pool:
-            #     yield from pool.map(self.process_document, input_batch)
-            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
-
-            for item in map(
-                partial(self._process_document, raises_on_error=raises_on_error),
-                input_batch,
+            if (
+                settings.perf.doc_batch_concurrency > 1
+                and settings.perf.doc_batch_size > 1
             ):
-                elapsed = time.monotonic() - start_time
-                start_time = time.monotonic()
-                _log.info(
-                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                )
-                yield item
+                with ThreadPoolExecutor(
+                    max_workers=settings.perf.doc_batch_concurrency
+                ) as pool:
+                    for item in pool.map(
+                        process_func,
+                        input_batch,
+                    ):
+                        yield item
+            else:
+                for item in map(
+                    process_func,
+                    input_batch,
+                ):
+                    elapsed = time.monotonic() - start_time
+                    start_time = time.monotonic()
+                    _log.info(
+                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                    )
+                    yield item
 
     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
         """Retrieve or initialize a pipeline, reusing instances based on class and options."""
@@ -330,7 +340,7 @@ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
                     f"Reusing cached pipeline for {pipeline_class.__name__} with options hash {options_hash}"
                 )
 
-                return self.initialized_pipelines[cache_key]
+            return self.initialized_pipelines[cache_key]
 
     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
```
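Since `ThreadPoolExecutor.map` yields results in input order, callers observe the same document ordering on both paths; only the per-document timing log is limited to the sequential branch. A minimal usage sketch (file names are hypothetical):

```python
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter

settings.perf.doc_batch_size = 4
settings.perf.doc_batch_concurrency = 4  # enables the ThreadPoolExecutor path

converter = DocumentConverter()
for res in converter.convert_all(["report_a.pdf", "report_b.pdf"]):
    print(res.input.file.name, res.status)
```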

docling/models/layout_model.py

Lines changed: 84 additions & 66 deletions
```diff
@@ -3,7 +3,7 @@
 import warnings
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Union
 
 import numpy as np
 from docling_core.types.doc import DocItemLabel
@@ -148,72 +148,90 @@ def draw_clusters_and_cells_side_by_side(
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-        for page in page_batch:
+        # Convert to list to allow multiple iterations
+        pages = list(page_batch)
+
+        # Separate valid and invalid pages
+        valid_pages = []
+        valid_page_images: List[Union[Image.Image, np.ndarray]] = []
+
+        for page in pages:
             assert page._backend is not None
             if not page._backend.is_valid():
-                yield page
-            else:
-                with TimeRecorder(conv_res, "layout"):
-                    assert page.size is not None
-                    page_image = page.get_image(scale=1.0)
-                    assert page_image is not None
-
-                    clusters = []
-                    for ix, pred_item in enumerate(
-                        self.layout_predictor.predict(page_image)
-                    ):
-                        label = DocItemLabel(
-                            pred_item["label"]
-                            .lower()
-                            .replace(" ", "_")
-                            .replace("-", "_")
-                        )  # Temporary, until docling-ibm-model uses docling-core types
-                        cluster = Cluster(
-                            id=ix,
-                            label=label,
-                            confidence=pred_item["confidence"],
-                            bbox=BoundingBox.model_validate(pred_item),
-                            cells=[],
-                        )
-                        clusters.append(cluster)
-
-                    if settings.debug.visualize_raw_layout:
-                        self.draw_clusters_and_cells_side_by_side(
-                            conv_res, page, clusters, mode_prefix="raw"
-                        )
-
-                    # Apply postprocessing
-
-                    processed_clusters, processed_cells = LayoutPostprocessor(
-                        page, clusters, self.options
-                    ).postprocess()
-                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
-
-                    with warnings.catch_warnings():
-                        warnings.filterwarnings(
-                            "ignore",
-                            "Mean of empty slice|invalid value encountered in scalar divide",
-                            RuntimeWarning,
-                            "numpy",
-                        )
-
-                        conv_res.confidence.pages[page.page_no].layout_score = float(
-                            np.mean([c.confidence for c in processed_clusters])
-                        )
-
-                        conv_res.confidence.pages[page.page_no].ocr_score = float(
-                            np.mean(
-                                [c.confidence for c in processed_cells if c.from_ocr]
-                            )
-                        )
-
-                    page.predictions.layout = LayoutPrediction(
-                        clusters=processed_clusters
-                    )
-
-                    if settings.debug.visualize_layout:
-                        self.draw_clusters_and_cells_side_by_side(
-                            conv_res, page, processed_clusters, mode_prefix="postprocessed"
-                        )
+                continue
 
+            assert page.size is not None
+            page_image = page.get_image(scale=1.0)
+            assert page_image is not None
+
+            valid_pages.append(page)
+            valid_page_images.append(page_image)
+
+        # Process all valid pages with batch prediction
+        batch_predictions = []
+        if valid_page_images:
+            with TimeRecorder(conv_res, "layout"):
+                batch_predictions = self.layout_predictor.predict_batch(  # type: ignore[attr-defined]
+                    valid_page_images
+                )
+
+        # Process each page with its predictions
+        valid_page_idx = 0
+        for page in pages:
+            assert page._backend is not None
+            if not page._backend.is_valid():
                 yield page
+                continue
+
+            page_predictions = batch_predictions[valid_page_idx]
+            valid_page_idx += 1
+
+            clusters = []
+            for ix, pred_item in enumerate(page_predictions):
+                label = DocItemLabel(
+                    pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                )  # Temporary, until docling-ibm-model uses docling-core types
+                cluster = Cluster(
+                    id=ix,
+                    label=label,
+                    confidence=pred_item["confidence"],
+                    bbox=BoundingBox.model_validate(pred_item),
+                    cells=[],
+                )
+                clusters.append(cluster)
+
+            if settings.debug.visualize_raw_layout:
+                self.draw_clusters_and_cells_side_by_side(
+                    conv_res, page, clusters, mode_prefix="raw"
+                )
+
+            # Apply postprocessing
+            processed_clusters, processed_cells = LayoutPostprocessor(
+                page, clusters, self.options
+            ).postprocess()
+            # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
+
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    "Mean of empty slice|invalid value encountered in scalar divide",
+                    RuntimeWarning,
+                    "numpy",
+                )
+
+                conv_res.confidence.pages[page.page_no].layout_score = float(
+                    np.mean([c.confidence for c in processed_clusters])
+                )
+
+                conv_res.confidence.pages[page.page_no].ocr_score = float(
+                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                )
+
+            page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
+
+            if settings.debug.visualize_layout:
+                self.draw_clusters_and_cells_side_by_side(
+                    conv_res, page, processed_clusters, mode_prefix="postprocessed"
+                )
+
+            yield page
```
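The refactor above follows a split/batch/re-merge pattern: invalid pages pass through untouched, valid pages go through a single `predict_batch` call, and predictions are re-aligned to the original page order with a running index. A standalone sketch of the idiom (names are illustrative, not docling API):

```python
from typing import Callable, Iterator, List, Optional, Tuple, TypeVar

T = TypeVar("T")
P = TypeVar("P")

def batch_and_merge(
    items: List[T],
    is_valid: Callable[[T], bool],
    predict_batch: Callable[[List[T]], List[P]],
) -> Iterator[Tuple[T, Optional[P]]]:
    # One batched model call over the valid items only.
    valid = [it for it in items if is_valid(it)]
    preds = predict_batch(valid) if valid else []

    # Re-merge in original order; invalid items pass through with no result.
    idx = 0
    for it in items:
        if not is_valid(it):
            yield it, None
            continue
        yield it, preds[idx]
        idx += 1
```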
