docling-project · nikos-livathinos · Dec 16, 2025 · Dec 11, 2025 · Dec 11, 2025 · Dec 11, 2025
diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
@@ -641,6 +641,7 @@ def evaluate(
     split: str = "test",
     cvat_overview_path: Optional[Path] = None,
     external_predictions_path: Optional[Path] = None,
+    concurrency: int = 4,
 ) -> Optional[DatasetEvaluationType]:
     """Evaluate predictions against ground truth."""
     if not os.path.exists(idir):
@@ -673,17 +674,16 @@ def evaluate(
             # label_filtering_strategy=LabelFilteringStrategy.INTERSECTION,
             page_mapping_path=cvat_overview_path,
         )
-        evaluation = layout_evaluator(  # type: ignore
+        layout_evaluation = layout_evaluator(  # type: ignore
             idir,
             split=split,
             external_predictions_path=external_predictions_path,
         )
-
         with open(save_fn, "w") as fd:
-            json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
+            json.dump(layout_evaluation.model_dump(), fd, indent=2, sort_keys=True)
 
         # Evaluate with the pixel-wise layout evaluation
-        pixel_layout_evaluator = PixelLayoutEvaluator()
+        pixel_layout_evaluator = PixelLayoutEvaluator(concurrency=concurrency)
         pixel_ds_evaluation: DatasetPixelLayoutEvaluation = pixel_layout_evaluator(
             idir,
             split=split,
@@ -696,6 +696,9 @@ def evaluate(
             pixel_save_root,
         )
 
+        # TODO: Redesign evaluate() to return both layout and pixel evaluations
+        evaluation = pixel_ds_evaluation  # type: ignore
+
     elif modality == EvaluationModality.TABLE_STRUCTURE:
         table_evaluator = TableEvaluator()
         evaluation = table_evaluator(  # type: ignore
@@ -764,7 +767,7 @@ def evaluate(
             )
 
     elif modality == EvaluationModality.MARKDOWN_TEXT:
-        md_evaluator = MarkdownTextEvaluator()
+        md_evaluator = MarkdownTextEvaluator(concurrency=concurrency)
         evaluation = md_evaluator(  # type: ignore
             idir,
             split=split,
@@ -823,8 +826,8 @@ def evaluate(
 def visualize(
     modality: EvaluationModality,
     benchmark: BenchMarkNames,
-    idir: Path,
     odir: Path,
+    idir: Path | None = None,
     split: str = "test",
 ):
     """
@@ -839,10 +842,6 @@ def visualize(
         begin_index: Begin index
         end_index: End index
     """
-    if not os.path.exists(idir):
-        _log.error(f"Input directory not found: {idir}")
-        return
-
     os.makedirs(odir, exist_ok=True)
     metrics_filename = odir / f"evaluation_{benchmark.value}_{modality.value}.json"
 
@@ -989,6 +988,11 @@ def visualize(
 
     elif modality == EvaluationModality.READING_ORDER:
         try:
+            # idir is optional in the signature but must be an existing directory here
+            if idir is None or not idir.is_dir():
+                _log.error(f"Input directory not found: {idir}")
+                return
+
             with open(metrics_filename, "r") as fd:
                 ro_evaluation = DatasetReadingOrderEvaluation.model_validate_json(
                     fd.read()
@@ -1080,6 +1084,11 @@ def visualize(
 
     elif modality == EvaluationModality.OCR:
         try:
+            # idir is optional in the signature but must be an existing directory here
+            if idir is None or not idir.is_dir():
+                _log.error(f"Input directory not found: {idir}")
+                return
+
             with open(metrics_filename, "r") as fd:
                 ocr_evaluation = OcrDatasetEvaluationResult.model_validate_json(
                     fd.read()
@@ -1511,6 +1520,9 @@ def evaluate_cmd(
             help="Path to load existing DoclingDocument predictions. The filename must follow the pattern [doc_id].[json|dt|yaml|yml]",
         ),
     ] = None,
+    concurrency: Annotated[
+        int, typer.Option(help="Concurrency for the computation of each metric")
+    ] = 4,
 ):
     """Evaluate predictions against ground truth."""
     input_dir, output_dir = derive_input_output_dirs(
@@ -1531,6 +1543,7 @@ def evaluate_cmd(
         odir=eval_output_dir,
         split=split,
         external_predictions_path=external_predictions_path,
+        concurrency=concurrency,
     )
 
 
@@ -1573,8 +1586,8 @@ def visualize_cmd(
     visualize(
         modality=modality,
         benchmark=benchmark,
-        idir=input_dir,
         odir=eval_output_dir,
+        idir=input_dir,
         split=split,
     )
 

diff --git a/docling_eval/evaluators/base_evaluator.py b/docling_eval/evaluators/base_evaluator.py
@@ -75,12 +75,14 @@ def __init__(
         supported_prediction_formats: List[PredictionFormats] = [
             PredictionFormats.DOCLING_DOCUMENT
         ],
+        concurrency: int = 4,
     ):
         r"""
         Parameters
         ----------
         intermediate_evaluations_path: When True the evalution per example will be saved in a file
         """
+        self._concurrency = concurrency
         self._intermediate_evaluations_path = intermediate_evaluations_path
 
         # Validate the prediction_sources

diff --git a/docling_eval/evaluators/layout_evaluator.py b/docling_eval/evaluators/layout_evaluator.py
@@ -489,7 +489,7 @@ def __call__(
             weighted_map_90_values.append(average_iou_90)
             weighted_map_95_values.append(average_iou_95)
 
-            _log.info(
+            _log.debug(
                 "doc: %s\tprecision: %.2f, recall: %.2f, f1: %.2f, map_50: %.2f, "
                 "precision_no_pics: %.2f, recall_no_pics: %.2f, f1_no_pics: %.2f",
                 doc_id_page,
@@ -528,7 +528,6 @@ def __call__(
                 segmentation_precision_no_pictures=precision_no_pics,
                 segmentation_recall_no_pictures=recall_no_pics,
                 segmentation_f1_no_pictures=f1_no_pics,
-                # New per-sample element count metrics
                 true_element_count=true_element_count,
                 pred_element_count=pred_element_count,
                 true_table_count=true_table_count,
@@ -836,9 +835,7 @@ def _find_intersecting_labels(
         true_labels: Dict[str, int] = {}
         pred_labels: Dict[str, int] = {}
 
-        for i, data in tqdm(
-            enumerate(ds), desc="Layout evaluations", ncols=120, total=len(ds)
-        ):
+        for i, data in enumerate(ds):
             data_record = DatasetRecordWithPrediction.model_validate(data)
             true_doc = data_record.ground_truth_doc
             pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)