Skip to content

Commit efbdd0e

Browse files
feat: Parallelize the table evaluation
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
1 parent 88051e4 commit efbdd0e

File tree

2 files changed

+172
-84
lines changed

2 files changed

+172
-84
lines changed

docling_eval/evaluators/base_evaluator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def supported_prediction_formats(self) -> List[PredictionFormats]:
118118
def save_intermediate_evaluations(
119119
self,
120120
evaluation_name: str,
121-
enunumerate_id: int,
121+
enumerate_id: int,
122122
doc_id: str,
123123
evaluations: List[UnitEvaluationType],
124124
) -> Optional[Path]:
@@ -131,7 +131,7 @@ def save_intermediate_evaluations(
131131
return None
132132

133133
evals = [ev.model_dump() for ev in evaluations]
134-
evaluation_filename = f"{evaluation_name}_{enunumerate_id:05d}_{doc_id}.json"
134+
evaluation_filename = f"{evaluation_name}_{enumerate_id:05d}_{doc_id}.json"
135135
evaluation_fn = self._intermediate_evaluations_path / evaluation_filename # type: ignore
136136
_log.info("Saving intermediate evaluations: %s", evaluation_fn)
137137
with open(evaluation_fn, "w") as fd:

docling_eval/evaluators/table_evaluator.py

Lines changed: 170 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import glob
22
import logging
33
import random
4+
from concurrent.futures import Executor, Future, ProcessPoolExecutor, as_completed
45
from pathlib import Path
56
from typing import Dict, List, Optional
67

@@ -90,7 +91,7 @@ def save_histogram_delta_row_col(self, figname: Path):
9091
plt.ylabel("%")
9192
plt.legend(loc="upper right")
9293

93-
logging.info(f"saving figure to {figname}")
94+
_log.info(f"saving figure to {figname}")
9495
plt.savefig(figname)
9596

9697

@@ -104,6 +105,62 @@ def is_complex_table(table: TableItem) -> bool:
104105
return False
105106

106107

108+
def evaluate_tables(
    teds_scorer,  # TEDScorer-like callable; kept untyped so workers can pickle it — TODO confirm type
    stopwords: list[str],
    doc_id: str,
    table_id: int,
    true_html: str,
    true_num_rows: int,
    true_num_cols: int,
    pred_html: str,
    pred_num_rows: int,
    pred_num_cols: int,
    is_complex: bool,
    structure_only: bool,
) -> TableEvaluation:
    r"""
    Compute the TEDS score between a ground-truth and a predicted table.

    Both tables are received as html-formatted strings. This is a module-level
    function (not a method) so it can be submitted to a ProcessPoolExecutor.

    Parameters
    ----------
    teds_scorer:
        Callable computing TEDS from two parsed html trees.
    stopwords:
        Tags/snippets stripped from BOTH html strings before scoring, to avoid
        penalizing markup present only on one side.
    doc_id, table_id:
        Identify the table within the dataset; copied into the result.
    true_html, true_num_rows, true_num_cols:
        Ground-truth table html and its dimensions.
    pred_html, pred_num_rows, pred_num_cols:
        Predicted table html and its dimensions.
    is_complex:
        Whether the ground-truth table has spanning cells.
    structure_only:
        If True, score only the table structure (ignore cell content).

    Return
    ------
    TableEvaluation
        Holds the rounded TEDS score together with the table metadata above.
    """
    # TODO: Check if exceptions can be thrown in the following code
    # Filter out tags that may be present in GT but not in prediction (and
    # vice versa) to avoid an artificial penalty.
    for stopword in stopwords:
        pred_html = pred_html.replace(stopword, "")
        true_html = true_html.replace(stopword, "")

    pred_html_obj = html.fromstring(pred_html)
    true_html_obj = html.fromstring(true_html)

    teds = teds_scorer(
        gt_table=true_html_obj,
        pred_table=pred_html_obj,
        structure_only=structure_only,
    )
    # Round for stable, readable reporting.
    teds = round(teds, 3)

    # Prepare output
    return TableEvaluation(
        TEDS=teds,
        is_complex=is_complex,
        filename=doc_id,
        table_id=table_id,
        true_ncols=true_num_cols,
        pred_ncols=pred_num_cols,
        true_nrows=true_num_rows,
        pred_nrows=pred_num_rows,
        structure_only_evaluation=structure_only,
    )
162+
163+
107164
class TableEvaluator(BaseEvaluator):
108165
r"""
109166
Evaluate table predictions from HF dataset with the columns:
@@ -114,6 +171,7 @@ def __init__(
114171
intermediate_evaluations_path: Optional[Path] = None,
115172
structure_only: bool = False,
116173
prediction_sources: List[PredictionFormats] = [],
174+
concurrency: int = 4,
117175
):
118176
supported_prediction_formats: List[PredictionFormats] = [
119177
PredictionFormats.DOCLING_DOCUMENT,
@@ -122,6 +180,7 @@ def __init__(
122180
if not prediction_sources:
123181
prediction_sources = supported_prediction_formats
124182
super().__init__(
183+
concurrency=concurrency,
125184
intermediate_evaluations_path=intermediate_evaluations_path,
126185
prediction_sources=prediction_sources,
127186
supported_prediction_formats=supported_prediction_formats,
@@ -142,7 +201,7 @@ def __call__(
142201
"GTDoclingDocument"
143202
"PredictionDoclingDocument"
144203
"""
145-
logging.info("Loading the split '%s' from: '%s'", split, ds_path)
204+
_log.info("Loading the split '%s' from: '%s'", split, ds_path)
146205

147206
ext_docdoc_loader: Optional[ExternalDoclingDocumentLoader] = None
148207
if external_predictions_path is not None:
@@ -151,9 +210,9 @@ def __call__(
151210
# Load the dataset
152211
split_path = str(ds_path / split / "*.parquet")
153212
split_files = glob.glob(split_path)
154-
logging.info("Files: %s", split_files)
213+
_log.info("Files: %s", split_files)
155214
ds = load_dataset("parquet", data_files={split: split_files})
156-
logging.info("Overview of dataset: %s", ds)
215+
_log.info("Overview of dataset: %s", ds)
157216

158217
# Select the split
159218
ds_selection: Dataset = ds[split]
@@ -163,54 +222,78 @@ def __call__(
163222
rejected_samples: Dict[EvaluationRejectionType, int] = {
164223
EvaluationRejectionType.MISSING_PREDICTION: 0,
165224
EvaluationRejectionType.EVALUATION_ERROR: 0,
225+
EvaluationRejectionType.MISMATHCED_DOCUMENT: 0,
166226
}
167227

168-
for i, data in tqdm(
169-
enumerate(ds_selection),
170-
desc="Table evaluations",
171-
ncols=120,
172-
total=len(ds_selection),
173-
):
174-
data_record = DatasetRecordWithPrediction.model_validate(data)
175-
doc_id = data_record.doc_id
176-
gt_doc = data_record.ground_truth_doc
177-
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
178-
if not pred_doc:
179-
_log.error("There is no prediction for doc_id=%s", doc_id)
180-
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
181-
continue
228+
with ProcessPoolExecutor(max_workers=self._concurrency) as executor:
229+
futures: list[Future] = []
230+
table_futures: list[Future]
231+
table_rejection: Optional[EvaluationRejectionType]
232+
233+
# Submit pages for execution
234+
_log.info("Submitting the tables for evaluation...")
235+
for i, data in enumerate(ds_selection):
236+
data_record = DatasetRecordWithPrediction.model_validate(data)
237+
doc_id = data_record.doc_id
238+
gt_doc = data_record.ground_truth_doc
239+
pred_doc = self._get_pred_doc(data_record, ext_docdoc_loader)
240+
if not pred_doc:
241+
_log.error("There is no prediction for doc_id=%s", doc_id)
242+
rejected_samples[EvaluationRejectionType.MISSING_PREDICTION] += 1
243+
continue
182244

183-
try:
184245
if not self._structure_only:
185-
results = self._evaluate_tables_in_documents(
246+
# Evaluate the tables with structure + content
247+
table_futures, table_rejection = self._evaluate_tables_in_documents(
248+
executor,
186249
doc_id=doc_id,
187250
true_doc=gt_doc,
188251
pred_doc=pred_doc,
189252
structure_only=False,
190253
)
191-
table_evaluations.extend(results)
192-
193-
if self._intermediate_evaluations_path:
194-
self.save_intermediate_evaluations(
195-
"TEDs_struct_content", i, doc_id, results
196-
)
197-
198-
results = self._evaluate_tables_in_documents(
199-
doc_id=data[BenchMarkColumns.DOC_ID],
254+
if table_rejection != None:
255+
rejected_samples[table_rejection] += 1
256+
continue
257+
futures.extend(table_futures)
258+
259+
# Always evaluate the tables with structure
260+
table_futures, table_rejection = self._evaluate_tables_in_documents(
261+
executor,
262+
doc_id=doc_id,
200263
true_doc=gt_doc,
201264
pred_doc=pred_doc,
202265
structure_only=True,
203266
)
204-
table_struct_evaluations.extend(results)
267+
if table_rejection != None:
268+
rejected_samples[table_rejection] += 1
269+
continue
270+
futures.extend(table_futures)
271+
272+
# Collect the futures
273+
_log.info("Collecting the tables for evaluations...")
274+
for future in tqdm(
275+
as_completed(futures),
276+
desc="Table evaluations",
277+
ncols=120,
278+
total=len(ds_selection),
279+
):
280+
table_evaluation: TableEvaluation = future.result()
281+
table_id: int = table_evaluation.table_id
282+
doc_id = table_evaluation.filename
283+
284+
if not table_evaluation.structure_only_evaluation:
285+
table_evaluations.append(table_evaluation)
286+
if self._intermediate_evaluations_path:
287+
self.save_intermediate_evaluations(
288+
"TEDs_struct_content", table_id, doc_id, [table_evaluation]
289+
)
290+
291+
table_struct_evaluations.append(table_evaluation)
205292
if self._intermediate_evaluations_path:
206293
self.save_intermediate_evaluations(
207-
"TEDs_struct", i, doc_id, results
294+
"TEDs_struct", table_id, doc_id, [table_evaluation]
208295
)
209296

210-
except Exception as ex:
211-
rejected_samples[EvaluationRejectionType.EVALUATION_ERROR] += 1
212-
_log.error("Error during tables evaluation for %s", doc_id)
213-
214297
_log.info(
215298
"Finish. %s documents were skipped due to evaluation errors",
216299
rejected_samples[EvaluationRejectionType.EVALUATION_ERROR],
@@ -247,28 +330,45 @@ def __call__(
247330

248331
def _evaluate_tables_in_documents(
249332
self,
333+
executor: Executor,
250334
doc_id: str,
251335
true_doc: DoclingDocument,
252336
pred_doc: DoclingDocument,
253337
structure_only: bool = False,
254-
) -> List[TableEvaluation]:
255-
r""" """
256-
table_evaluations = []
257-
true_tables = true_doc.tables
258-
pred_tables = pred_doc.tables
259-
_log.info(
260-
"#-true-tables: %s, #-pred-tables: %s", len(true_tables), len(pred_tables)
338+
) -> tuple[list[Future], Optional[EvaluationRejectionType]]:
339+
r"""
340+
1. Extract the tables from true/pred document
341+
2. Reject if the number of tables differs across true/pred
342+
3. Export table as html-formatted string.
343+
4. Submit the tables for evaluation
344+
5. Return futures (one per table)
345+
346+
Return
347+
------
348+
349+
"""
350+
futures: list[Future] = []
351+
true_tables: list[TableItem] = true_doc.tables
352+
pred_tables: list[TableItem] = pred_doc.tables
353+
true_tables_len = len(true_tables)
354+
pred_tables_len = len(pred_tables)
355+
_log.debug(
356+
"#-true-tables: %s, #-pred-tables: %s", true_tables_len, pred_tables_len
261357
)
262-
assert len(true_tables) == len(
263-
pred_tables
264-
), "len(true_tables)!=len(pred_tables)"
358+
# Reject the document if there is a mismatch in the number of tables between the true/pred doc
359+
if true_tables_len != pred_tables_len:
360+
_log.error(
361+
"Mismatched number of tables between GT and predictions: [%d, %d]. Skipping doc: %s",
362+
true_tables_len,
363+
pred_tables_len,
364+
doc_id,
365+
)
366+
return futures, EvaluationRejectionType.MISMATHCED_DOCUMENT
265367

266368
for table_id in range(len(true_tables)): # , len(pred_tables)):
267369
# Avoid items of type DocItemLabel.DOCUMENT_INDEX
268370
if true_tables[table_id].label != DocItemLabel.TABLE:
269-
logging.warning(
270-
f"Skipping table with label {true_tables[table_id].label}"
271-
)
371+
_log.warning(f"Skipping table with label {true_tables[table_id].label}")
272372
continue
273373

274374
try:
@@ -277,44 +377,32 @@ def _evaluate_tables_in_documents(
277377

278378
is_complex = is_complex_table(true_table)
279379

280-
true_html = true_table.export_to_html(true_doc)
281-
pred_html = pred_table.export_to_html(pred_doc)
282-
283-
# Filter out tags that may be present in GT but not in prediction to avoid penalty
284-
for stopword in self._stopwords:
285-
predicted_html = pred_html.replace(stopword, "")
286-
for stopword in self._stopwords:
287-
true_html = true_html.replace(stopword, "")
288-
289-
true_html_obj = html.fromstring(true_html)
290-
pred_html_obj = html.fromstring(pred_html)
291-
292-
teds = self._teds_scorer(
293-
gt_table=true_html_obj,
294-
pred_table=pred_html_obj,
295-
structure_only=structure_only,
296-
)
297-
# logging.info(f"teds: {teds}")
298-
299-
teds = round(teds, 3)
300-
table_evaluation = TableEvaluation(
301-
TEDS=teds,
302-
is_complex=is_complex,
303-
filename=doc_id,
304-
table_id=table_id,
305-
true_ncols=true_table.data.num_cols,
306-
pred_ncols=pred_table.data.num_cols,
307-
true_nrows=true_table.data.num_rows,
308-
pred_nrows=pred_table.data.num_rows,
309-
structure_only_evaluation=structure_only,
380+
true_html: str = true_table.export_to_html(true_doc)
381+
pred_html: str = pred_table.export_to_html(pred_doc)
382+
383+
# Submit table for evaluation
384+
futures.append(
385+
executor.submit(
386+
evaluate_tables,
387+
self._teds_scorer,
388+
self._stopwords,
389+
doc_id,
390+
table_id,
391+
true_html,
392+
true_table.data.num_rows,
393+
true_table.data.num_cols,
394+
pred_html,
395+
pred_table.data.num_rows,
396+
pred_table.data.num_cols,
397+
is_complex,
398+
structure_only,
399+
)
310400
)
311-
table_evaluations.append(table_evaluation)
312401
except Exception:
313-
logging.error(
402+
_log.error(
314403
f"Table {table_id} from document {doc_id} could not be compared!"
315404
)
316-
317-
return table_evaluations
405+
return futures, None
318406

319407
def _get_pred_doc(
320408
self,

0 commit comments

Comments
 (0)