feat: Introduce examples of how to evaluate with external predictions via the API and the CLI.

nikos-livathinos · nikos-livathinos · commit ae10646fdd9e · 2025-12-08T13:54:25.000+01:00
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
diff --git a/docs/examples/evaluate_dpbench_on_external_predictions.sh b/docs/examples/evaluate_dpbench_on_external_predictions.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+###########################################################################################
+# Constants
+#
+# GT_DIR:     ground-truth dataset dir, handed to the CLI as --input-dir.
+# MODALITIES: values handed to the CLI one at a time as --modality.
+readonly GT_DIR="scratch/DPBench/gt_dataset"
+readonly -a MODALITIES=(
+    layout
+    table_structure
+    document_structure
+    reading_order
+    markdown_text
+    bboxes_text
+    key_value
+    timings
+)
+
+
+###########################################################################################
+# Functions
+#
+
+evaluate() {
+    # Run `docling-eval evaluate` on DPBench once for every entry of MODALITIES.
+    #   $1: dir with the external prediction files, $2: dir passed as --output-dir.
+    # Calls exit with status 1 if GT_DIR is missing, 2 if the predictions dir is missing.
+    local pred_dir="$1" save_dir="$2" modality
+    # Check if the GT/preds dirs exist; failures are reported on stderr, not stdout
+    if [ ! -d "${GT_DIR}" ]; then
+        echo "Missing GT dir: ${GT_DIR}" >&2
+        exit 1
+    fi
+    if [ ! -d "${pred_dir}" ]; then
+        echo "Missing predictions dir: ${pred_dir}" >&2
+        exit 2
+    fi
+
+    for modality in "${MODALITIES[@]}"; do
+        echo "Evaluation modality: ${modality}, predictions: ${pred_dir}"
+        uv run docling-eval evaluate \
+            --benchmark DPBench \
+            --modality "${modality}" \
+            --input-dir "${GT_DIR}" \
+            --external-predictions-path "${pred_dir}" \
+            --output-dir "${save_dir}"
+    done
+}
+
+
+###########################################################################################
+# Main
+#
+
+# json predictions
+evaluate \
+    scratch/DPBench/predicted_documents/json \
+    scratch/DPBench/external_evaluations_jsons
+
+
+# doctags predictions
+evaluate \
+    scratch/DPBench/predicted_documents/doctag \
+    scratch/DPBench/external_evaluations_doctags
+
+
+# yaml predictions
+evaluate \
+    scratch/DPBench/predicted_documents/yaml \
+    scratch/DPBench/external_evaluations_yaml
+
diff --git a/docs/examples/evaluate_external_predictions.py b/docs/examples/evaluate_external_predictions.py
@@ -0,0 +1,85 @@
+import argparse
+import logging
+from pathlib import Path
+
+from docling_eval.cli.main import evaluate
+from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality
+
+_log = logging.getLogger(__name__)
+
+
+def evaluate_external_predictions(
+    benchmark: BenchMarkNames,
+    modality: EvaluationModality,
+    gt_path: Path,
+    predictions_dir: Path,
+    save_dir: Path,
+):
+    r""" """
+    evaluate(
+        modality,
+        benchmark,
+        gt_path,
+        save_dir,
+        external_predictions_path=predictions_dir,
+    )
+
+
+def main():
+    r""" """
+    parser = argparse.ArgumentParser(
+        description="Example how to use GT from parquet and predictions from externally provided prediction files",
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    parser.add_argument(
+        "-b",
+        "--benchmark",
+        required=True,
+        type=BenchMarkNames,
+        help="Evaluation modality",
+    )
+    parser.add_argument(
+        "-m",
+        "--modality",
+        required=True,
+        type=EvaluationModality,
+        help="Evaluation modality",
+    )
+    parser.add_argument(
+        "-g",
+        "--gt_parquet_dir",
+        required=True,
+        type=Path,
+        help="Path to the parquet GT dataset",
+    )
+    parser.add_argument(
+        "-p",
+        "--predictions_dir",
+        required=True,
+        type=Path,
+        help="Dir with the external prediction files (json, dt, yaml)",
+    )
+    parser.add_argument(
+        "-s",
+        "--save_dir",
+        required=False,
+        type=Path,
+        help="Path to save the produced evaluation files",
+    )
+    args = parser.parse_args()
+
+    # Configure logger
+    log_format = "%(asctime)s - %(levelname)s - %(message)s"
+    logging.basicConfig(level=logging.INFO, format=log_format)
+
+    evaluate_external_predictions(
+        args.benchmark,
+        args.modality,
+        args.gt_parquet_dir,
+        args.predictions_dir,
+        args.save_dir,
+    )
+
+
+if __name__ == "__main__":
+    main()