fix: ocr visualization and add ocr recognition metrics (#144)

samiuc · samiullahchattha · web-flow · commit d63a439441ff · 2025-09-15T22:03:34.000-07:00
* fix: ocr visualization

Signed-off-by: samiullahchattha &lt;Sami.Ullah1@ibm.com&gt;

* fix type error

Signed-off-by: samiullahchattha &lt;Sami.Ullah1@ibm.com&gt;

* fix: improve OCR visualizer

* fix: build errors

* add word and character accuracy metrics

Signed-off-by: samiullahchattha &lt;Sami.Ullah1@ibm.com&gt;

* strip leading or trailing whitespace in edit distance

Signed-off-by: samiullahchattha &lt;Sami.Ullah1@ibm.com&gt;

* fix visualizations

Signed-off-by: samiuc &lt;sami.ullah.chat@gmail.com&gt;

---------

Signed-off-by: samiullahchattha &lt;Sami.Ullah1@ibm.com&gt;
Signed-off-by: samiuc &lt;sami.ullah.chat@gmail.com&gt;
Co-authored-by: samiullahchattha &lt;Sami.Ullah1@ibm.com&gt;
diff --git a/docling_eval/cli/main.py b/docling_eval/cli/main.py
@@ -603,7 +603,7 @@ def evaluate(
             json.dump(evaluation.model_dump(), fd, indent=2, sort_keys=True)
 
     elif modality == EvaluationModality.OCR:
-        ocr_evaluator = OCREvaluator()
+        ocr_evaluator = OCREvaluator(intermediate_evaluations_path=odir)
         evaluation = ocr_evaluator(  # type: ignore
             idir,
             split=split,
diff --git a/docling_eval/evaluators/ocr/benchmark_runner.py b/docling_eval/evaluators/ocr/benchmark_runner.py
@@ -5,7 +5,6 @@
 from docling_core.types.doc.page import SegmentedPage
 
 from docling_eval.evaluators.ocr.evaluation_models import (
-    AggregatedBenchmarkMetrics,
     OcrBenchmarkEntry,
     OcrMetricsSummary,
     Word,
@@ -14,6 +13,7 @@
 from docling_eval.evaluators.ocr.processing_utils import (
     _CalculationConstants,
     _IgnoreZoneFilter,
+    _IgnoreZoneFilterHWR,
     extract_word_from_text_cell,
 )
 
@@ -26,6 +26,7 @@ def __init__(
         ignore_zone_filter_type: str = "default",
         add_space_for_merged_prediction_words: bool = True,
         add_space_for_merged_gt_words: bool = True,
+        aggregation_mode: str = "union",  # "mean" or "union"
     ) -> None:
         self.model_identifier: str = model_identifier
         self.add_space_for_merged_prediction_words: bool = (
@@ -39,8 +40,13 @@ def __init__(
         ] = {}
         self.image_to_ignore_zones_map: Dict[str, List[Word]] = {}
         self.calculator_type: str = performance_calculator_type
+        self.aggregation_mode: str = aggregation_mode
 
-        self.ignore_zone_filter: _IgnoreZoneFilter = _IgnoreZoneFilter()
+        self.ignore_zone_filter: "_IgnoreZoneFilter | _IgnoreZoneFilterHWR"
+        if ignore_zone_filter_type.lower() == "hwr":
+            self.ignore_zone_filter = _IgnoreZoneFilterHWR()
+        else:
+            self.ignore_zone_filter = _IgnoreZoneFilter()
 
     def process_single_page_pair(
         self,
@@ -126,6 +132,70 @@ def calculate_aggregated_metrics(
                     if key not in summed_metrics:
                         summed_metrics[key] = ""
 
+        num_images = len(self.image_metrics_results)
+        # Recognition aggregation
+        if self.aggregation_mode == "union":
+            total_weighted_tp_words: float = summed_metrics.get(
+                "tp_words_weighted", 0.0
+            )
+            total_fp: float = summed_metrics.get(
+                "number_of_false_positive_detections", 0.0
+            )
+            total_fn: float = summed_metrics.get(
+                "number_of_false_negative_detections", 0.0
+            )
+            total_union_words: float = total_weighted_tp_words + total_fp + total_fn
+            total_perfect_sensitive: float = summed_metrics.get(
+                "perfect_matches_sensitive_weighted", 0.0
+            )
+            total_perfect_insensitive: float = summed_metrics.get(
+                "perfect_matches_insensitive_weighted", 0.0
+            )
+            avg_word_acc_sensitive = total_perfect_sensitive / max(
+                _CalculationConstants.EPS, total_union_words
+            )
+            avg_word_acc_insensitive = total_perfect_insensitive / max(
+                _CalculationConstants.EPS, total_union_words
+            )
+            # Character (union)
+            sum_ed_sensitive_tp: float = summed_metrics.get("sum_ed_sensitive_tp", 0.0)
+            sum_ed_insensitive_tp: float = summed_metrics.get(
+                "sum_ed_insensitive_tp", 0.0
+            )
+            sum_max_len_tp: float = summed_metrics.get("sum_max_len_tp", 0.0)
+            sum_text_len_fp: float = summed_metrics.get("text_len_fp", 0.0)
+            sum_text_len_fn: float = summed_metrics.get("text_len_fn", 0.0)
+            total_chars_union: float = (
+                sum_max_len_tp + sum_text_len_fp + sum_text_len_fn
+            )
+            avg_ed_union_sensitive: float = (
+                sum_ed_sensitive_tp + sum_text_len_fp + sum_text_len_fn
+            ) / max(_CalculationConstants.EPS, total_chars_union)
+            avg_ed_union_insensitive: float = (
+                sum_ed_insensitive_tp + sum_text_len_fp + sum_text_len_fn
+            ) / max(_CalculationConstants.EPS, total_chars_union)
+            avg_char_acc_sensitive = 1 - avg_ed_union_sensitive
+            avg_char_acc_insensitive = 1 - avg_ed_union_insensitive
+            # Convert the ratios to percentages
+            avg_word_acc_sensitive *= 100.0
+            avg_word_acc_insensitive *= 100.0
+            avg_char_acc_sensitive *= 100.0
+            avg_char_acc_insensitive *= 100.0
+        else:
+            # Per-image mean of already-percentage metrics
+            avg_word_acc_sensitive = (
+                summed_metrics.get("word_accuracy_sensitive", 0.0) / num_images
+            )
+            avg_word_acc_insensitive = (
+                summed_metrics.get("word_accuracy_insensitive", 0.0) / num_images
+            )
+            avg_char_acc_sensitive = (
+                summed_metrics.get("character_accuracy_sensitive", 0.0) / num_images
+            )
+            avg_char_acc_insensitive = (
+                summed_metrics.get("character_accuracy_insensitive", 0.0) / num_images
+            )
+
         total_true_positives: float = summed_metrics.get(
             "number_of_true_positive_matches", _CalculationConstants.EPS
         )
@@ -147,28 +217,35 @@ def calculate_aggregated_metrics(
             _CalculationConstants.EPS,
         )
 
+        avg_char_acc_sensitive = (
+            summed_metrics.get("character_accuracy_sensitive", 0.0) / num_images
+        )
+        avg_char_acc_insensitive = (
+            summed_metrics.get("character_accuracy_insensitive", 0.0) / num_images
+        )
+
         aggregated_metrics_data = {
             "f1": 100 * overall_f1_score,
             "recall": 100 * overall_recall,
             "precision": 100 * overall_precision,
+            "word_accuracy_sensitive": avg_word_acc_sensitive,
+            "word_accuracy_insensitive": avg_word_acc_insensitive,
+            "character_accuracy_sensitive": avg_char_acc_sensitive,
+            "character_accuracy_insensitive": avg_char_acc_insensitive,
         }
 
-        aggregated_metrics = AggregatedBenchmarkMetrics.model_validate(
-            aggregated_metrics_data
-        )
-        output_results = aggregated_metrics.model_dump(by_alias=True)
-
-        for key, val in output_results.items():
+        for key, val in aggregated_metrics_data.items():
             try:
                 formatted_value: float = float(f"{{:.{float_precision}f}}".format(val))
-                output_results[key] = formatted_value
+                aggregated_metrics_data[key] = formatted_value
             except (ValueError, TypeError):
                 pass
-        return output_results
+
+        return aggregated_metrics_data
 
     def get_formatted_metrics_summary(
         self,
-        float_precision: int = 1,
+        float_precision: int = 2,
     ) -> List[Dict[str, Any]]:
         summary_list: List[Dict[str, Any]] = []
 
diff --git a/docling_eval/evaluators/ocr/evaluation_models.py b/docling_eval/evaluators/ocr/evaluation_models.py
@@ -1,4 +1,4 @@
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from docling_core.types.doc import BoundingBox
 from docling_core.types.doc.page import TextCell
@@ -7,6 +7,17 @@
 
 class _CalculationConstants:
     EPS: float = 1.0e-6
+    CHAR_NORMALIZE_MAP: Dict[str, str] = {
+        "ﬁ": "fi",
+        "ﬂ": "fl",
+        "“": '"',
+        "”": '"',
+        "‘": "'",
+        "’": "'",
+        "—": "-",
+        "–": "-",
+        "\xa0": " ",
+    }
 
 
 class Word(TextCell):
@@ -15,6 +26,8 @@ class Word(TextCell):
     matched: bool = Field(default=False)
     ignore_zone: Optional[bool] = None
     to_remove: Optional[bool] = None
+    # number of GT words represented by this Word after merging
+    word_weight: int = Field(default=1)
 
     @property
     def bbox(self) -> BoundingBox:
@@ -42,6 +55,20 @@ class OcrMetricsSummary(BaseModel):
     detection_precision: float
     detection_recall: float
     detection_f1: float
+    # recognition metrics
+    word_accuracy_sensitive: float = 0.0
+    word_accuracy_insensitive: float = 0.0
+    character_accuracy_sensitive: float = 0.0
+    character_accuracy_insensitive: float = 0.0
+    # for dataset-level union aggregation
+    tp_words_weighted: float = 0.0
+    perfect_matches_sensitive_weighted: float = 0.0
+    perfect_matches_insensitive_weighted: float = 0.0
+    sum_ed_sensitive_tp: float = 0.0
+    sum_ed_insensitive_tp: float = 0.0
+    sum_max_len_tp: float = 0.0
+    text_len_fp: float = 0.0
+    text_len_fn: float = 0.0
 
     class Config:
         populate_by_name = True
@@ -52,15 +79,6 @@ class OcrBenchmarkEntry(BaseModel):
     metrics: OcrMetricsSummary
 
 
-class AggregatedBenchmarkMetrics(BaseModel):
-    f1: float = Field(alias="F1")
-    recall: float = Field(alias="Recall")
-    precision: float = Field(alias="Precision")
-
-    class Config:
-        populate_by_name = True
-
-
 class DocumentEvaluationEntry(BaseModel):
     doc_id: str
 
@@ -72,3 +90,31 @@ class OcrDatasetEvaluationResult(BaseModel):
     f1_score: float = 0.0
     recall: float = 0.0
     precision: float = 0.0
+    word_accuracy_sensitive: float = 0.0
+    word_accuracy_insensitive: float = 0.0
+    character_accuracy_sensitive: float = 0.0
+    character_accuracy_insensitive: float = 0.0
+
+
+class WordEvaluationMetadata(BaseModel):
+    text: str
+    confidence: Optional[float] = None
+    bounding_box: BoundingBox
+    is_true_positive: bool = False
+    is_false_positive: bool = False
+    is_false_negative: bool = False
+    edit_distance_sensitive: Optional[int] = None
+    edit_distance_insensitive: Optional[int] = None
+
+
+class TruePositiveMatch(BaseModel):
+    pred: WordEvaluationMetadata
+    gt: WordEvaluationMetadata
+
+
+class DocumentEvaluationMetadata(BaseModel):
+    doc_id: str
+    true_positives: List[TruePositiveMatch]
+    false_positives: List[WordEvaluationMetadata]
+    false_negatives: List[WordEvaluationMetadata]
+    metrics: OcrMetricsSummary
diff --git a/docling_eval/evaluators/ocr/performance_calculator.py b/docling_eval/evaluators/ocr/performance_calculator.py
@@ -2,6 +2,8 @@
 from collections import namedtuple
 from typing import Dict, List, Tuple
 
+import edit_distance
+import numpy as np
 from docling_core.types.doc.page import SegmentedPage
 
 from docling_eval.evaluators.ocr.evaluation_models import (
@@ -17,6 +19,7 @@
     refine_prediction_to_many_gt_boxes,
 )
 from docling_eval.evaluators.ocr.processing_utils import (
+    calculate_edit_distance,
     convert_word_to_text_cell,
     merge_words_into_one,
 )
@@ -389,6 +392,63 @@ def calculate_image_metrics(self) -> OcrMetricsSummary:
             recall + precision, _CalculationConstants.EPS
         )
 
+        sum_ed_sensitive = _CalculationConstants.EPS
+        sum_ed_insensitive = _CalculationConstants.EPS
+        sum_max_len_tp = _CalculationConstants.EPS
+        perfect_matches_sensitive = 0
+        perfect_matches_insensitive = 0
+        total_tp_words_weighted = 0
+
+        for gt_word, pred_word in self.confirmed_gt_prediction_matches:
+            gt_text = gt_word.text
+            pred_text = pred_word.text
+            # weight by the number of GT words represented by this merged word
+            gt_weight = getattr(gt_word, "word_weight", 1)
+            total_tp_words_weighted += gt_weight
+
+            max_len = max(len(gt_text), len(pred_text), 1)
+            sum_max_len_tp += max_len
+
+            # Case-sensitive metrics
+            ed_sensitive = calculate_edit_distance(gt_text, pred_text, None)
+            sum_ed_sensitive += ed_sensitive
+            if ed_sensitive == 0:
+                perfect_matches_sensitive += gt_weight
+
+            # Case-insensitive metrics
+            ed_insensitive = calculate_edit_distance(
+                gt_text.upper(), pred_text.upper(), None
+            )
+            sum_ed_insensitive += ed_insensitive
+            if ed_insensitive == 0:
+                perfect_matches_insensitive += gt_weight
+
+        text_len_fp = sum(len(w.text) for w in self.current_false_positives)
+        text_len_fn = sum(len(w.text) for w in self.current_false_negatives)
+
+        # word accuracy (union-based), weighted by GT merges for TPs
+        total_union_words = (
+            total_tp_words_weighted + num_false_positives + num_false_negatives
+        )
+        word_acc_union_sensitive = perfect_matches_sensitive / max(
+            _CalculationConstants.EPS, total_union_words
+        )
+        word_acc_union_insensitive = perfect_matches_insensitive / max(
+            _CalculationConstants.EPS, total_union_words
+        )
+
+        # character accuracy (edit score union-based)
+        total_chars_union = sum_max_len_tp + text_len_fp + text_len_fn
+        avg_ed_union_sensitive = (sum_ed_sensitive + text_len_fp + text_len_fn) / max(
+            _CalculationConstants.EPS, total_chars_union
+        )
+        avg_ed_union_insensitive = (
+            sum_ed_insensitive + text_len_fp + text_len_fn
+        ) / max(_CalculationConstants.EPS, total_chars_union)
+
+        char_acc_sensitive = 1 - avg_ed_union_sensitive
+        char_acc_insensitive = 1 - avg_ed_union_insensitive
+
         metrics_summary_data = {
             "number_of_prediction_cells": num_prediction_cells_final,
             "number_of_gt_cells": num_gt_cells_final,
@@ -398,6 +458,19 @@ def calculate_image_metrics(self) -> OcrMetricsSummary:
             "detection_precision": 100.0 * precision,
             "detection_recall": 100.0 * recall,
             "detection_f1": 100.0 * f1_score,
+            "word_accuracy_sensitive": 100.0 * word_acc_union_sensitive,
+            "word_accuracy_insensitive": 100.0 * word_acc_union_insensitive,
+            "character_accuracy_sensitive": 100.0 * char_acc_sensitive,
+            "character_accuracy_insensitive": 100.0 * char_acc_insensitive,
+            # additional counters for dataset-level union aggregation
+            "tp_words_weighted": float(total_tp_words_weighted),
+            "perfect_matches_sensitive_weighted": float(perfect_matches_sensitive),
+            "perfect_matches_insensitive_weighted": float(perfect_matches_insensitive),
+            "sum_ed_sensitive_tp": float(sum_ed_sensitive),
+            "sum_ed_insensitive_tp": float(sum_ed_insensitive),
+            "sum_max_len_tp": float(sum_max_len_tp),
+            "text_len_fp": float(text_len_fp),
+            "text_len_fn": float(text_len_fn),
         }
 
         summary_instance = OcrMetricsSummary.model_validate(metrics_summary_data)
diff --git a/docling_eval/evaluators/ocr/processing_utils.py b/docling_eval/evaluators/ocr/processing_utils.py
diff --git a/docling_eval/evaluators/ocr_evaluator.py b/docling_eval/evaluators/ocr_evaluator.py