55from docling_core .types .doc .page import SegmentedPage
66
77from docling_eval .evaluators .ocr .evaluation_models import (
8- AggregatedBenchmarkMetrics ,
98 OcrBenchmarkEntry ,
109 OcrMetricsSummary ,
1110 Word ,
1413from docling_eval .evaluators .ocr .processing_utils import (
1514 _CalculationConstants ,
1615 _IgnoreZoneFilter ,
16+ _IgnoreZoneFilterHWR ,
1717 extract_word_from_text_cell ,
1818)
1919
@@ -26,6 +26,7 @@ def __init__(
2626 ignore_zone_filter_type : str = "default" ,
2727 add_space_for_merged_prediction_words : bool = True ,
2828 add_space_for_merged_gt_words : bool = True ,
29+ aggregation_mode : str = "union" , # "mean" or "union"
2930 ) -> None :
3031 self .model_identifier : str = model_identifier
3132 self .add_space_for_merged_prediction_words : bool = (
@@ -39,8 +40,13 @@ def __init__(
3940 ] = {}
4041 self .image_to_ignore_zones_map : Dict [str , List [Word ]] = {}
4142 self .calculator_type : str = performance_calculator_type
43+ self .aggregation_mode : str = aggregation_mode
4244
43- self .ignore_zone_filter : _IgnoreZoneFilter = _IgnoreZoneFilter ()
45+ self .ignore_zone_filter : "_IgnoreZoneFilter | _IgnoreZoneFilterHWR"
46+ if ignore_zone_filter_type .lower () == "hwr" :
47+ self .ignore_zone_filter = _IgnoreZoneFilterHWR ()
48+ else :
49+ self .ignore_zone_filter = _IgnoreZoneFilter ()
4450
4551 def process_single_page_pair (
4652 self ,
@@ -126,6 +132,70 @@ def calculate_aggregated_metrics(
126132 if key not in summed_metrics :
127133 summed_metrics [key ] = ""
128134
135+ num_images = len (self .image_metrics_results )
136+ # Recognition aggregation
137+ if self .aggregation_mode == "union" :
138+ total_weighted_tp_words : float = summed_metrics .get (
139+ "tp_words_weighted" , 0.0
140+ )
141+ total_fp : float = summed_metrics .get (
142+ "number_of_false_positive_detections" , 0.0
143+ )
144+ total_fn : float = summed_metrics .get (
145+ "number_of_false_negative_detections" , 0.0
146+ )
147+ total_union_words : float = total_weighted_tp_words + total_fp + total_fn
148+ total_perfect_sensitive : float = summed_metrics .get (
149+ "perfect_matches_sensitive_weighted" , 0.0
150+ )
151+ total_perfect_insensitive : float = summed_metrics .get (
152+ "perfect_matches_insensitive_weighted" , 0.0
153+ )
154+ avg_word_acc_sensitive = total_perfect_sensitive / max (
155+ _CalculationConstants .EPS , total_union_words
156+ )
157+ avg_word_acc_insensitive = total_perfect_insensitive / max (
158+ _CalculationConstants .EPS , total_union_words
159+ )
160+ # Character (union)
161+ sum_ed_sensitive_tp : float = summed_metrics .get ("sum_ed_sensitive_tp" , 0.0 )
162+ sum_ed_insensitive_tp : float = summed_metrics .get (
163+ "sum_ed_insensitive_tp" , 0.0
164+ )
165+ sum_max_len_tp : float = summed_metrics .get ("sum_max_len_tp" , 0.0 )
166+ sum_text_len_fp : float = summed_metrics .get ("text_len_fp" , 0.0 )
167+ sum_text_len_fn : float = summed_metrics .get ("text_len_fn" , 0.0 )
168+ total_chars_union : float = (
169+ sum_max_len_tp + sum_text_len_fp + sum_text_len_fn
170+ )
171+ avg_ed_union_sensitive : float = (
172+ sum_ed_sensitive_tp + sum_text_len_fp + sum_text_len_fn
173+ ) / max (_CalculationConstants .EPS , total_chars_union )
174+ avg_ed_union_insensitive : float = (
175+ sum_ed_insensitive_tp + sum_text_len_fp + sum_text_len_fn
176+ ) / max (_CalculationConstants .EPS , total_chars_union )
177+ avg_char_acc_sensitive = 1 - avg_ed_union_sensitive
178+ avg_char_acc_insensitive = 1 - avg_ed_union_insensitive
179+ # Convert to percentage later
180+ avg_word_acc_sensitive *= 100.0
181+ avg_word_acc_insensitive *= 100.0
182+ avg_char_acc_sensitive *= 100.0
183+ avg_char_acc_insensitive *= 100.0
184+ else :
185+ # Per-image mean of already-percentage metrics
186+ avg_word_acc_sensitive = (
187+ summed_metrics .get ("word_accuracy_sensitive" , 0.0 ) / num_images
188+ )
189+ avg_word_acc_insensitive = (
190+ summed_metrics .get ("word_accuracy_insensitive" , 0.0 ) / num_images
191+ )
192+ avg_char_acc_sensitive = (
193+ summed_metrics .get ("character_accuracy_sensitive" , 0.0 ) / num_images
194+ )
195+ avg_char_acc_insensitive = (
196+ summed_metrics .get ("character_accuracy_insensitive" , 0.0 ) / num_images
197+ )
198+
129199 total_true_positives : float = summed_metrics .get (
130200 "number_of_true_positive_matches" , _CalculationConstants .EPS
131201 )
@@ -147,28 +217,35 @@ def calculate_aggregated_metrics(
147217 _CalculationConstants .EPS ,
148218 )
149219
220+ avg_char_acc_sensitive = (
221+ summed_metrics .get ("character_accuracy_sensitive" , 0.0 ) / num_images
222+ )
223+ avg_char_acc_insensitive = (
224+ summed_metrics .get ("character_accuracy_insensitive" , 0.0 ) / num_images
225+ )
226+
150227 aggregated_metrics_data = {
151228 "f1" : 100 * overall_f1_score ,
152229 "recall" : 100 * overall_recall ,
153230 "precision" : 100 * overall_precision ,
231+ "word_accuracy_sensitive" : avg_word_acc_sensitive ,
232+ "word_accuracy_insensitive" : avg_word_acc_insensitive ,
233+ "character_accuracy_sensitive" : avg_char_acc_sensitive ,
234+ "character_accuracy_insensitive" : avg_char_acc_insensitive ,
154235 }
155236
156- aggregated_metrics = AggregatedBenchmarkMetrics .model_validate (
157- aggregated_metrics_data
158- )
159- output_results = aggregated_metrics .model_dump (by_alias = True )
160-
161- for key , val in output_results .items ():
237+ for key , val in aggregated_metrics_data .items ():
162238 try :
163239 formatted_value : float = float (f"{{:.{ float_precision } f}}" .format (val ))
164- output_results [key ] = formatted_value
240+ aggregated_metrics_data [key ] = formatted_value
165241 except (ValueError , TypeError ):
166242 pass
167- return output_results
243+
244+ return aggregated_metrics_data
168245
169246 def get_formatted_metrics_summary (
170247 self ,
171- float_precision : int = 1 ,
248+ float_precision : int = 2 ,
172249 ) -> List [Dict [str , Any ]]:
173250 summary_list : List [Dict [str , Any ]] = []
174251