@@ -1,6 +1,5 @@
 import json
 import logging
-from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, Generic, List, Optional
 
@@ -27,6 +26,9 @@
 
 class SingleEvaluation(BaseModel, Generic[DatasetEvaluationType]):
     evaluation: DatasetEvaluationType
+    experiment: str
+
+    # The prediction provider cannot be determined if the dataset comes from outside docling-eval.
     prediction_provider_type: Optional[PredictionProviderType] = None
 
 
@@ -77,18 +79,14 @@ def validate_modality(
 
 
 def read_prediction_provider_type(
-    pred_path: Path,
+    pred_dir: Path,
+    split: str,
 ) -> Optional[PredictionProviderType]:
+    r"""
+    Open the evaluation dataset and read the prediction provider column
+    """
     try:
-        # Discover the split
-        split = None
-        for split_path in pred_path.iterdir():
-            split = split_path.name
-            break
-        if not split:
-            return None
-
-        parquet_files = str(pred_path / split / "*.parquet")
+        parquet_files = str(pred_dir / split / "*.parquet")
         ds: IterableDataset = load_dataset(
             "parquet",
             data_files={split: parquet_files},
@@ -121,8 +119,12 @@ class MultiEvaluator(Generic[DatasetEvaluationType]):
     """
 
     # Leaf dirs for GT, predictions, evaluations
-    GT_LEAF_DIR = "_GT_"
-    PRED_LEAF_DIR = "predictions"
+    # GT_LEAF_DIR = "_GT_"
+    # PRED_LEAF_DIR = "predictions"
+
+    GT_LEAF_DIR = "gt_dataset"
+    PRED_LEAF_DIR = "eval_dataset"
+    EVALUATIONS_DIR = "evaluations"
 
     def __init__(
         self,
@@ -141,42 +143,48 @@ def __init__(
 
     def __call__(
         self,
-        prediction_provider_types: List[PredictionProviderType],
+        experiment_names: List[str],
         benchmarks: List[BenchMarkNames],
         modalities: List[EvaluationModality],
         dataset_sources: Optional[Dict[BenchMarkNames, Path]] = None,
         dataset_splits: Optional[Dict[BenchMarkNames, str]] = None,
     ) -> MultiEvaluation:
         r""" """
         # Build any missing dataset
-        benchmark_preds = self._build_datasets(
-            prediction_provider_types,
+        benchmark_experiments = self._build_datasets(
+            experiment_names,
             benchmarks,
             dataset_sources,
             dataset_splits,
         )
 
         # Perform the evaluations
-        multi_evaluation = self._run_evaluations(modalities, benchmark_preds)
+        multi_evaluation = self._run_evaluations(
+            modalities, benchmark_experiments, dataset_splits
+        )
         return multi_evaluation
 
     def _build_datasets(
         self,
-        prediction_provider_types: List[PredictionProviderType],
+        experiment_names: List[str],
         benchmarks: List[BenchMarkNames],
         dataset_sources: Optional[Dict[BenchMarkNames, Path]] = None,
         dataset_splits: Optional[Dict[BenchMarkNames, str]] = None,
-    ) -> Dict[BenchMarkNames, Dict[PredictionProviderType, Path]]:
+    ) -> Dict[BenchMarkNames, Dict[str, Path]]:
         r"""
         1. Get the predicted datasets
         2. If a predicted dataset is missing, check if the GT for this dataset exists.
         3. If both pred and GT datasets exist, build the GT dataset and the pred dataset.
         4. If GT is present, build the pred dataset.
 
-        Return the paths of the prediction datasets
+        Note: If the prediction dataset does not exist, the experiment name MUST match a
+        provider name.
+
+        Return the paths of the prediction datasets:
+            benchmark_name -> experiment_name -> Path
         """
         # Dict with benchmark predictions
-        benchmark_preds: Dict[BenchMarkNames, Dict[PredictionProviderType, Path]] = {}
+        benchmark_experiments: Dict[BenchMarkNames, Dict[str, Path]] = {}
 
         # Set the benchmark_preds
         for benchmark in benchmarks:
@@ -189,17 +197,19 @@ def _build_datasets(
                 else self._default_split
             )
 
-            if benchmark not in benchmark_preds:
-                benchmark_preds[benchmark] = {}
-            for provider_type in prediction_provider_types:
+            if benchmark not in benchmark_experiments:
+                benchmark_experiments[benchmark] = {}
+            for experiment_name in experiment_names:
                 benchmark_pred_dir = (
                     self._root_dir
                     / benchmark.value
-                    / provider_type.value
+                    / experiment_name
                     / MultiEvaluator.PRED_LEAF_DIR
                 )
                 if dataset_exists(benchmark_pred_dir, split):
-                    benchmark_preds[benchmark][provider_type] = benchmark_pred_dir
+                    benchmark_experiments[benchmark][
+                        experiment_name
+                    ] = benchmark_pred_dir
                     continue
 
                 # Create the GT dataset if needed
@@ -211,9 +221,19 @@ def _build_datasets(
                     _log.info("Creating GT for: %s", benchmark.value)
                     self._create_gt(benchmark, benchmark_gt_dir, split, dataset_source)
 
-                # Create the pred dataset
+                # Map the experiment_name to a PredictionProviderType
+                try:
+                    provider_type = PredictionProviderType(experiment_name)
+                except ValueError as ex:
+                    _log.error(
+                        "Prediction dataset is missing and experiment %s does NOT match any provider name",
+                        experiment_name,
+                    )
+                    raise ex
+
+                # Create the prediction dataset
                 _log.info(
-                    "Creating predictions for: %s / %s / %s",
+                    "Creating predictions for: %s / %s",
                     benchmark.value,
                     provider_type.value,
                 )
@@ -225,56 +245,69 @@ def _build_datasets(
                     benchmark_pred_dir,
                 )
 
-                benchmark_preds[benchmark][provider_type] = benchmark_pred_dir
+                benchmark_experiments[benchmark][experiment_name] = benchmark_pred_dir
 
-        return benchmark_preds
+        return benchmark_experiments
 
     def _run_evaluations(
         self,
         modalities: List[EvaluationModality],
-        benchmark_preds: Dict[BenchMarkNames, Dict[PredictionProviderType, Path]],
+        benchmark_experiments: Dict[BenchMarkNames, Dict[str, Path]],
         dataset_splits: Optional[Dict[BenchMarkNames, str]] = None,
     ) -> MultiEvaluation:
         evaluations: Dict[
             BenchMarkNames,
             Dict[str, Dict[EvaluationModality, SingleEvaluation]],
         ] = {}
-        for benchmark, prov_mod_paths in benchmark_preds.items():
+        for benchmark, exp_mod_paths in benchmark_experiments.items():
             split = (
                 dataset_splits.get(benchmark, self._default_split)
                 if dataset_splits
                 else self._default_split
             )
             if benchmark not in evaluations:
                 evaluations[benchmark] = {}
-            for provider_type, pred_dir in prov_mod_paths.items():
-                experiment = provider_type.value
+            for experiment, pred_dir in exp_mod_paths.items():
                 if experiment not in evaluations[benchmark]:
                     evaluations[benchmark][experiment] = {}
 
+                # Try to get the prediction provider
+                provider_type = read_prediction_provider_type(pred_dir, split)
+
                 for modality in modalities:
-                    # Check if the provider supports the asked modality
-                    if not validate_modality(provider_type, modality):
+                    # If the provider is available, check if it supports the requested modality
+                    if provider_type and not validate_modality(provider_type, modality):
                         _log.error(
-                            "Provider %s does not support modality: %s",
-                            provider_type,
+                            "Prediction dataset comes from provider '%s', which does not support modality '%s'",
+                            provider_type.value,
                             modality,
                         )
                         continue
 
                     eval_dir = (
-                        self._root_dir / benchmark.value / experiment / modality.value
+                        self._root_dir
+                        / benchmark.value
+                        / experiment
+                        / MultiEvaluator.EVALUATIONS_DIR
+                        / modality.value
                     )
                     # Check if the evaluations are already present
                     evaluation = load_evaluation(benchmark, modality, eval_dir)
                     if not evaluation:
+                        _log.info(
+                            "Evaluating benchmark: %s, experiment: %s, modality: %s",
+                            benchmark.value,
+                            experiment,
+                            modality,
+                        )
                         evaluation = evaluate(
                             modality, benchmark, pred_dir, eval_dir, split
                         )
                     if evaluation:
                         assert evaluation
                         evaluations[benchmark][experiment][modality] = SingleEvaluation(
                             evaluation=evaluation,
+                            experiment=experiment,
                             prediction_provider_type=provider_type,
                         )
 
@@ -315,7 +348,7 @@ def _create_gt(
     def _create_eval(
         self,
         benchmark: BenchMarkNames,
-        prediction_provider: PredictionProviderType,
+        provider_type: PredictionProviderType,
         gt_dir: Path,
         split: str,
         pred_dir: Path,
@@ -330,7 +363,7 @@ def _create_eval(
         try:
             # Create the appropriate prediction provider
             provider = get_prediction_provider(
-                provider_type=prediction_provider,
+                provider_type=provider_type,
                 do_visualization=False,
             )
 
@@ -359,26 +392,24 @@ def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
         Dict[Path, Dict[EvaluationModality, DatasetEvaluationType]],
     ] = {}
 
+    # Get the benchmark
     for benchmark_path in multi_evaluation_path.iterdir():
         try:
             benchmark = BenchMarkNames(benchmark_path.name)
         except ValueError:
             continue
+        # Get the experiment
         for experiment_path in benchmark_path.iterdir():
             if not experiment_path.is_dir():
                 continue
 
             experiment = experiment_path.name
-            if experiment == "_GT_":
+            if experiment == MultiEvaluator.GT_LEAF_DIR:
                 continue
 
-            # Get the provider_type from the prediction
-            pred_provider_type = read_prediction_provider_type(
-                experiment_path / MultiEvaluator.PRED_LEAF_DIR
-            )
-
-            # Get the experiment
-            for modality_path in experiment_path.iterdir():
+            # Load the evaluations for each modality
+            evaluations_path = experiment_path / MultiEvaluator.EVALUATIONS_DIR
+            for modality_path in evaluations_path.iterdir():
                 try:
                     modality = EvaluationModality(modality_path.name)
                 except ValueError:
@@ -395,8 +426,8 @@ def load_multi_evaluation(multi_evaluation_path: Path) -> MultiEvaluation:
                     evaluations[benchmark][experiment] = {}
                 evaluations[benchmark][experiment][modality] = SingleEvaluation(
                     evaluation=evaluation,
-                    prediction_provider_type=pred_provider_type,
+                    experiment=experiment,
                 )
 
-    multi_evalution: MultiEvaluation = MultiEvaluation(evaluations=evaluations)
-    return multi_evalution
+    multi_evaluation: MultiEvaluation = MultiEvaluation(evaluations=evaluations)
+    return multi_evaluation
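
For orientation (not part of the commit): a minimal usage sketch of the refactored MultiEvaluator, driving it by experiment names instead of provider types. The constructor arguments, the import path, and the enum members below are assumptions for illustration, since __init__ is truncated in this diff; the layout comments follow the new gt_dataset / eval_dataset / evaluations leaf directories introduced above.

# Usage sketch. Assumptions: MultiEvaluator(root_dir) constructor, the
# docling_eval.datamodels.types import path, and the specific enum members shown.
#
# Expected on-disk layout per the leaf-dir constants above:
#   <root_dir>/<benchmark>/gt_dataset/<split>/*.parquet
#   <root_dir>/<benchmark>/<experiment>/eval_dataset/<split>/*.parquet
#   <root_dir>/<benchmark>/<experiment>/evaluations/<modality>/
from pathlib import Path

from docling_eval.datamodels.types import BenchMarkNames, EvaluationModality

root_dir = Path("./scratch/multi_eval")
evaluator = MultiEvaluator(root_dir)  # assumed signature; __init__ is not shown in full here

multi_evaluation = evaluator(
    # If a prediction dataset is missing, the experiment name must match a PredictionProviderType value.
    experiment_names=["docling"],
    benchmarks=[BenchMarkNames.DPBENCH],
    modalities=[EvaluationModality.LAYOUT],
)

# Later, reload all persisted evaluations from the same root directory.
multi_evaluation = load_multi_evaluation(root_dir)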