@@ -216,51 +216,52 @@ def merge_tasks_summary(bench_name: str, model_name: str, run_dir: Path) -> None
     if not model_dir.exists():
         return
 
-    # Collect all task results from all repeats
-    task_data_by_prompt: Dict[str, Dict[str, List[float]]] = {}
+    task_data_by_id: Dict[str, Dict[str, Any]] = {}
 
     for repeat_dir in model_dir.iterdir():
         if repeat_dir.is_dir() and repeat_dir.name.isdigit():
             results_file = repeat_dir / DETAILED_FILE_NAME
             if results_file.exists():
-                # Read detailed results from this repeat
                 with open(results_file, "r") as f:
                     reader = csv.DictReader(f)
                     for row in reader:
+                        task_id = row["task_id"]
                         task_prompt = row["task_prompt"]
                         score = float(row["score"])
                         total_time = float(row["total_time"])
 
-                        if task_prompt not in task_data_by_prompt:
-                            task_data_by_prompt[task_prompt] = {
+                        if task_id not in task_data_by_id:
+                            task_data_by_id[task_id] = {
                                 "scores": [],
                                 "times": [],
+                                "task_prompt": task_prompt,
                             }
 
-                        task_data_by_prompt[task_prompt]["scores"].append(score)
-                        task_data_by_prompt[task_prompt]["times"].append(total_time)
+                        task_data_by_id[task_id]["scores"].append(score)
+                        task_data_by_id[task_id]["times"].append(total_time)
 
-    if not task_data_by_prompt:
+    if not task_data_by_id:
         return
 
     # Calculate statistics for each task
     task_summaries: List[TasksSummary] = []
-    for task_prompt, data in task_data_by_prompt.items():
+    for task_id, data in task_data_by_id.items():
         scores = np.array(data["scores"])
         times = np.array(data["times"])
+        task_prompt = data["task_prompt"]
 
         task_summary = TasksSummary(
             model_name=model_name,
+            task_id=task_id,
             task_prompt=task_prompt,
             avg_success_rate=round(float(scores.mean()), 3),
             std_success_rate=round(float(scores.std()), 3),
             avg_time=round(float(times.mean()), 3),
             std_time=round(float(times.std()), 3),
-            repeats=len(scores),  # TODO (mkotynia) (extract repeats in another way)
+            repeats=len(scores),
         )
         task_summaries.append(task_summary)
 
-    # Save task summaries to CSV
     tasks_summary_file = model_dir / TASKS_SUMMARY_FILE_NAME
     with open(tasks_summary_file, "w", newline="") as f:
         if task_summaries:
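For reference, a minimal self-contained sketch of the per-task aggregation that the hunk above switches to keying by task_id. It assumes per-repeat detailed-results CSVs with task_id, task_prompt, score, and total_time columns, and it swaps the repository's numpy/TasksSummary machinery for stdlib statistics and plain dicts, so the function name aggregate_by_task_id and the returned dict layout are illustrative only:

# Illustrative sketch only: mirrors the task_id-keyed aggregation from the hunk
# above, using stdlib statistics instead of numpy and dicts instead of
# TasksSummary so the example stays self-contained.
import csv
import statistics
from pathlib import Path
from typing import Any, Dict, List


def aggregate_by_task_id(results_files: List[Path]) -> List[Dict[str, Any]]:
    task_data: Dict[str, Dict[str, Any]] = {}
    for results_file in results_files:
        with open(results_file, "r") as f:
            for row in csv.DictReader(f):
                task_id = row["task_id"]
                # Keep the prompt alongside the per-repeat scores and times.
                entry = task_data.setdefault(
                    task_id,
                    {"scores": [], "times": [], "task_prompt": row["task_prompt"]},
                )
                entry["scores"].append(float(row["score"]))
                entry["times"].append(float(row["total_time"]))

    summaries: List[Dict[str, Any]] = []
    for task_id, data in task_data.items():
        scores, times = data["scores"], data["times"]
        summaries.append(
            {
                "task_id": task_id,
                "task_prompt": data["task_prompt"],
                # pstdev matches numpy's default population std (ddof=0).
                "avg_success_rate": round(statistics.fmean(scores), 3),
                "std_success_rate": round(statistics.pstdev(scores), 3),
                "avg_time": round(statistics.fmean(times), 3),
                "std_time": round(statistics.pstdev(times), 3),
                "repeats": len(scores),
            }
        )
    return summaries

Keying by task_id rather than task_prompt keeps two distinct tasks with identical prompt text from being merged into one summary row.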
@@ -420,15 +421,17 @@ def test_models(
                 bench_logger.critical(
                     f"{bench_conf.name} benchmark for {model_name}, vendor: {vendors[i]}, execution number: {u + 1}"
                 )
-        # TODO (mkotynia): resolve unbound bench_logger
-        bench_logger.info(f"Merging summaries for benchmark: {bench_conf.name}")
+        merge_results_logger = define_benchmark_logger(out_dir=Path(out_dir))
+        merge_results_logger.info(
+            f"Merging summaries for benchmark: {bench_conf.name}"
+        )
 
         for model_name in model_names:
             merge_model_repeats_summary(bench_conf.name, model_name, run_dir)
             merge_tasks_summary(bench_conf.name, model_name, run_dir)
 
         merge_benchmark_summary(bench_conf.name, run_dir, model_names)
 
-        bench_logger.info(
+        merge_results_logger.info(
             f"Summary merging completed for benchmark: {bench_conf.name}"
         )
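This hunk binds the merge-step logging to a logger created once via define_benchmark_logger(out_dir=Path(out_dir)), instead of relying on bench_logger, which was only bound inside the per-model loop and could be undefined when the merge step ran. As a rough illustration of such a logger factory (the real define_benchmark_logger may be implemented differently; the name make_merge_results_logger and the benchmark.log file name below are assumptions), built on the stdlib logging module:

# Hypothetical stand-in for define_benchmark_logger, stdlib logging only.
import logging
from pathlib import Path


def make_merge_results_logger(out_dir: Path) -> logging.Logger:
    out_dir.mkdir(parents=True, exist_ok=True)
    logger = logging.getLogger(f"merge_results.{out_dir.name}")
    if not logger.handlers:  # avoid attaching duplicate handlers on repeated calls
        handler = logging.FileHandler(out_dir / "benchmark.log")
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger

Creating the logger outside the per-model loop guarantees it exists for both the "Merging summaries" and "Summary merging completed" messages, regardless of how the model loop exited.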