@@ -14,7 +14,6 @@
 import logging
 import os
 import runpy
-import statistics
 import warnings
 from collections import defaultdict
 from datetime import datetime
@@ -29,10 +28,13 @@
 from mlipaudit.benchmark import Benchmark, ModelOutput
 from mlipaudit.exceptions import ModelOutputTransferError
 from mlipaudit.io import (
+    OVERALL_SCORE_KEY_NAME,
+    generate_empty_scores_dict,
     write_benchmark_result_to_disk,
     write_scores_to_disk,
 )
 from mlipaudit.run_mode import RunMode
+from mlipaudit.scoring import compute_model_score
 
 logger = logging.getLogger("mlipaudit")
 
@@ -246,7 +248,8 @@ def run_benchmarks(
         force_field = load_force_field(model_to_run)
 
         reusable_model_outputs: dict[tuple[str, ...], ModelOutput] = {}
-        scores = {}
+        scores = generate_empty_scores_dict()
+
         for benchmark_attempt_idx, benchmark_class in enumerate(benchmarks_to_run, 1):
            # First check we can run the benchmark with the model
            missing_elements = fetch_missing_elements(benchmark_class, force_field)
@@ -288,7 +291,7 @@ def run_benchmarks(
            if reusable_output_id and reusable_output_id in reusable_model_outputs:
                logger.info(
                    "[%d/%d] MODEL %s - [%d/%d] BENCHMARK %s - Loading in "
-                    "model outputs from previous benchmark...",
+                    "model outputs from a previous benchmark...",
                    model_index,
                    len(model_paths),
                    model_name,
@@ -383,31 +386,26 @@ def run_benchmarks(
                time_for_analysis,
            )
 
-        # Compute model score here from results
-        if len(scores) > 0:
-            model_score = statistics.mean(scores.values())
-            scores["overall_score"] = model_score
-            scores["overall_score"] = model_score
-            logger.info(
-                "--- [%d/%d] MODEL %s score: %.2f ---",
-                model_index,
-                len(model_paths),
-                model_name,
-                model_score,
-            )
+        # Compute mean model score over all benchmarks
+        model_score = compute_model_score(scores)
 
-            write_scores_to_disk(scores, output_dir / model_name)
-            logger.info(
-                "Wrote benchmark results and scores to disk at path %s.",
-                output_dir / model_name,
-            )
-        else:
-            logger.info(
-                "--- [%d/%d] MODEL %s did not generate any scores ---",
-                model_index,
-                len(model_paths),
-                model_name,
-            )
+        logger.info(
+            "--- [%d/%d] MODEL %s score"
+            " (averaged over all available benchmarks): %.2f ---",
+            model_index,
+            len(model_paths),
+            model_name,
+            model_score,
+        )
+
+        # Also write the overall score to disk
+        scores[OVERALL_SCORE_KEY_NAME] = model_score
+        write_scores_to_disk(scores, output_dir / model_name)
+
+        logger.info(
+            "Wrote benchmark results and scores to disk at path %s.",
+            output_dir / model_name,
+        )
 
    # Log skipped benchmarks
    for model_name, skipped in skipped_benchmarks.items():
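The hunks above delegate to three helpers that are not shown in this diff: OVERALL_SCORE_KEY_NAME, generate_empty_scores_dict, and compute_model_score. As a rough sketch only, based on the inline logic this change removes (statistics.mean(scores.values()) behind a len(scores) > 0 guard), they plausibly look something like the following; the benchmark names, signatures, and None-handling below are assumptions for illustration, not mlipaudit's actual implementation.

```python
# Hypothetical sketch of the helpers referenced by the diff above.
# Names match the imports; the bodies are assumptions, not mlipaudit's code.
import statistics

OVERALL_SCORE_KEY_NAME = "overall_score"  # assumed value of the shared key

# Placeholder benchmark names; the real package presumably derives these
# from its registered Benchmark classes.
_BENCHMARK_NAMES: list[str] = ["benchmark_a", "benchmark_b"]


def generate_empty_scores_dict() -> dict[str, float | None]:
    """Pre-fill every benchmark with None so skipped ones still appear on disk."""
    return {name: None for name in _BENCHMARK_NAMES}


def compute_model_score(scores: dict[str, float | None]) -> float:
    """Average the scores of the benchmarks that actually produced a result."""
    valid = [value for value in scores.values() if value is not None]
    return statistics.mean(valid) if valid else 0.0
```

If the scores dict is pre-populated along these lines, that would also explain why the new code no longer needs the len(scores) > 0 branch: scoring and writing to disk can always run, with benchmarks that did not produce a result simply excluded from the average.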
|