
Commit 4aa58a8

feat: update scoring average saving and ui display (#102)
* feat: add universal average scoring function
* feat: add BENCHMARKS_WITHOUT_SCORES global var
* feat: fix score saving
* feat: correctly average scores per categories
* docs: note in cli about overwriting results
* fix: typo
1 parent 38cc238 commit 4aa58a8

File tree

7 files changed: +109 -36 lines changed


src/mlipaudit/benchmarks/__init__.py

Lines changed: 10 additions & 2 deletions
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from collections import defaultdict
 
 from mlipaudit.benchmark import Benchmark
 from mlipaudit.benchmarks.bond_length_distribution.bond_length_distribution import (
@@ -93,13 +94,20 @@
 BENCHMARKS = Benchmark.__subclasses__()
 BENCHMARK_NAMES = [b.name for b in BENCHMARKS]
 
+BENCHMARKS_WITHOUT_SCORES = [ScalingBenchmark]
+
 
 def _setup_benchmark_categories() -> dict[str, list[type[Benchmark]]]:
-    categories = set(b.category for b in BENCHMARKS)
-    mapping = {cat: [] for cat in categories}  # type: ignore
+    mapping = defaultdict(list)
     for b in BENCHMARKS:
         mapping[b.category].append(b)
     return mapping
 
 
 BENCHMARK_CATEGORIES = _setup_benchmark_categories()
+
+# Dict with keys, values: category, number of benchmarks with scores in the category
+BENCHMARK_WITH_SCORES_CATEGORIES = {
+    cat: sum(1 for b in benchmarks if b not in BENCHMARKS_WITHOUT_SCORES)
+    for cat, benchmarks in BENCHMARK_CATEGORIES.items()
+}

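For context, a self-contained sketch of what the new BENCHMARK_WITH_SCORES_CATEGORIES bookkeeping evaluates to. The toy benchmark classes below are invented for illustration; only ScalingBenchmark and the counting logic come from the diff.

from collections import defaultdict

class Benchmark:  # stand-in for mlipaudit.benchmark.Benchmark
    name = "base"
    category = "none"

class BondLengthBenchmark(Benchmark):  # hypothetical scored benchmark
    name = "bond_length_distribution"
    category = "molecular"

class ScalingBenchmark(Benchmark):  # named in the diff; produces no score
    name = "scaling"
    category = "performance"

BENCHMARKS = Benchmark.__subclasses__()
BENCHMARKS_WITHOUT_SCORES = [ScalingBenchmark]

categories = defaultdict(list)
for b in BENCHMARKS:
    categories[b.category].append(b)

# Count only the scored benchmarks in each category, as in the diff.
with_scores = {
    cat: sum(1 for b in benchmarks if b not in BENCHMARKS_WITHOUT_SCORES)
    for cat, benchmarks in categories.items()
}
print(with_scores)  # {'molecular': 1, 'performance': 0}

A category made up entirely of score-less benchmarks counts as zero here.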
src/mlipaudit/benchmarks_cli.py

Lines changed: 25 additions & 27 deletions
@@ -14,7 +14,6 @@
 import logging
 import os
 import runpy
-import statistics
 import warnings
 from collections import defaultdict
 from datetime import datetime
@@ -29,10 +28,13 @@
 from mlipaudit.benchmark import Benchmark, ModelOutput
 from mlipaudit.exceptions import ModelOutputTransferError
 from mlipaudit.io import (
+    OVERALL_SCORE_KEY_NAME,
+    generate_empty_scores_dict,
     write_benchmark_result_to_disk,
     write_scores_to_disk,
 )
 from mlipaudit.run_mode import RunMode
+from mlipaudit.scoring import compute_model_score
 
 logger = logging.getLogger("mlipaudit")
 
@@ -246,7 +248,8 @@ def run_benchmarks(
         force_field = load_force_field(model_to_run)
 
         reusable_model_outputs: dict[tuple[str, ...], ModelOutput] = {}
-        scores = {}
+        scores = generate_empty_scores_dict()
+
         for benchmark_attempt_idx, benchmark_class in enumerate(benchmarks_to_run, 1):
             # First check we can run the benchmark with the model
             missing_elements = fetch_missing_elements(benchmark_class, force_field)
@@ -288,7 +291,7 @@
             if reusable_output_id and reusable_output_id in reusable_model_outputs:
                 logger.info(
                     "[%d/%d] MODEL %s - [%d/%d] BENCHMARK %s - Loading in "
-                    "model outputs from previous benchmark...",
+                    "model outputs from a previous benchmark...",
                     model_index,
                     len(model_paths),
                     model_name,
@@ -383,31 +386,26 @@
                 time_for_analysis,
             )
 
-        # Compute model score here from results
-        if len(scores) > 0:
-            model_score = statistics.mean(scores.values())
-            scores["overall_score"] = model_score
-            scores["overall_score"] = model_score
-            logger.info(
-                "--- [%d/%d] MODEL %s score: %.2f ---",
-                model_index,
-                len(model_paths),
-                model_name,
-                model_score,
-            )
+        # Compute mean model score over all benchmarks
+        model_score = compute_model_score(scores)
 
-            write_scores_to_disk(scores, output_dir / model_name)
-            logger.info(
-                "Wrote benchmark results and scores to disk at path %s.",
-                output_dir / model_name,
-            )
-        else:
-            logger.info(
-                "--- [%d/%d] MODEL %s did not generate any scores ---",
-                model_index,
-                len(model_paths),
-                model_name,
-            )
+        logger.info(
+            "--- [%d/%d] MODEL %s score"
+            " (averaged over all available benchmarks): %.2f ---",
+            model_index,
+            len(model_paths),
+            model_name,
+            model_score,
+        )
+
+        # Also write the overall score to disk
+        scores[OVERALL_SCORE_KEY_NAME] = model_score
+        write_scores_to_disk(scores, output_dir / model_name)
+
+        logger.info(
+            "Wrote benchmark results and scores to disk at path %s.",
+            output_dir / model_name,
+        )
 
     # Log skipped benchmarks
     for model_name, skipped in skipped_benchmarks.items():

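The net effect on the CLI's per-model flow, sketched outside the real loop. The helper names mirror the diff (generate_empty_scores_dict pre-populates, compute_model_score averages, OVERALL_SCORE_KEY_NAME tags the mean); the benchmark names and values below are made up.

# Scores start pre-populated: 0.0 for every scored benchmark, None for
# benchmarks that never return a score.
scores = {
    "bond_length_distribution": 0.0,  # scored benchmark
    "another_benchmark": 0.0,         # hypothetical scored benchmark, skipped
    "scaling": None,                  # no score by design
}

# ... the benchmark loop overwrites entries for whatever actually ran ...
scores["bond_length_distribution"] = 0.85

# Average over the entries that carry a score; skipped benchmarks stay at
# 0.0 and therefore pull the mean down instead of being ignored.
with_scores = {k: v for k, v in scores.items() if v is not None}
model_score = sum(with_scores.values()) / len(with_scores)

# Store the mean under the shared key and persist everything in one file.
scores["overall_score"] = model_score  # OVERALL_SCORE_KEY_NAME
print(model_score)  # 0.425: the skipped benchmark still counts as 0.0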
src/mlipaudit/io.py

Lines changed: 27 additions & 2 deletions
@@ -21,13 +21,15 @@
 import numpy as np
 
 from mlipaudit.benchmark import Benchmark, BenchmarkResult, ModelOutput
+from mlipaudit.benchmarks import BENCHMARK_NAMES, BENCHMARKS_WITHOUT_SCORES
 from mlipaudit.io_helpers import (
     dataclass_to_dict_with_arrays,
     dict_with_arrays_to_dataclass,
 )
 
 RESULT_FILENAME = "result.json"
 SCORE_FILENAME = "score.json"
+OVERALL_SCORE_KEY_NAME = "overall_score"
 MODEL_OUTPUT_ZIP_FILENAME = "model_output.zip"
 MODEL_OUTPUT_JSON_FILENAME = "model_output.json"
 MODEL_OUTPUT_ARRAYS_FILENAME = "arrays.npz"
@@ -133,11 +135,33 @@ def load_benchmark_results_from_disk(
     return results
 
 
+def generate_empty_scores_dict() -> dict[str, float | None]:
+    """Generate a scores dict with scores of 0.0 assigned to
+    benchmarks returning a score and scores of None for those
+    that don't.
+
+    Returns:
+        The dictionary of 0 or null scores.
+    """
+    padded_scores: dict[str, float | None] = {}
+    benchmarks_without_scores_names = [b.name for b in BENCHMARKS_WITHOUT_SCORES]
+    for benchmark_name in BENCHMARK_NAMES:
+        if benchmark_name not in benchmarks_without_scores_names:
+            padded_scores[benchmark_name] = 0.0
+        else:
+            padded_scores[benchmark_name] = None
+
+    return padded_scores
+
+
 def write_scores_to_disk(
-    scores: dict[str, float],
+    scores: dict[str, float | None],
     output_dir: str | os.PathLike,
 ) -> None:
-    """Writes the scores to disk.
+    """Writes the scores to disk. This will populate the resulting json
+    with the generated scores, as well as scores of 0.0 for benchmarks
+    that were skipped and scores of None for benchmarks that don't return
+    scores.
 
     Args:
         scores: The results as a dictionary with the benchmark names as keys
@@ -146,6 +170,7 @@ def write_scores_to_disk(
     """
     _output_dir = Path(output_dir)
     _output_dir.mkdir(exist_ok=True, parents=True)
+
     with open(_output_dir / SCORE_FILENAME, "w", encoding="utf-8") as f:
         json.dump(scores, f, indent=2)
 

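A minimal sketch of the score.json that write_scores_to_disk now produces, assuming a toy registry with one scored and one score-less benchmark; the path and values are illustrative, not output from the real suite.

import json
from pathlib import Path

scores = {
    "bond_length_distribution": 0.85,  # ran and was scored
    "scaling": None,                   # listed in BENCHMARKS_WITHOUT_SCORES
    "overall_score": 0.85,             # added by the CLI before writing
}

output_dir = Path("results/my_model")  # hypothetical model directory
output_dir.mkdir(exist_ok=True, parents=True)
with open(output_dir / "score.json", "w", encoding="utf-8") as f:
    json.dump(scores, f, indent=2)  # None is serialised as JSON null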
src/mlipaudit/main.py

Lines changed: 5 additions & 1 deletion
@@ -68,7 +68,11 @@ def _subparse_benchmark(parser):
         help="paths to the model zip archives or python files",
     )
     parser.add_argument(
-        "-o", "--output", required=True, help="path to the output directory"
+        "-o",
+        "--output",
+        required=True,
+        help="path to the output directory;"
+        " will overwrite existing results for a given model",
     )
     parser.add_argument(
         "-i",

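The -o/--output change only reflows the call and extends the help string; a standalone argparse sketch (the parser and program names here are placeholders, not the real CLI entry point):

import argparse

parser = argparse.ArgumentParser(prog="benchmark")  # placeholder prog name
parser.add_argument(
    "-o",
    "--output",
    required=True,
    # Adjacent string literals are concatenated into one help sentence.
    help="path to the output directory;"
    " will overwrite existing results for a given model",
)
args = parser.parse_args(["-o", "results/"])
print(args.output)  # results/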
src/mlipaudit/scoring.py

Lines changed: 29 additions & 0 deletions
@@ -80,3 +80,32 @@ def compute_benchmark_score(
         metric_scores.append(scores.mean())
 
     return float(np.mean(np.array(metric_scores)))
+
+
+def compute_model_score(scores: dict[str, float | None]) -> float:
+    """Compute the score for a model given a dictionary of scores.
+
+    Args:
+        scores: The dictionary of scores to use to compute the average
+            with keys being benchmark_name and values the score.
+            This dictionary should contain the skipped benchmarks
+            too with scores assigned as 0.0 and scores of None for
+            benchmarks that do not return scores.
+
+    Raises:
+        ValueError: If 'Overall score' is a key in the scores dictionary.
+
+    Returns:
+        The mean score.
+    """
+    if "Overall score" in scores:
+        raise ValueError("Overall score should not be part of dictionary.")
+
+    assert len(scores) > 0
+
+    # Ignore benchmarks that don't return scores
+    benchmarks_with_scores = {
+        name: score for name, score in scores.items() if score is not None
+    }
+
+    return sum(benchmarks_with_scores.values()) / len(benchmarks_with_scores)

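A usage sketch for compute_model_score, assuming mlipaudit with this commit is importable; the benchmark names and values are invented. It shows the two rules the docstring spells out: skipped benchmarks contribute 0.0 to the mean, while None entries (benchmarks without scores) are excluded from it entirely.

from mlipaudit.scoring import compute_model_score

scores = {
    "bond_length_distribution": 0.9,  # ran and was scored
    "some_skipped_benchmark": 0.0,    # hypothetical name; skipped, counts as 0
    "scaling": None,                  # produces no score; ignored in the mean
}

print(compute_model_score(scores))  # 0.45, not 0.9

Passing a dict that already contains an 'Overall score' key raises ValueError, so the stored average can never be averaged back into itself.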
src/mlipaudit/ui/leaderboard.py

Lines changed: 11 additions & 3 deletions
@@ -15,7 +15,8 @@
 import pandas as pd
 import streamlit as st
 
-from mlipaudit.benchmarks import BENCHMARK_CATEGORIES
+from mlipaudit.benchmarks import BENCHMARK_CATEGORIES, BENCHMARK_WITH_SCORES_CATEGORIES
+from mlipaudit.io import OVERALL_SCORE_KEY_NAME
 from mlipaudit.ui.utils import (
     color_scores,
     highlight_overall_score,
@@ -88,7 +89,10 @@ def _group_score_df_by_benchmark_category(score_df: pd.DataFrame) -> pd.DataFrame:
         ]
         names_filtered = [b for b in names if b in score_df.columns]
 
-        score_df[category] = score_df[names_filtered].mean(axis=1)
+        score_df[category] = (
+            score_df[names_filtered].sum(axis=1)
+            / BENCHMARK_WITH_SCORES_CATEGORIES[category]
+        )
         score_df = score_df.drop(columns=names_filtered)
 
     columns_in_order = [
@@ -162,7 +166,11 @@ def leaderboard_page(
     df_main = parse_scores_dict_into_df(scores)
 
     # 2. Main Table Display (Common Logic)
-    df_main.sort_values(by="Overall score", ascending=False, inplace=True)
+    df_main.sort_values(
+        by=OVERALL_SCORE_KEY_NAME.replace("_", " ").capitalize(),
+        ascending=False,
+        inplace=True,
+    )
 
     df_grouped_main = _group_score_df_by_benchmark_category(df_main).fillna("N/A")

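The switch from .mean(axis=1) to a row sum divided by BENCHMARK_WITH_SCORES_CATEGORIES[category] changes how missing results are treated. A pandas sketch with invented scores shows the difference: pandas' mean skips NaN cells, while the fixed divisor effectively counts a missing result as 0.

import pandas as pd

# One category with two scored benchmarks; model_B has no result for the
# second one. All values are invented.
df = pd.DataFrame(
    {"bench_a": [0.8, 0.8], "bench_b": [0.6, float("nan")]},
    index=["model_A", "model_B"],
)

n_scored = 2  # what BENCHMARK_WITH_SCORES_CATEGORIES[category] would hold

print(df.mean(axis=1))            # model_B: 0.8 -- NaN silently skipped
print(df.sum(axis=1) / n_scored)  # model_B: 0.4 -- missing result counts as 0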
src/mlipaudit/ui/utils.py

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
 import streamlit as st
 
 from mlipaudit.benchmark import BenchmarkResult
+from mlipaudit.io import OVERALL_SCORE_KEY_NAME
 
 INTERNAL_MODELS_FILE_EXTENSION = "_int"
 EXTERNAL_MODELS_FILE_EXTENSION = "_ext"
@@ -81,7 +82,7 @@ def highlight_overall_score(s: pd.Series) -> list[str]:
     Returns:
         The list of styles to apply to each cell in the Series.
     """
-    if s.name == "Overall score":
+    if s.name == OVERALL_SCORE_KEY_NAME.replace("_", " ").capitalize():
         # Specific background color for 'Overall score'
         bg_r, bg_g, bg_b = (173, 216, 230)  # RGB for Light Blue
         text_color = get_text_color(bg_r, bg_g, bg_b)

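Both UI call sites now derive the display label from the shared key instead of hard-coding "Overall score"; the transformation is just:

OVERALL_SCORE_KEY_NAME = "overall_score"  # value defined in mlipaudit.io
print(OVERALL_SCORE_KEY_NAME.replace("_", " ").capitalize())  # Overall score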