
Commit 4aa58a8

feat: update scoring average saving and ui display (#102)
* feat: add universal average scoring function
* feat: add BENCHMARKS_WITHOUT_SCORES global var
* feat: fix score saving
* feat: correctly average scores per categories
* docs: note in cli about overwriting results
* fix: typo
1 parent 38cc238 commit 4aa58a8

File tree

7 files changed: +109 -36 lines changed


src/mlipaudit/benchmarks/__init__.py

Lines changed: 10 additions & 2 deletions
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from collections import defaultdict
 
 from mlipaudit.benchmark import Benchmark
 from mlipaudit.benchmarks.bond_length_distribution.bond_length_distribution import (
@@ -93,13 +94,20 @@
 BENCHMARKS = Benchmark.__subclasses__()
 BENCHMARK_NAMES = [b.name for b in BENCHMARKS]
 
+BENCHMARKS_WITHOUT_SCORES = [ScalingBenchmark]
+
 
 def _setup_benchmark_categories() -> dict[str, list[type[Benchmark]]]:
-    categories = set(b.category for b in BENCHMARKS)
-    mapping = {cat: [] for cat in categories}  # type: ignore
+    mapping = defaultdict(list)
     for b in BENCHMARKS:
         mapping[b.category].append(b)
     return mapping
 
 
 BENCHMARK_CATEGORIES = _setup_benchmark_categories()
+
+# Dict with keys, values: category, number of benchmarks with scores in the category
+BENCHMARK_WITH_SCORES_CATEGORIES = {
+    cat: sum(1 for b in benchmarks if b not in BENCHMARKS_WITHOUT_SCORES)
+    for cat, benchmarks in BENCHMARK_CATEGORIES.items()
+}

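For context, a self-contained sketch of what the new BENCHMARK_WITH_SCORES_CATEGORIES bookkeeping evaluates to. The toy benchmark classes below are invented for illustration; only ScalingBenchmark and the counting logic come from the diff.

from collections import defaultdict

class Benchmark:  # stand-in for mlipaudit.benchmark.Benchmark
    name = "base"
    category = "none"

class BondLengthBenchmark(Benchmark):  # hypothetical scored benchmark
    name = "bond_length_distribution"
    category = "molecular"

class ScalingBenchmark(Benchmark):  # named in the diff; produces no score
    name = "scaling"
    category = "performance"

BENCHMARKS = Benchmark.__subclasses__()
BENCHMARKS_WITHOUT_SCORES = [ScalingBenchmark]

categories = defaultdict(list)
for b in BENCHMARKS:
    categories[b.category].append(b)

# Count only the scored benchmarks in each category, as in the diff.
with_scores = {
    cat: sum(1 for b in benchmarks if b not in BENCHMARKS_WITHOUT_SCORES)
    for cat, benchmarks in categories.items()
}
print(with_scores)  # {'molecular': 1, 'performance': 0}

A category made up entirely of score-less benchmarks counts as zero here.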
src/mlipaudit/benchmarks_cli.py

Lines changed: 25 additions & 27 deletions
@@ -14,7 +14,6 @@
 import logging
 import os
 import runpy
-import statistics
 import warnings
 from collections import defaultdict
 from datetime import datetime
@@ -29,10 +28,13 @@
 from mlipaudit.benchmark import Benchmark, ModelOutput
 from mlipaudit.exceptions import ModelOutputTransferError
 from mlipaudit.io import (
+    OVERALL_SCORE_KEY_NAME,
+    generate_empty_scores_dict,
     write_benchmark_result_to_disk,
     write_scores_to_disk,
 )
 from mlipaudit.run_mode import RunMode
+from mlipaudit.scoring import compute_model_score
 
 logger = logging.getLogger("mlipaudit")
 
@@ -246,7 +248,8 @@ def run_benchmarks(
         force_field = load_force_field(model_to_run)
 
         reusable_model_outputs: dict[tuple[str, ...], ModelOutput] = {}
-        scores = {}
+        scores = generate_empty_scores_dict()
+
         for benchmark_attempt_idx, benchmark_class in enumerate(benchmarks_to_run, 1):
             # First check we can run the benchmark with the model
             missing_elements = fetch_missing_elements(benchmark_class, force_field)
@@ -288,7 +291,7 @@
             if reusable_output_id and reusable_output_id in reusable_model_outputs:
                 logger.info(
                     "[%d/%d] MODEL %s - [%d/%d] BENCHMARK %s - Loading in "
-                    "model outputs from previous benchmark...",
+                    "model outputs from a previous benchmark...",
                     model_index,
                     len(model_paths),
                     model_name,
@@ -383,31 +386,26 @@
                 time_for_analysis,
             )
 
-        # Compute model score here from results
-        if len(scores) > 0:
-            model_score = statistics.mean(scores.values())
-            scores["overall_score"] = model_score
-            scores["overall_score"] = model_score
-            logger.info(
-                "--- [%d/%d] MODEL %s score: %.2f ---",
-                model_index,
-                len(model_paths),
-                model_name,
-                model_score,
-            )
+        # Compute mean model score over all benchmarks
+        model_score = compute_model_score(scores)
 
-            write_scores_to_disk(scores, output_dir / model_name)
-            logger.info(
-                "Wrote benchmark results and scores to disk at path %s.",
-                output_dir / model_name,
-            )
-        else:
-            logger.info(
-                "--- [%d/%d] MODEL %s did not generate any scores ---",
-                model_index,
-                len(model_paths),
-                model_name,
-            )
+        logger.info(
+            "--- [%d/%d] MODEL %s score"
+            " (averaged over all available benchmarks): %.2f ---",
+            model_index,
+            len(model_paths),
+            model_name,
+            model_score,
+        )
+
+        # Also write the overall score to disk
+        scores[OVERALL_SCORE_KEY_NAME] = model_score
+        write_scores_to_disk(scores, output_dir / model_name)
+
+        logger.info(
+            "Wrote benchmark results and scores to disk at path %s.",
+            output_dir / model_name,
+        )
 
     # Log skipped benchmarks
     for model_name, skipped in skipped_benchmarks.items():

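The net effect on the CLI's per-model flow, sketched outside the real loop. The helper names mirror the diff (generate_empty_scores_dict pre-populates, compute_model_score averages, OVERALL_SCORE_KEY_NAME tags the mean); the benchmark names and values below are made up.

# Scores start pre-populated: 0.0 for every scored benchmark, None for
# benchmarks that never return a score.
scores = {
    "bond_length_distribution": 0.0,  # scored benchmark
    "another_benchmark": 0.0,         # hypothetical scored benchmark, skipped
    "scaling": None,                  # no score by design
}

# ... the benchmark loop overwrites entries for whatever actually ran ...
scores["bond_length_distribution"] = 0.85

# Average over the entries that carry a score; skipped benchmarks stay at
# 0.0 and therefore pull the mean down instead of being ignored.
with_scores = {k: v for k, v in scores.items() if v is not None}
model_score = sum(with_scores.values()) / len(with_scores)

# Store the mean under the shared key and persist everything in one file.
scores["overall_score"] = model_score  # OVERALL_SCORE_KEY_NAME
print(model_score)  # 0.425: the skipped benchmark still counts as 0.0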
src/mlipaudit/io.py

Lines changed: 27 additions & 2 deletions
@@ -21,13 +21,15 @@
 import numpy as np
 
 from mlipaudit.benchmark import Benchmark, BenchmarkResult, ModelOutput
+from mlipaudit.benchmarks import BENCHMARK_NAMES, BENCHMARKS_WITHOUT_SCORES
 from mlipaudit.io_helpers import (
     dataclass_to_dict_with_arrays,
     dict_with_arrays_to_dataclass,
 )
 
 RESULT_FILENAME = "result.json"
 SCORE_FILENAME = "score.json"
+OVERALL_SCORE_KEY_NAME = "overall_score"
 MODEL_OUTPUT_ZIP_FILENAME = "model_output.zip"
 MODEL_OUTPUT_JSON_FILENAME = "model_output.json"
 MODEL_OUTPUT_ARRAYS_FILENAME = "arrays.npz"
@@ -133,11 +135,33 @@ def load_benchmark_results_from_disk(
     return results
 
 
+def generate_empty_scores_dict() -> dict[str, float | None]:
+    """Generate a scores dict with scores of 0.0 assigned to
+    benchmarks returning a score and scores of None for those
+    that don't.
+
+    Returns:
+        The dictionary of 0 or null scores.
+    """
+    padded_scores: dict[str, float | None] = {}
+    benchmarks_without_scores_names = [b.name for b in BENCHMARKS_WITHOUT_SCORES]
+    for benchmark_name in BENCHMARK_NAMES:
+        if benchmark_name not in benchmarks_without_scores_names:
+            padded_scores[benchmark_name] = 0.0
+        else:
+            padded_scores[benchmark_name] = None
+
+    return padded_scores
+
+
 def write_scores_to_disk(
-    scores: dict[str, float],
+    scores: dict[str, float | None],
     output_dir: str | os.PathLike,
 ) -> None:
-    """Writes the scores to disk.
+    """Writes the scores to disk. This will populate the resulting json
+    with the generated scores, as well as scores of 0.0 for benchmarks
+    that were skipped and scores of None for benchmarks that don't return
+    scores.
 
     Args:
         scores: The results as a dictionary with the benchmark names as keys
@@ -146,6 +170,7 @@ def write_scores_to_disk(
     """
     _output_dir = Path(output_dir)
     _output_dir.mkdir(exist_ok=True, parents=True)
+
     with open(_output_dir / SCORE_FILENAME, "w", encoding="utf-8") as f:
         json.dump(scores, f, indent=2)
 

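A minimal sketch of the score.json that write_scores_to_disk now produces, assuming a toy registry with one scored and one score-less benchmark; the path and values are illustrative, not output from the real suite.

import json
from pathlib import Path

scores = {
    "bond_length_distribution": 0.85,  # ran and was scored
    "scaling": None,                   # listed in BENCHMARKS_WITHOUT_SCORES
    "overall_score": 0.85,             # added by the CLI before writing
}

output_dir = Path("results/my_model")  # hypothetical model directory
output_dir.mkdir(exist_ok=True, parents=True)
with open(output_dir / "score.json", "w", encoding="utf-8") as f:
    json.dump(scores, f, indent=2)  # None is serialised as JSON null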
src/mlipaudit/main.py

Lines changed: 5 additions & 1 deletion
@@ -68,7 +68,11 @@ def _subparse_benchmark(parser):
         help="paths to the model zip archives or python files",
     )
     parser.add_argument(
-        "-o", "--output", required=True, help="path to the output directory"
+        "-o",
+        "--output",
+        required=True,
+        help="path to the output directory;"
+        " will overwrite existing results for a given model",
     )
     parser.add_argument(
         "-i",

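The -o/--output change only reflows the call and extends the help string; a standalone argparse sketch (the parser and program names here are placeholders, not the real CLI entry point):

import argparse

parser = argparse.ArgumentParser(prog="benchmark")  # placeholder prog name
parser.add_argument(
    "-o",
    "--output",
    required=True,
    # Adjacent string literals are concatenated into one help sentence.
    help="path to the output directory;"
    " will overwrite existing results for a given model",
)
args = parser.parse_args(["-o", "results/"])
print(args.output)  # results/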
src/mlipaudit/scoring.py

Lines changed: 29 additions & 0 deletions
@@ -80,3 +80,32 @@ def compute_benchmark_score(
         metric_scores.append(scores.mean())
 
     return float(np.mean(np.array(metric_scores)))
+
+
+def compute_model_score(scores: dict[str, float | None]) -> float:
+    """Compute the score for a model given a dictionary of scores.
+
+    Args:
+        scores: The dictionary of scores to use to compute the average
+            with keys being benchmark_name and values the score.
+            This dictionary should contain the skipped benchmarks
+            too with scores assigned as 0.0 and scores of None for
+            benchmarks that do not return scores.
+
+    Raises:
+        ValueError: If 'Overall score' is a key in the scores dictionary.
+
+    Returns:
+        The mean score.
+    """
+    if "Overall score" in scores:
+        raise ValueError("Overall score should not be part of dictionary.")
+
+    assert len(scores) > 0
+
+    # Ignore benchmarks that don't return scores
+    benchmarks_with_scores = {
+        name: score for name, score in scores.items() if score is not None
+    }
+
+    return sum(benchmarks_with_scores.values()) / len(benchmarks_with_scores)

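A usage sketch for compute_model_score, assuming mlipaudit with this commit is importable; the benchmark names and values are invented. It shows the two rules the docstring spells out: skipped benchmarks contribute 0.0 to the mean, while None entries (benchmarks without scores) are excluded from it entirely.

from mlipaudit.scoring import compute_model_score

scores = {
    "bond_length_distribution": 0.9,  # ran and was scored
    "some_skipped_benchmark": 0.0,    # hypothetical name; skipped, counts as 0
    "scaling": None,                  # produces no score; ignored in the mean
}

print(compute_model_score(scores))  # 0.45, not 0.9

Passing a dict that already contains an 'Overall score' key raises ValueError, so the stored average can never be averaged back into itself.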
src/mlipaudit/ui/leaderboard.py

Lines changed: 11 additions & 3 deletions
@@ -15,7 +15,8 @@
 import pandas as pd
 import streamlit as st
 
-from mlipaudit.benchmarks import BENCHMARK_CATEGORIES
+from mlipaudit.benchmarks import BENCHMARK_CATEGORIES, BENCHMARK_WITH_SCORES_CATEGORIES
+from mlipaudit.io import OVERALL_SCORE_KEY_NAME
 from mlipaudit.ui.utils import (
     color_scores,
     highlight_overall_score,
@@ -88,7 +89,10 @@ def _group_score_df_by_benchmark_category(score_df: pd.DataFrame) -> pd.DataFrame:
         ]
         names_filtered = [b for b in names if b in score_df.columns]
 
-        score_df[category] = score_df[names_filtered].mean(axis=1)
+        score_df[category] = (
+            score_df[names_filtered].sum(axis=1)
+            / BENCHMARK_WITH_SCORES_CATEGORIES[category]
+        )
         score_df = score_df.drop(columns=names_filtered)
 
     columns_in_order = [
@@ -162,7 +166,11 @@ def leaderboard_page(
     df_main = parse_scores_dict_into_df(scores)
 
     # 2. Main Table Display (Common Logic)
-    df_main.sort_values(by="Overall score", ascending=False, inplace=True)
+    df_main.sort_values(
+        by=OVERALL_SCORE_KEY_NAME.replace("_", " ").capitalize(),
+        ascending=False,
+        inplace=True,
+    )
 
     df_grouped_main = _group_score_df_by_benchmark_category(df_main).fillna("N/A")

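The switch from .mean(axis=1) to a row sum divided by BENCHMARK_WITH_SCORES_CATEGORIES[category] changes how missing results are treated. A pandas sketch with invented scores shows the difference: pandas' mean skips NaN cells, while the fixed divisor effectively counts a missing result as 0.

import pandas as pd

# One category with two scored benchmarks; model_B has no result for the
# second one. All values are invented.
df = pd.DataFrame(
    {"bench_a": [0.8, 0.8], "bench_b": [0.6, float("nan")]},
    index=["model_A", "model_B"],
)

n_scored = 2  # what BENCHMARK_WITH_SCORES_CATEGORIES[category] would hold

print(df.mean(axis=1))            # model_B: 0.8 -- NaN silently skipped
print(df.sum(axis=1) / n_scored)  # model_B: 0.4 -- missing result counts as 0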
src/mlipaudit/ui/utils.py

Lines changed: 2 additions & 1 deletion
@@ -19,6 +19,7 @@
 import streamlit as st
 
 from mlipaudit.benchmark import BenchmarkResult
+from mlipaudit.io import OVERALL_SCORE_KEY_NAME
 
 INTERNAL_MODELS_FILE_EXTENSION = "_int"
 EXTERNAL_MODELS_FILE_EXTENSION = "_ext"
@@ -81,7 +82,7 @@ def highlight_overall_score(s: pd.Series) -> list[str]:
     Returns:
         The list of styles to apply to each cell in the Series.
     """
-    if s.name == "Overall score":
+    if s.name == OVERALL_SCORE_KEY_NAME.replace("_", " ").capitalize():
         # Specific background color for 'Overall score'
         bg_r, bg_g, bg_b = (173, 216, 230)  # RGB for Light Blue
         text_color = get_text_color(bg_r, bg_g, bg_b)

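Both UI call sites now derive the display label from the shared key instead of hard-coding "Overall score"; the transformation is just:

OVERALL_SCORE_KEY_NAME = "overall_score"  # value defined in mlipaudit.io
print(OVERALL_SCORE_KEY_NAME.replace("_", " ").capitalize())  # Overall score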