feat: update scoring (#52)

lwalew · chrbrunk · web-flow · commit 8b294e4e39ff · 2025-09-19T18:46:27.000+02:00
* feat: update scoring alpha

* feat: update scoring functions

* feat: update bond length dist

* feat: update conformer selection

* chore: add todos

* feat: update most benchmarks for scoring

* feat: let UI distinguish benchmark categories

* test: updated tests

* test: fix tests

* feat: update solvent rdf score

* feat: update water rdf score

* feat: update solvent rdf score

* fix: divide score by 2

---------

Co-authored-by: Christoph Brunken <c.brunken@instadeep.com>
diff --git a/src/mlipaudit/app.py b/src/mlipaudit/app.py
@@ -230,14 +230,16 @@ def main():
             ring_planarity,
             small_molecule_minimization,
             bond_length_distribution,
-            water_radial_distribution,
-            solvent_radial_distribution,
             reactivity,
         ],
         "Biomolecules": [
             folding_stability,
             sampling,
         ],
+        "Molecular Liquids": [
+            water_radial_distribution,
+            solvent_radial_distribution,
+        ],
         "General": [stability, scaling],
     }
 
@@ -255,6 +257,7 @@ def main():
         pages_to_show = [leaderboard] + (
             page_categories["Small Molecules"]
             + page_categories["Biomolecules"]
+            + page_categories["Molecular Liquids"]
             + page_categories["General"]
         )
 
diff --git a/src/mlipaudit/benchmarks/__init__.py b/src/mlipaudit/benchmarks/__init__.py
@@ -99,4 +99,24 @@
     StabilityBenchmark,
     ScalingBenchmark,
 ]
+
 BENCHMARK_NAMES = [b.name for b in BENCHMARKS]
+
+BENCHMARK_CATEGORIES = {
+    "Small Molecules": [
+        ConformerSelectionBenchmark,
+        DihedralScanBenchmark,
+        TautomersBenchmark,
+        NoncovalentInteractionsBenchmark,
+        RingPlanarityBenchmark,
+        SmallMoleculeMinimizationBenchmark,
+        BondLengthDistributionBenchmark,
+        ReactivityBenchmark,
+    ],
+    "Biomolecules": [FoldingStabilityBenchmark, SamplingBenchmark],
+    "Molecular Liquids": [
+        WaterRadialDistributionBenchmark,
+        SolventRadialDistributionBenchmark,
+    ],
+    "General": [StabilityBenchmark],
+}
diff --git a/src/mlipaudit/benchmarks/bond_length_distribution/bond_length_distribution.py b/src/mlipaudit/benchmarks/bond_length_distribution/bond_length_distribution.py
@@ -43,7 +43,7 @@
     "temperature_kelvin": 300.0,
 }
 
-AVG_DEVIATION_SCORE_THRESHOLD = 0.05
+DEVIATION_SCORE_THRESHOLD = 0.05
 
 
 class Molecule(BaseModel):
@@ -218,23 +218,23 @@ def analyze(self) -> BondLengthDistributionResult:
                 trajectory[:, pattern_indices[0]] - trajectory[:, pattern_indices[1]],
                 axis=1,
             )
-            deviation_trajectory = list(
+            deviation_trajectory = np.abs(
                 bond_length_trajectory - reference_bond_distance
             )
 
             molecule_result = BondLengthDistributionMoleculeResult(
                 molecule_name=molecule_output.molecule_name,
-                deviation_trajectory=deviation_trajectory,
-                avg_deviation=statistics.mean(deviation_trajectory),
+                deviation_trajectory=list(deviation_trajectory),
+                avg_deviation=float(np.mean(deviation_trajectory)),
             )
             results.append(molecule_result)
 
         avg_deviation = statistics.mean(r.avg_deviation for r in results)
 
         score = compute_benchmark_score(
-            [avg_deviation],
+            [[r.avg_deviation for r in results]],
             [
-                AVG_DEVIATION_SCORE_THRESHOLD,
+                DEVIATION_SCORE_THRESHOLD,
             ],
         )
 
diff --git a/src/mlipaudit/benchmarks/conformer_selection/conformer_selection.py b/src/mlipaudit/benchmarks/conformer_selection/conformer_selection.py
@@ -31,8 +31,8 @@
 
 WIGGLE_DATASET_FILENAME = "wiggle150_dataset.json"
 
-AVG_MAE_SCORE_THRESHOLD = 0.5
-AVG_RMSE_SCORE_THRESHOLD = 1.5
+MAE_SCORE_THRESHOLD = 0.5
+RMSE_SCORE_THRESHOLD = 1.5
 
 
 class ConformerSelectionMoleculeResult(BaseModel):
@@ -251,8 +251,8 @@ def analyze(self) -> ConformerSelectionResult:
         avg_rmse = statistics.mean(r.rmse for r in results)
 
         score = compute_benchmark_score(
-            [avg_mae, avg_rmse],
-            [AVG_MAE_SCORE_THRESHOLD, AVG_RMSE_SCORE_THRESHOLD],
+            [[r.mae for r in results], [r.rmse for r in results]],
+            [MAE_SCORE_THRESHOLD, RMSE_SCORE_THRESHOLD],
         )
 
         return ConformerSelectionResult(
diff --git a/src/mlipaudit/benchmarks/dihedral_scan/dihedral_scan.py b/src/mlipaudit/benchmarks/dihedral_scan/dihedral_scan.py
@@ -32,7 +32,7 @@
 
 TORSIONNET_DATASET_FILENAME = "TorsionNet500.json"
 
-MAE_BARRIER_HEIGHT_SCORE_THRESHOLD = 1.0
+BARRIER_HEIGHT_SCORE_THRESHOLD = 1.0
 
 
 class Fragment(BaseModel):
@@ -256,10 +256,9 @@ def analyze(self) -> DihedralScanResult:
 
             results.append(fragment_result)
 
-        mae_barrier_height = statistics.mean(r.barrier_height_error for r in results)
         score = compute_benchmark_score(
-            [mae_barrier_height],
-            [MAE_BARRIER_HEIGHT_SCORE_THRESHOLD],
+            [[r.barrier_height_error for r in results]],
+            [BARRIER_HEIGHT_SCORE_THRESHOLD],
         )
 
         return DihedralScanResult(
diff --git a/src/mlipaudit/benchmarks/folding_stability/folding_stability.py b/src/mlipaudit/benchmarks/folding_stability/folding_stability.py
@@ -57,8 +57,8 @@
     "temperature_kelvin": 300.0,
 }
 
-MIN_RMSD_SCORE_THRESHOLD = 2.0
-MAX_TM_SCORE_THRESHOLD = 0.5
+RMSD_SCORE_THRESHOLD = 2.0
+TM_SCORE_THRESHOLD = 0.5
 
 
 class FoldingStabilityMoleculeResult(BaseModel):
@@ -260,20 +260,20 @@ def analyze(self) -> FoldingStabilityResult:
             )
             molecule_results.append(molecule_result)
 
-        min_rmsd = min(r.avg_rmsd for r in molecule_results)
-        max_tm_score = max(r.avg_tm_score for r in molecule_results)
-
         score = compute_benchmark_score(
-            [min_rmsd, max_tm_score],
-            [MIN_RMSD_SCORE_THRESHOLD, MAX_TM_SCORE_THRESHOLD],
+            [
+                [r.avg_rmsd for r in molecule_results],
+                [r.avg_tm_score for r in molecule_results],
+            ],
+            [RMSD_SCORE_THRESHOLD, TM_SCORE_THRESHOLD],
         )
 
         return FoldingStabilityResult(
             molecules=molecule_results,
             avg_rmsd=statistics.mean(r.avg_rmsd for r in molecule_results),
-            min_rmsd=min_rmsd,
+            min_rmsd=min(r.avg_rmsd for r in molecule_results),
             avg_tm_score=statistics.mean(r.avg_tm_score for r in molecule_results),
-            max_tm_score=max_tm_score,
+            max_tm_score=max(r.avg_tm_score for r in molecule_results),
             avg_match=statistics.mean(r.avg_match for r in molecule_results),
             max_abs_deviation_radius_of_gyration=max(
                 r.max_abs_deviation_radius_of_gyration for r in molecule_results
diff --git a/src/mlipaudit/benchmarks/noncovalent_interactions/noncovalent_interactions.py b/src/mlipaudit/benchmarks/noncovalent_interactions/noncovalent_interactions.py
@@ -58,8 +58,7 @@
     "B": "Boron",
 }
 
-MAE_INTERACTION_ENERGY_SCORE_THRESHOLD = 1.0
-RMSE_INTERACTION_ENERGY_SCORE_THRESHOLD = 1.0
+INTERACTION_ENERGY_SCORE_THRESHOLD = 1.0
 
 
 class NoncovalentInteractionsSystemResult(BenchmarkResult):
@@ -262,17 +261,16 @@ def _compute_metrics_from_system_results(
         )
 
     all_deviations = [system_results.deviation for system_results in results]
-    mae_interaction_energy_all = np.mean(np.abs(all_deviations))
-    rmse_interaction_energy_all = np.sqrt(np.mean(np.array(all_deviations) ** 2))
+    abs_deviations = [np.abs(dev) for dev in all_deviations]
 
     score = compute_benchmark_score(
-        [mae_interaction_energy_all, rmse_interaction_energy_all],
-        [
-            MAE_INTERACTION_ENERGY_SCORE_THRESHOLD,
-            RMSE_INTERACTION_ENERGY_SCORE_THRESHOLD,
-        ],
+        [abs_deviations],
+        [INTERACTION_ENERGY_SCORE_THRESHOLD],
     )
 
+    mae_interaction_energy_all = np.mean(abs_deviations)
+    rmse_interaction_energy_all = np.sqrt(np.mean(np.array(all_deviations) ** 2))
+
     return NoncovalentInteractionsResult(
         systems=results,
         n_skipped_unallowed_elements=n_skipped_unallowed_elements,
diff --git a/src/mlipaudit/benchmarks/reactivity/reactivity.py b/src/mlipaudit/benchmarks/reactivity/reactivity.py
@@ -45,8 +45,8 @@
 
 GRAMBOW_DATASET_FILENAME = "grambow_dataset.json"
 
-MAE_ACTIVATION_ENERGY_SCORE_THRESHOLD = 3.0
-MAE_ENTHALPY_OF_REACTION_SCORE_THRESHOLD = 2.0
+ACTIVATION_ENERGY_SCORE_THRESHOLD = 3.0
+ENTHALPY_OF_REACTION_SCORE_THRESHOLD = 2.0
 
 
 class Molecule(BaseModel):
@@ -285,17 +285,17 @@ def analyze(self) -> ReactivityResult:
             for reaction_result in result.values()
         ])
 
-        mae_activation_energy = float(np.mean(ea_abs_errors))
-        mae_enthalpy_of_reaction = float(np.mean(dh_abs_errors))
-
         score = compute_benchmark_score(
-            [mae_activation_energy, mae_enthalpy_of_reaction],
+            [list(ea_abs_errors), list(dh_abs_errors)],
             [
-                MAE_ACTIVATION_ENERGY_SCORE_THRESHOLD,
-                MAE_ENTHALPY_OF_REACTION_SCORE_THRESHOLD,
+                ACTIVATION_ENERGY_SCORE_THRESHOLD,
+                ENTHALPY_OF_REACTION_SCORE_THRESHOLD,
             ],
         )
 
+        mae_activation_energy = float(np.mean(ea_abs_errors))
+        mae_enthalpy_of_reaction = float(np.mean(dh_abs_errors))
+
         return ReactivityResult(
             reaction_results=result,
             mae_activation_energy=mae_activation_energy,
diff --git a/src/mlipaudit/benchmarks/ring_planarity/ring_planarity.py b/src/mlipaudit/benchmarks/ring_planarity/ring_planarity.py
@@ -44,7 +44,7 @@
     "temperature_kelvin": 300.0,
 }
 
-MAE_DEVIATION_SCORE_THRESHOLD = 0.05
+DEVIATION_SCORE_THRESHOLD = 0.05
 
 
 def deviation_from_plane(coords: np.ndarray) -> float:
@@ -237,7 +237,7 @@ def analyze(self) -> RingPlanarityResult:
 
         mae_deviation = statistics.mean(r.avg_deviation for r in results)
         score = compute_benchmark_score(
-            [mae_deviation], [MAE_DEVIATION_SCORE_THRESHOLD]
+            [[r.avg_deviation for r in results]], [DEVIATION_SCORE_THRESHOLD]
         )
 
         return RingPlanarityResult(
diff --git a/src/mlipaudit/benchmarks/sampling/sampling.py b/src/mlipaudit/benchmarks/sampling/sampling.py
@@ -14,7 +14,6 @@
 
 import functools
 import logging
-import statistics
 from collections import defaultdict
 
 import numpy as np
@@ -221,6 +220,8 @@ class SamplingResult(BenchmarkResult):
             dihedral distribution for each residue type.
         outliers_ratio_sidechain_dihedrals: The ratio of outliers in the sidechain
             dihedral distribution for each residue type.
+        score: The final score for the benchmark between
+            0 and 1.
     """
 
     systems: list[SamplingSystemResult]
@@ -458,8 +459,8 @@ def analyze(self) -> SamplingResult:
 
         score = compute_benchmark_score(
             [
-                statistics.mean(avg_outliers_ratio_backbone.values()),
-                statistics.mean(avg_outliers_ratio_sidechain.values()),
+                list(avg_outliers_ratio_backbone.values()),
+                list(avg_outliers_ratio_sidechain.values()),
             ],
             [
                 OUTLIERS_RATIO_BACKBONE_SCORE_THRESHOLD,
diff --git a/src/mlipaudit/benchmarks/small_molecule_minimization/small_molecule_minimization.py b/src/mlipaudit/benchmarks/small_molecule_minimization/small_molecule_minimization.py
@@ -67,7 +67,7 @@
     "timestep_fs": 0.1,
 }
 
-AVG_RMSD_SCORE_THRESHOLD = 0.075
+RMSD_SCORE_THRESHOLD = 0.075
 
 
 class Molecule(BaseModel):
@@ -292,11 +292,10 @@ def analyze(self) -> SmallMoleculeMinimizationResult:
             )
             result[dataset_prefix] = dataset_result
 
-        result["avg_rmsd"] = statistics.mean(
-            dataset_result.avg_rmsd for dataset_result in result.values()
-        )
-        result["score"] = compute_benchmark_score(
-            [result["avg_rmsd"]], [AVG_RMSD_SCORE_THRESHOLD]
+        all_avg_rsmds = [dataset_result.avg_rmsd for dataset_result in result.values()]
+        result["avg_rmsd"] = statistics.mean(all_avg_rsmds)
+        result["score"] = compute_benchmark_score(  # type: ignore
+            [all_avg_rsmds], [RMSD_SCORE_THRESHOLD]
         )
 
         return SmallMoleculeMinimizationResult(**result)
diff --git a/src/mlipaudit/benchmarks/solvent_radial_distribution/solvent_radial_distribution.py b/src/mlipaudit/benchmarks/solvent_radial_distribution/solvent_radial_distribution.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import math
 import statistics
 
 import mdtraj as md
@@ -25,6 +26,7 @@
 
 from mlipaudit.benchmark import Benchmark, BenchmarkResult, ModelOutput
 from mlipaudit.run_mode import RunMode
+from mlipaudit.scoring import ALPHA
 from mlipaudit.utils import create_mdtraj_trajectory_from_simulation_state
 
 logger = logging.getLogger("mlipaudit")
@@ -103,13 +105,15 @@ class SolventRadialDistributionStructureResult(BaseModel):
             the radius at which the rdf is the maximum.
         peak_deviation: The deviation of the
             first solvent peak from the reference.
+        score: The score for the molecule.
     """
 
     structure_name: str
     radii: list[float]
     rdf: list[float]
     first_solvent_peak: float
     peak_deviation: NonNegativeFloat
+    score: float
 
 
 class SolventRadialDistributionResult(BenchmarkResult):
@@ -119,6 +123,8 @@ class SolventRadialDistributionResult(BenchmarkResult):
         structure_names: The names of the structures.
         structures: List of per structure results.
         avg_peak_deviation: The average deviation across all structures.
+        score: The final score for the benchmark between
+            0 and 1.
     """
 
     structure_names: list[str]
@@ -238,14 +244,20 @@ def analyze(self) -> SolventRadialDistributionResult:
             ].item()
             rdf = g_r.tolist()
 
+            peak_deviation = abs(
+                first_solvent_peak - REFERENCE_MAXIMA[system_name]["distance"]
+            )
+            score = math.exp(
+                -ALPHA * peak_deviation / REFERENCE_MAXIMA[system_name]["distance"]
+            )
+
             structure_result = SolventRadialDistributionStructureResult(
                 structure_name=system_name,
                 radii=radii.tolist(),
                 rdf=rdf,
                 first_solvent_peak=first_solvent_peak,
-                peak_deviation=abs(
-                    first_solvent_peak - REFERENCE_MAXIMA[system_name]["distance"]
-                ),
+                peak_deviation=peak_deviation,
+                score=score,
             )
 
             structure_results.append(structure_result)
@@ -256,6 +268,7 @@ def analyze(self) -> SolventRadialDistributionResult:
             avg_peak_deviation=statistics.mean(
                 structure.peak_deviation for structure in structure_results
             ),
+            score=statistics.mean(r.score for r in structure_results),
         )
 
     @property
diff --git a/src/mlipaudit/benchmarks/tautomers/tautomers.py b/src/mlipaudit/benchmarks/tautomers/tautomers.py
@@ -193,7 +193,9 @@ def analyze(self) -> TautomersResult:
         mae = statistics.mean(r.abs_deviation for r in molecule_results)
         mse = statistics.mean(r.abs_deviation**2 for r in molecule_results)
 
-        score = compute_benchmark_score([mae], [MAE_SCORE_THRESHOLD])
+        score = compute_benchmark_score(
+            [[r.abs_deviation for r in molecule_results]], [MAE_SCORE_THRESHOLD]
+        )
 
         return TautomersResult(
             molecules=molecule_results, mae=mae, rmse=math.sqrt(mse), score=score
diff --git a/src/mlipaudit/benchmarks/water_radial_distribution/water_radial_distribution.py b/src/mlipaudit/benchmarks/water_radial_distribution/water_radial_distribution.py
diff --git a/src/mlipaudit/scoring.py b/src/mlipaudit/scoring.py
diff --git a/src/mlipaudit/ui/leaderboard.py b/src/mlipaudit/ui/leaderboard.py
diff --git a/tests/test_scoring.py b/tests/test_scoring.py

Original file line number	Diff line number	Diff line change
`@@ -43,7 +43,7 @@`
`43`	`43`	`"temperature_kelvin": 300.0,`
`44`	`44`	`}`
`45`	`45`
`46`		`-AVG_DEVIATION_SCORE_THRESHOLD = 0.05`
	`46`	`+DEVIATION_SCORE_THRESHOLD = 0.05`
`47`	`47`
`48`	`48`
`49`	`49`	`class Molecule(BaseModel):`
`@@ -218,23 +218,23 @@ def analyze(self) -> BondLengthDistributionResult:`
`218`	`218`	`trajectory[:, pattern_indices[0]] - trajectory[:, pattern_indices[1]],`
`219`	`219`	`axis=1,`
`220`	`220`	`)`
`221`		`- deviation_trajectory = list(`
	`221`	`+ deviation_trajectory = np.abs(`
`222`	`222`	`bond_length_trajectory - reference_bond_distance`
`223`	`223`	`)`
`224`	`224`
`225`	`225`	`molecule_result = BondLengthDistributionMoleculeResult(`
`226`	`226`	`molecule_name=molecule_output.molecule_name,`
`227`		`- deviation_trajectory=deviation_trajectory,`
`228`		`- avg_deviation=statistics.mean(deviation_trajectory),`
	`227`	`+ deviation_trajectory=list(deviation_trajectory),`
	`228`	`+ avg_deviation=float(np.mean(deviation_trajectory)),`
`229`	`229`	`)`
`230`	`230`	`results.append(molecule_result)`
`231`	`231`
`232`	`232`	`avg_deviation = statistics.mean(r.avg_deviation for r in results)`
`233`	`233`
`234`	`234`	`score = compute_benchmark_score(`
`235`		`- [avg_deviation],`
	`235`	`+ [[r.avg_deviation for r in results]],`
`236`	`236`	`[`
`237`		`- AVG_DEVIATION_SCORE_THRESHOLD,`
	`237`	`+ DEVIATION_SCORE_THRESHOLD,`
`238`	`238`	`],`
`239`	`239`	`)`
`240`	`240`
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@`
`44`	`44`	`"temperature_kelvin": 300.0,`
`45`	`45`	`}`
`46`	`46`
`47`		`-MAE_DEVIATION_SCORE_THRESHOLD = 0.05`
	`47`	`+DEVIATION_SCORE_THRESHOLD = 0.05`
`48`	`48`
`49`	`49`
`50`	`50`	`def deviation_from_plane(coords: np.ndarray) -> float:`
`@@ -237,7 +237,7 @@ def analyze(self) -> RingPlanarityResult:`
`237`	`237`
`238`	`238`	`mae_deviation = statistics.mean(r.avg_deviation for r in results)`
`239`	`239`	`score = compute_benchmark_score(`
`240`		`- [mae_deviation], [MAE_DEVIATION_SCORE_THRESHOLD]`
	`240`	`+ [[r.avg_deviation for r in results]], [DEVIATION_SCORE_THRESHOLD]`
`241`	`241`	`)`
`242`	`242`
`243`	`243`	`return RingPlanarityResult(`