instadeepai
diff --git a/‎.github/workflows/deploy_docs.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/deploy_docs.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/tests_and_linters.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/tests_and_linters.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/api_reference/index.rst‎
Lines changed: 1 addition & 0 deletions b/‎docs/source/api_reference/index.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/source/api_reference/io.rst‎
Lines changed: 6 additions & 0 deletions b/‎docs/source/api_reference/io.rst‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/source/api_reference/scoring.rst‎
Lines changed: 10 additions & 0 deletions b/‎docs/source/api_reference/scoring.rst‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/source/benchmarks/small_molecules/img/dihedral_scan‎
-109 KB b/‎docs/source/benchmarks/small_molecules/img/dihedral_scan‎
-109 KB
diff --git a/‎docs/source/tutorials/cli/index.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/tutorials/cli/index.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 3 additions & 1 deletion b/‎pyproject.toml‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎src/mlipaudit/app.py‎
Lines changed: 40 additions & 20 deletions b/‎src/mlipaudit/app.py‎
Lines changed: 40 additions & 20 deletions
diff --git a/‎src/mlipaudit/benchmark.py‎
Lines changed: 11 additions & 3 deletions b/‎src/mlipaudit/benchmark.py‎
Lines changed: 11 additions & 3 deletions
@@ -34,6 +34,7 @@ jobs:
 
       - name: Build documentation
         run: |
+          uv sync --group dev --group jax_md
           uv run sphinx-build -b html docs/source/ _build/
 
       - name: Deploy to GitHub Pages
 
@@ -63,6 +63,7 @@ jobs:
 
       - name: Run tests 🧪
         run: |
+          uv sync --group dev --group jax_md
           uv run pytest --verbose --cov-report xml:coverage.xml \
                             --cov-report term-missing \
                             --junitxml=pytest.xml \
 
@@ -14,6 +14,7 @@ Base classes and utilities
     benchmark
     io
     run_mode
+    scoring
     utils/trajectory_helpers
 
 Benchmark implementations
 
@@ -11,6 +11,12 @@ I/O of model outputs and benchmark results
 
 .. autofunction:: load_benchmark_results_from_disk
 
+.. autofunction:: write_scores_to_disk
+
+.. autofunction:: load_score_from_disk
+
+.. autofunction:: load_scores_from_disk
+
 .. autofunction:: write_model_output_to_disk
 
 .. autofunction:: load_model_output_from_disk
@@ -0,0 +1,10 @@
+.. _scoring:
+
+.. module:: mlipaudit.scoring
+
+Scoring
+=======
+
+.. autofunction:: compute_metric_score
+
+.. autofunction:: compute_benchmark_score
@@ -39,7 +39,7 @@ The tool has the following command line options:
   list of benchmark names (e.g., ``dihedral_scan``, ``ring_planarity``) or ``all`` to
   run all available benchmarks which is also the default which means that if this flag
   is not used, all benchmarks will be run.
-* ``--run-mode``: *Optional* setting that allows to run faster versions of the
+* ``-rm / --run-mode``: *Optional* setting that allows running faster versions of the
   benchmark suite. The default option ``standard`` which runs the entire suite.
   The option ``fast`` runs a slightly faster version for some of the very long-running
   benchmarks. The option ``dev`` runs a very minimal version of each benchmark for
 
@@ -16,7 +16,6 @@ dependencies = [
     "vl-convert-python>=1.8.0",
     "mdtraj>=1.10.3",
     "tmtools==0.2.0",
-    "jax-md",
 ]
 
 [project.scripts]
@@ -45,6 +44,9 @@ gpu = [
     "jax[cuda12]==0.4.33",
     "jaxlib==0.4.33"
 ]
+jax_md = [
+    "jax-md",
+]
 
 [tool.coverage.run]
 omit = [
 
@@ -26,7 +26,7 @@
 from mlipaudit.conformer_selection import ConformerSelectionBenchmark
 from mlipaudit.dihedral_scan import DihedralScanBenchmark
 from mlipaudit.folding_stability import FoldingStabilityBenchmark
-from mlipaudit.io import load_benchmark_results_from_disk
+from mlipaudit.io import load_benchmark_results_from_disk, load_scores_from_disk
 from mlipaudit.noncovalent_interactions import NoncovalentInteractionsBenchmark
 from mlipaudit.reactivity import ReactivityBenchmark
 from mlipaudit.ring_planarity import RingPlanarityBenchmark
@@ -43,6 +43,7 @@
     conformer_selection_page,
     dihedral_scan_page,
     folding_stability_page,
+    leaderboard_page,
     noncovalent_interactions_page,
     reactivity_page,
     ring_planarity_page,
@@ -54,6 +55,9 @@
     tautomers_page,
     water_radial_distribution_page,
 )
+from mlipaudit.ui.utils import (
+    remove_model_name_extensions_and_capitalize_benchmark_names,
+)
 from mlipaudit.water_radial_distribution import (
     WaterRadialDistributionBenchmark,
 )
@@ -105,24 +109,40 @@ def main():
             "You must provide the results directory as a command line argument, "
             "like this: mlipauditapp /path/to/results"
         )
+    is_public = False
+    if len(sys.argv) == 3 and sys.argv[2] == "__hf":
+        is_public = True
+    else:
+        if not Path(sys.argv[1]).exists():
+            raise RuntimeError("The specified results directory does not exist.")
+
+    results_dir = sys.argv[1]
 
-    if not Path(sys.argv[1]).exists():
-        raise RuntimeError("The specified results directory does not exist.")
+    results = load_benchmark_results_from_disk(results_dir, BENCHMARKS)
+    scores = load_scores_from_disk(scores_dir=results_dir)
 
-    data = load_benchmark_results_from_disk(sys.argv[1], BENCHMARKS)
+    if is_public:
+        remove_model_name_extensions_and_capitalize_benchmark_names(results)
+
+    leaderboard = st.Page(
+        functools.partial(leaderboard_page, scores=scores, is_public=is_public),
+        title="Leaderboard",
+        icon=":material/trophy:",
+        default=True,
+    )
 
     conformer_selection = st.Page(
         functools.partial(
             conformer_selection_page,
-            data_func=_data_func_from_key("conformer_selection", data),
+            data_func=_data_func_from_key("conformer_selection", results),
         ),
         title="Conformer selection",
         url_path="conformer_selection",
     )
     dihedral_scan = st.Page(
         functools.partial(
             dihedral_scan_page,
-            data_func=_data_func_from_key("dihedral_scan", data),
+            data_func=_data_func_from_key("dihedral_scan", results),
         ),
         title="Dihedral scan",
         url_path="dihedral_scan",
@@ -131,23 +151,23 @@ def main():
     tautomers = st.Page(
         functools.partial(
             tautomers_page,
-            data_func=_data_func_from_key("tautomers", data),
+            data_func=_data_func_from_key("tautomers", results),
         ),
         title="Tautomers",
         url_path="tautomers",
     )
     noncovalent_interactions = st.Page(
         functools.partial(
             noncovalent_interactions_page,
-            data_func=_data_func_from_key("noncovalent_interactions", data),
+            data_func=_data_func_from_key("noncovalent_interactions", results),
         ),
         title="Noncovalent Interactions",
         url_path="noncovalent_interactions",
     )
     ring_planarity = st.Page(
         functools.partial(
             ring_planarity_page,
-            data_func=_data_func_from_key("ring_planarity", data),
+            data_func=_data_func_from_key("ring_planarity", results),
         ),
         title="Ring planarity",
         url_path="ring_planarity",
@@ -156,7 +176,7 @@ def main():
     small_molecule_minimization = st.Page(
         functools.partial(
             small_molecule_minimization_page,
-            data_func=_data_func_from_key("small_molecule_minimization", data),
+            data_func=_data_func_from_key("small_molecule_minimization", results),
         ),
         title="Small molecule minimization",
         url_path="small_molecule_minimization",
@@ -165,7 +185,7 @@ def main():
     reactivity = st.Page(
         functools.partial(
             reactivity_page,
-            data_func=_data_func_from_key("reactivity", data),
+            data_func=_data_func_from_key("reactivity", results),
         ),
         title="Reactivity",
         url_path="reactivity",
@@ -174,7 +194,7 @@ def main():
     folding_stability = st.Page(
         functools.partial(
             folding_stability_page,
-            data_func=_data_func_from_key("folding_stability", data),
+            data_func=_data_func_from_key("folding_stability", results),
         ),
         title="Protein folding stability",
         url_path="protein_folding_stability",
@@ -183,7 +203,7 @@ def main():
     bond_length_distribution = st.Page(
         functools.partial(
             bond_length_distribution_page,
-            data_func=_data_func_from_key("bond_length_distribution", data),
+            data_func=_data_func_from_key("bond_length_distribution", results),
         ),
         title="Bond length distribution",
         url_path="bond_length_distribution",
@@ -192,7 +212,7 @@ def main():
     sampling = st.Page(
         functools.partial(
             sampling_page,
-            data_func=_data_func_from_key("sampling", data),
+            data_func=_data_func_from_key("sampling", results),
         ),
         title="Protein sampling",
         url_path="sampling",
@@ -201,7 +221,7 @@ def main():
     water_radial_distribution = st.Page(
         functools.partial(
             water_radial_distribution_page,
-            data_func=_data_func_from_key("water_radial_distribution", data),
+            data_func=_data_func_from_key("water_radial_distribution", results),
         ),
         title="Water radial distribution function",
         url_path="water_radial_distribution_function",
@@ -210,7 +230,7 @@ def main():
     solvent_radial_distribution = st.Page(
         functools.partial(
             solvent_radial_distribution_page,
-            data_func=_data_func_from_key("solvent_radial_distribution", data),
+            data_func=_data_func_from_key("solvent_radial_distribution", results),
         ),
         title="Solvent radial distribution",
         url_path="solvent_radial_distribution",
@@ -219,7 +239,7 @@ def main():
     stability = st.Page(
         functools.partial(
             stability_page,
-            data_func=_data_func_from_key("stability", data),
+            data_func=_data_func_from_key("stability", results),
         ),
         title="Stability",
         url_path="stability",
@@ -228,7 +248,7 @@ def main():
     scaling = st.Page(
         functools.partial(
             scaling_page,
-            data_func=_data_func_from_key("scaling", data),
+            data_func=_data_func_from_key("scaling", results),
         ),
         title="Scaling",
         url_path="scaling",
@@ -266,14 +286,14 @@ def main():
 
     # Filter pages based on selection
     if selected_category == "All Categories":
-        pages_to_show = (
+        pages_to_show = [leaderboard] + (
             page_categories["Small Molecules"]
             + page_categories["Biomolecules"]
             + page_categories["General"]
         )
 
     else:
-        pages_to_show = page_categories[selected_category]
+        pages_to_show = [leaderboard] + page_categories[selected_category]
 
     # Set up navigation in main area
     pg = st.navigation(pages_to_show)
 
@@ -21,7 +21,7 @@
 from ase import Atom
 from huggingface_hub import hf_hub_download
 from mlip.models import ForceField
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 
 from mlipaudit.exceptions import ChemicalElementsMissingError
 from mlipaudit.run_mode import RunMode
@@ -30,7 +30,14 @@
 
 
 class BenchmarkResult(BaseModel):
-    """A base model for all benchmark results."""
+    """A base model for all benchmark results.
+
+    Attributes:
+        score: The final score for the benchmark, between 0 and 1
+            (inclusive). Defaults to `None`.
+    """
+
+    score: float | None = Field(ge=0, le=1, default=None)
 
 
 class ModelOutput(BaseModel):
@@ -212,7 +219,8 @@ def analyze(self) -> BenchmarkResult:
 
         Subclasses must implement this method. This method
         processes the raw data generated from the generation step
-        to compute final metrics.
+        to compute final metrics. Subclasses are also responsible
+        for computing the final score for the benchmark.
 
         Returns:
             A class-specific instance of `BenchmarkResult`.