
Commit ce38464

make the leaderboard dependencies into an optional target under instructlab-eval[leaderboard]
Signed-off-by: Oleg Silkin <[email protected]>
1 parent: aa573d9

5 files changed (+27 −4 lines)

README.md

Lines changed: 10 additions & 0 deletions
@@ -29,6 +29,16 @@ the phase. At the end of each phase, we evaluate all the checkpoints in order to
 Once training is complete, and we have picked the best checkpoint from the output of the final phase, we can run full-scale evaluation suite which runs MT-Bench, MMLU,
 MT-Bench Branch and MMLU Branch.
 
+### Leaderboard Evaluation
+
+For cases when you want to run the full Open LLM Leaderboard v2 evaluation suite, we provide an optional dependency package for the leaderboard tasks. This includes additional benchmarks like GPQA, IFEVAL, BBH, MMLU-PRO, MUSR, and MATH-HARD.
+
+To install the optional leaderboard dependencies, use:
+
+```bash
+pip install instructlab-eval[leaderboard]
+```
+
 ## Methods of Evaluation
 
 Below are more in-depth explanations of the suite of benchmarks we are using as methods for evaluation of models.
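
A practical note on the install command introduced above: in shells such as zsh, square brackets are glob characters, so the extra is typically quoted as `pip install "instructlab-eval[leaderboard]"`; from a source checkout, `pip install -e ".[leaderboard]"` pulls in the same optional set.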

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -54,6 +54,7 @@ package-dir = {"" = "src"}
 
 [tool.setuptools.dynamic]
 dependencies = {file = ["requirements.txt"]}
+optional-dependencies = {leaderboard = {file = ["requirements-leaderboard.txt"]}}
 
 [tool.setuptools.packages.find]
 where = ["src"]
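
Because the extra is declared dynamically through a requirements file rather than as an inline list, it can be handy to confirm the wiring after an install. Below is a hypothetical check (not part of the commit), assuming the distribution name `instructlab-eval` used in the README:

```python
# Hypothetical verification snippet: list the requirements that sit behind the
# new "leaderboard" extra of an installed instructlab-eval distribution.
from importlib.metadata import requires

reqs = requires("instructlab-eval") or []
# Requirements contributed by an extra carry an `extra == "leaderboard"` marker;
# this substring check is a rough filter, not a full marker parser.
print("\n".join(r for r in reqs if "leaderboard" in r))
```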

requirements.txt

Lines changed: 2 additions & 3 deletions
@@ -8,8 +8,7 @@ transformers
 accelerate
 pandas
 pandas-stubs
-# All optional dependencies like this can be found in lm-eval:
-# https://github.com/EleutherAI/lm-evaluation-harness/blob/main/pyproject.toml
-lm-eval[math,ifeval,sentencepiece,vllm]>=0.4.4
+# Base lm-eval dependency
+lm-eval>=0.4.4
 httpx
 ragas>=0.2.11
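
The lm-eval extras dropped here (math, ifeval, sentencepiece, vllm) are what made the base install heavy; presumably they reappear behind the new leaderboard target via requirements-leaderboard.txt, which is not among the five files changed in this commit and is therefore assumed to already exist in the tree.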

scripts/test_leaderboard.py

Lines changed: 11 additions & 1 deletion
@@ -1,9 +1,19 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: Apache-2.0
+
+# NOTE: This script requires the leaderboard optional dependencies.
+# Install with: pip install instructlab-eval[leaderboard]
+
 # First Party
+import json
 from instructlab.eval.leaderboard import LeaderboardV2Evaluator
 
 if __name__ == "__main__":
     evaluator = LeaderboardV2Evaluator(
-        model_path="ibm-granite/granite-3.1-8b-instruct",
+        model_path="ibm-granite/granite-3.1-8b-base",
+        eval_config={
+            "apply_chat_template": False,
+        },
     )
     results = evaluator.run()
     print("got results from leaderboard v2")

src/instructlab/eval/leaderboard.py

Lines changed: 3 additions & 0 deletions
@@ -541,6 +541,9 @@ def calculate_overall_leaderboard_score(results: t.Dict[str, ParsedScores]) -> f
 class LeaderboardV2Evaluator(Evaluator):
     """
     Evaluator for Open Leaderboard v2.
+
+    NOTE: This evaluator requires the optional leaderboard dependencies.
+    Install with: pip install instructlab-eval[leaderboard]
     """
 
     name = "leaderboard_v2"
