Commit c70b9de

feat: add ability to run RULER benchmark against a local openai endpoint
Signed-off-by: Jaideep Rao <jrao@redhat.com>
1 parent cea8acd commit c70b9de

4 files changed, 167 insertions(+), 0 deletions(-)

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ issues = "https://github.com/instructlab/eval/issues"
 "mt_bench" = "instructlab.eval.mt_bench:MTBenchEvaluator"
 "mt_bench_branch" = "instructlab.eval.mt_bench:MTBenchBranchEvaluator"
 "leaderboard_v2" = "instructlab.eval.leaderboard:LeaderboardV2Evaluator"
+"ruler" = "instructlab.eval.ruler:RulerEvaluator"

 [tool.setuptools_scm]
 version_file = "src/instructlab/eval/_version.py"
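
The "ruler" entry point registers RulerEvaluator in the instructlab.eval.evaluator group alongside the existing evaluators. As a rough sketch of how a consumer could discover and load it (assuming the package is installed so its entry-point metadata is visible):

# Discover and load the "ruler" evaluator through the entry point declared above
from importlib.metadata import entry_points

eps = entry_points(group="instructlab.eval.evaluator")
ruler_ep = next(ep for ep in eps if ep.name == "ruler")
RulerEvaluator = ruler_ep.load()  # resolves to instructlab.eval.ruler:RulerEvaluator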

requirements-ruler.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+lm-eval[ruler]>=0.4.8

src/instructlab/eval/ruler.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
# Standard
from typing import Any, Dict, List, Optional
import json
import os
import pathlib

# Third Party
from lm_eval.evaluator import simple_evaluate

# First Party
from instructlab.eval.evaluator import Evaluator

RULER_TASKS = [
    "niah_single_1",
    "niah_single_2",
    "niah_single_3",
    "niah_multikey_1",
    "niah_multikey_2",
    "niah_multikey_3",
    "niah_multiquery",
    "niah_multivalue",
    "ruler_vt",
    "ruler_cwe",
    "ruler_fwe",
    "ruler_qa_hotpot",
    "ruler_qa_squad",
]


class RulerEvaluator(Evaluator):
    """
    Class definition for running RULER benchmarking tasks.
    """

    name = "ruler"

    def __init__(
        self,
        model_path: Optional[str] = None,
        output_file: Optional[str] = None,
        tasks: list[str] = RULER_TASKS,
        num_gpus: Optional[int] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        vllm_config: Optional[Dict[str, Any]] = None,
        hf_config: Optional[Dict[str, Any]] = None,
        openai_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
        max_length: Optional[int] = None,
    ) -> None:
        self.model_path = model_path
        self.tasks = tasks
        self.results: Dict[Any, Any] = {}
        self.output_file = output_file

        # Store evaluation configurations
        self.eval_config = eval_config or {}
        self.vllm_config = vllm_config or {}
        self.hf_config = hf_config or {}
        self.openai_config = openai_config or {}

        self.api_endpoint = api_endpoint or None
        self.num_gpus = num_gpus
        self.max_length = max_length or 4096

    def save_to_file(self, output_file: Optional[str] = None) -> None:
        """Save results to a JSON file"""
        output_file = output_file or self.output_file
        if not output_file:
            raise ValueError("Output file path cannot be empty")

        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self.results, f, indent=2)

    def process_lm_eval_results(
        self,
        fpath: Optional[pathlib.Path] = None,
        raw_results: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Process the evaluation results from lm_eval for the given file path and extract
        aggregated scores for each context length.

        Args:
            fpath (pathlib.Path): The file path to the evaluation results.
            raw_results (dict): Already-loaded lm_eval results, used when no file path is given.
        """
        unique_metrics_dict: dict[str, Any] = {}

        def extract_metrics(results: dict, unique_metrics_dict: dict):
            # Recursively collect every non-stderr metric value from the nested results dict
            for k, v in results.items():
                if isinstance(v, dict):
                    extract_metrics(v, unique_metrics_dict)
                else:
                    if "stderr" not in k:
                        metric = k.split(",")[0]
                        if metric not in unique_metrics_dict:
                            unique_metrics_dict[metric] = []
                        unique_metrics_dict[metric].append(v)

            return unique_metrics_dict

        if fpath:
            with open(fpath, "r", encoding="utf-8") as f:
                raw_results = json.load(f)

        if raw_results is not None:
            extract_metrics(raw_results["results"], unique_metrics_dict)
            unique_float_metrics = {}
            # if a value is a list of floats, average the list
            for k, v in unique_metrics_dict.items():
                if isinstance(v, list) and all(isinstance(i, float) for i in v):
                    unique_float_metrics[k] = sum(v) / len(v)

            # find the average of all float values in the dict
            float_values = [
                v for v in unique_float_metrics.values() if isinstance(v, float)
            ]
            if float_values:
                unique_float_metrics["avg"] = sum(float_values) / len(float_values)
            else:
                unique_float_metrics["avg"] = 0.0

            # result format
            # {'8192': 0.90, '32768': 0.82, '65536': 0.77, '131072': 0.71, 'avg': 0.80}
            self.results = unique_float_metrics

    def run(
        self,
        model_path: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        output_file: Optional[str] = None,
        api_endpoint: Optional[str] = None,
        max_length: Optional[int] = 4096,
    ) -> None:
        """
        Run the RULER evaluation using the specified model and tasks.
        """

        model_path = self.model_path if model_path is None else model_path
        tasks = self.tasks if not tasks else tasks
        output_file = self.output_file if not output_file else output_file
        # fall back to the endpoint supplied at construction time when none is passed here
        api_endpoint = self.api_endpoint if api_endpoint is None else api_endpoint

        # Prepare model_args for lm-eval's local-completions backend
        model_args = {
            "pretrained": model_path,
            "base_url": api_endpoint,
            "max_length": max_length,
        }

        lm_eval_results = simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=tasks,
        )

        self.process_lm_eval_results(
            raw_results=lm_eval_results,
        )
        # write results to file
        if output_file:
            # os.makedirs(os.path.dirname(output_file), exist_ok=True)
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(self.results, f, indent=2)
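
Taken together, the module runs the RULER tasks through lm-eval's local-completions backend against an OpenAI-compatible completions server. A minimal usage sketch, assuming such a server is already running locally; the served model name, endpoint URL, and output path below are illustrative placeholders, not values taken from this commit:

# Evaluate a locally served model on the RULER tasks.
# The model name, URL, and output path are hypothetical placeholders.
from instructlab.eval.ruler import RulerEvaluator

evaluator = RulerEvaluator(model_path="my-served-model")
evaluator.run(api_endpoint="http://localhost:8000/v1/completions")
print(evaluator.results)                      # aggregated RULER scores plus an "avg" key
evaluator.save_to_file("results/ruler.json")  # creates the results/ directory if needed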

tests/test_project.py

Lines changed: 2 additions & 0 deletions
@@ -7,6 +7,7 @@
 from instructlab.eval.leaderboard import LeaderboardV2Evaluator
 from instructlab.eval.mmlu import MMLUBranchEvaluator, MMLUEvaluator
 from instructlab.eval.mt_bench import MTBenchBranchEvaluator, MTBenchEvaluator
+from instructlab.eval.ruler import RulerEvaluator


 def test_evaluator_eps():
@@ -16,6 +17,7 @@ def test_evaluator_eps():
         "mt_bench": MTBenchEvaluator,
         "mt_bench_branch": MTBenchBranchEvaluator,
         "leaderboard_v2": LeaderboardV2Evaluator,
+        "ruler": RulerEvaluator,
     }
     eps = entry_points(group="instructlab.eval.evaluator")
     found = {}
