Commit f1ae970

committed
test run
Signed-off-by: Jaideep Rao <[email protected]>
1 parent cea8acd commit f1ae970

File tree

1 file changed: +171 −0 lines


src/instructlab/eval/ruler.py

Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
# Standard
from typing import Any, Dict, List, Optional
import json
import os
import pathlib

# Third Party
from lm_eval.evaluator import simple_evaluate

# First Party
from instructlab.eval.evaluator import Evaluator

RULER_TASKS = [
    "niah_single_1",
    "niah_single_2",
    "niah_single_3",
    "niah_multikey_1",
    "niah_multikey_2",
    "niah_multikey_3",
    "niah_multiquery",
    "niah_multivalue",
    "ruler_vt",
    "ruler_cwe",
    "ruler_fwe",
    "ruler_qa_hotpot",
    "ruler_qa_squad",
]
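# For reference: the niah_* entries are needle-in-a-haystack retrieval
# variants, ruler_vt is variable tracking, ruler_cwe / ruler_fwe are common /
# frequent word extraction, and ruler_qa_* are long-context QA over HotpotQA
# and SQuAD.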


class RulerEvaluator(Evaluator):
    """
    Class definition for running RULER benchmarking tasks.
    """

    def __init__(
        self,
        model_path: Optional[str] = None,
        output_file: Optional[str] = None,
        tasks: List[str] = RULER_TASKS,
        num_gpus: Optional[int] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        vllm_config: Optional[Dict[str, Any]] = None,
        hf_config: Optional[Dict[str, Any]] = None,
        openai_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
        max_length: Optional[int] = None,
    ) -> None:
        self.model_path = model_path
        self.tasks = tasks
        self.results: Dict[Any, Any] = {}
        self.output_file = output_file

        # Store evaluation configurations
        self.eval_config = eval_config or {}
        self.vllm_config = vllm_config or {}
        self.hf_config = hf_config or {}
        self.openai_config = openai_config or {}

        self.api_endpoint = api_endpoint or None
        self.num_gpus = num_gpus
        self.max_length = max_length or 4096

    def save_to_file(self, output_file: Optional[str] = None) -> None:
        """Save results to a JSON file"""
        output_file = output_file or self.output_file
        if not output_file:
            raise ValueError("Output file path cannot be empty")

        # Only create a directory when the path actually contains one;
        # os.makedirs("") would raise for bare filenames like "results.json".
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self.results, f, indent=2)

    def process_lm_eval_results(
        self,
        fpath: Optional[pathlib.Path] = None,
        raw_results: Optional[dict[str, Any]] = None,
    ) -> None:
        """
        Process the evaluation results from lm_eval for the given file path and
        extract aggregated scores for each context length.

        Args:
            fpath (pathlib.Path): The file path to the evaluation results.
            raw_results (dict): Already-loaded lm_eval results, used when no
                file path is given.
        """
        unique_metrics_dict: dict[str, Any] = {}

        def extract_metrics(results: dict, unique_metrics_dict: dict) -> dict:
            # Recursively walk the nested results, collecting every non-stderr
            # metric value under its metric name (the part before the comma).
            for k, v in results.items():
                if isinstance(v, dict):
                    extract_metrics(v, unique_metrics_dict)
                else:
                    if "stderr" not in k:
                        metric = k.split(",")[0]
                        if metric not in unique_metrics_dict:
                            unique_metrics_dict[metric] = []
                        unique_metrics_dict[metric].append(v)

            return unique_metrics_dict

        if fpath:
            with open(fpath, "r", encoding="utf-8") as f:
                raw_results = json.load(f)

        if raw_results is not None:
            extract_metrics(raw_results["results"], unique_metrics_dict)
            unique_float_metrics = {}
            # if the value is a list of floats, average the list
            for k, v in unique_metrics_dict.items():
                if isinstance(v, list) and all(isinstance(i, float) for i in v):
                    unique_float_metrics[k] = sum(v) / len(v)

            # find the average of all float values in the dict
            float_values = [
                v for v in unique_float_metrics.values() if isinstance(v, float)
            ]
            if float_values:
                unique_float_metrics["avg"] = sum(float_values) / len(float_values)
            else:
                unique_float_metrics["avg"] = 0.0

            # result format
            # {'8192': 0.90, '32768': 0.82, '65536': 0.77, '131072': 0.71, 'avg': 0.80}
            self.results = unique_float_metrics

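    # Illustrative sketch (not output from a real run) of the aggregation the
    # method above performs: lm_eval reports metric keys such as
    # "<metric>,none"; extract_metrics drops the ",none" suffix, skips the
    # *_stderr entries, collects values per metric name, averages each list of
    # floats, and adds an overall "avg" entry, e.g.
    #   raw:        {"results": {"niah_single_1": {"4096,none": 0.91}}}
    #   aggregated: {"4096": 0.91, "avg": 0.91}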
    def run(
        self,
        model_path: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        output_file: Optional[str] = None,
        api_endpoint: Optional[str] = None,
        max_length: Optional[int] = None,
    ) -> None:
        """
        Run the RULER evaluation using the specified model and tasks.
        """

        # Fall back to the values given at construction time
        model_path = self.model_path if model_path is None else model_path
        tasks = self.tasks if not tasks else tasks
        output_file = self.output_file if not output_file else output_file
        api_endpoint = self.api_endpoint if api_endpoint is None else api_endpoint
        max_length = self.max_length if max_length is None else max_length

        # Prepare model_args
        model_args = {
            "pretrained": model_path,
            "base_url": api_endpoint,
            "max_length": max_length,
        }

        lm_eval_results = simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=tasks,
        )

        self.process_lm_eval_results(
            raw_results=lm_eval_results,
        )
        # write results to file
        if output_file:
            self.save_to_file(output_file)


if __name__ == "__main__":
    RE = RulerEvaluator(
        model_path="microsoft/phi-4-mini-instruct",
        output_file="ruler_results.json",
        tasks=["niah_single_1"],
        api_endpoint="http://localhost:8000/v1/completions",
        max_length=4096,
    )
    # The constructor only stores settings; actually kick off the evaluation.
    RE.run()
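A short, hedged sketch of how the saved scores might be read back after a run; the file name follows the `__main__` block above, and the key layout follows the comment in `process_lm_eval_results` (the exact metric names depend on the lm_eval task definitions):

# Sketch: load the aggregated RULER scores written by RulerEvaluator.
import json

with open("ruler_results.json", encoding="utf-8") as f:
    scores = json.load(f)

# "avg" is the mean over all float-valued metrics; the remaining keys are the
# per-metric aggregates extracted by process_lm_eval_results.
print(f"overall RULER average: {scores['avg']:.3f}")
for name, value in scores.items():
    if name != "avg":
        print(f"{name}: {value:.3f}")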
