
Commit 97ed0ff

test run
Signed-off-by: Jaideep Rao <[email protected]>
1 parent cea8acd commit 97ed0ff

File tree

2 files changed: 262 additions and 0 deletions


src/instructlab/eval/ruler.py

Lines changed: 226 additions & 0 deletions
@@ -0,0 +1,226 @@
from instructlab.eval.evaluator import Evaluator
from lm_eval.evaluator import simple_evaluate

from typing import Any, Dict, List, Optional
import pathlib
import json
import os
from torch import cuda

RULER_TASKS = [
    "niah_single_1",
    "niah_single_2",
    "niah_single_3",
    "niah_multikey_1",
    "niah_multikey_2",
    "niah_multikey_3",
    "niah_multiquery",
    "niah_multivalue",
    "ruler_vt",
    "ruler_cwe",
    "ruler_fwe",
    "ruler_qa_hotpot",
    "ruler_qa_squad",
]

DEFAULT_VLLM_CONFIG = {
    "dtype": "float16",
    "gpu_memory_utilization": 0.8,
    "disable_custom_all_reduce": True,
    "enforce_eager": False,
    "max_model_len": 131072,
}

# Default configuration parameters
DEFAULT_EVAL_CONFIG = {
    "batch_size": "auto",
    "apply_chat_template": True,
    "fewshot_as_multiturn": True,
    "confirm_run_unsafe_code": True,
    "system_instruction": None,
    "cache_requests": False,
}


class RulerEvaluator(Evaluator):
    """
    Class definition for running RULER benchmarking tasks.
    """

    def __init__(
        self,
        model_path: Optional[str] = None,
        output_file: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        num_gpus: Optional[int] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        vllm_config: Optional[Dict[str, Any]] = None,
        hf_config: Optional[Dict[str, Any]] = None,
        openai_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.tasks = tasks if tasks is not None else RULER_TASKS
        self._results = None
        self.output_file = output_file

        # Store evaluation configurations
        self.eval_config = eval_config or {}
        self.vllm_config = vllm_config or {}
        self.hf_config = hf_config or {}
        self.openai_config = openai_config or {}

        self.api_endpoint = api_endpoint or None
        self.num_gpus = num_gpus

    @property
    def results(self) -> Dict[str, Any] | None:
        """
        Returns the results of the last RULER evaluation, if one has taken place.

        Returns:
            Dict[str, Any] | None: The processed output from `lm_eval.evaluator.simple_evaluate`
        """
        return self._results

    def save_to_file(self, output_file: Optional[str] = None) -> None:
        """Save results to a JSON file."""
        output_file = output_file or self.output_file
        if not output_file:
            raise ValueError("Output file path cannot be empty")

        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self._results, f, indent=2)

    def process_lm_eval_results(
        self,
        fpath: Optional[pathlib.Path] = None,
        raw_results: Optional[dict] = None,
    ) -> None:
        """
        Process the evaluation results from lm_eval for the given file path (or an
        already-loaded results dict) and extract aggregated scores for each metric.

        Args:
            fpath (pathlib.Path): The file path to the evaluation results.
            raw_results (dict): Raw lm_eval results, used when no file path is given.
        """
        unique_metrics_dict: Dict[str, list] = {}

        def extract_metrics(results: dict, unique_metrics_dict: dict) -> dict:
            # Recursively walk the nested results dict and group every non-stderr
            # metric value under its metric name (the part before the first comma).
            for k, v in results.items():
                if isinstance(v, dict):
                    extract_metrics(v, unique_metrics_dict)
                elif "stderr" not in k:
                    metric = k.split(",")[0]
                    unique_metrics_dict.setdefault(metric, []).append(v)
            return unique_metrics_dict

        if fpath:
            with open(fpath, "r") as f:
                raw_results = json.load(f)

        extract_metrics(raw_results["results"], unique_metrics_dict)

        # If a value is a list of floats, average the list
        unique_float_metrics = {}
        for k, v in unique_metrics_dict.items():
            if isinstance(v, list) and all(isinstance(i, float) for i in v):
                unique_float_metrics[k] = sum(v) / len(v)

        # Find the average of all float values in the dict
        float_values = [v for v in unique_float_metrics.values() if isinstance(v, float)]
        if float_values:
            unique_float_metrics["avg"] = sum(float_values) / len(float_values)
        else:
            unique_float_metrics["avg"] = 0.0

        self._results = unique_float_metrics

    def run(
        self,
        model_path: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        num_gpus: Optional[int] = None,
        output_file: Optional[str] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        vllm_config: Optional[Dict[str, Any]] = None,
        hf_config: Optional[Dict[str, Any]] = None,
        openai_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
        max_length: Optional[int] = None,
    ) -> None:
        """
        Run the RULER evaluation using the specified model and tasks.
        """
        model_path = self.model_path if model_path is None else model_path
        num_gpus = self.num_gpus if not num_gpus else num_gpus
        tasks = self.tasks if not tasks else tasks
        output_file = self.output_file if not output_file else output_file

        # Merge configurations with instance configurations, with run-time configs taking precedence
        # final_eval_config = {**self.eval_config, **(eval_config or {})}
        # final_vllm_config = {**self.vllm_config, **(vllm_config or {})}
        # final_hf_config = {**self.hf_config, **(hf_config or {})}
        # final_openai_config = {**self.openai_config, **(openai_config or {})}

        if not num_gpus:
            num_gpus = cuda.device_count()
        if num_gpus <= 0 or num_gpus > cuda.device_count():
            raise ValueError(
                f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
            )

        # Extract system_instruction if provided (without mutating the caller's config)
        system_instruction = (eval_config or {}).get("system_instruction")

        # final_vllm_config = {}
        # final_vllm_config.update(DEFAULT_VLLM_CONFIG)
        # # final_vllm_config.update(self.vllm_config)
        # # final_vllm_config.update(vllm_config)

        # Prepare model_args
        model_args = {
            "pretrained": model_path,
            "base_url": api_endpoint,
            "max_length": max_length,
        }

        lm_eval_results = simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=tasks,
            system_instruction=system_instruction,
        )

        print("Results: ", lm_eval_results)


if __name__ == "__main__":
    fpath = "/Users/jrao/eval_results-short.json"
    RE = RulerEvaluator()

    # with open(fpath, "r") as f:
    #     data = json.load(f)
    # unique_metrics_dict = {}
    # # get_results(data["results"], unique_metrics_dict)
    # # RE.process_lm_eval_results(fpath)
    # RE.process_lm_eval_results(raw_results=data)
    # print(RE.results)
    RE.run(
        model_path="microsoft/phi-4-mini-instruct",
        tasks=["niah_single_1"],
        num_gpus=1,
        output_file="/Users/jrao/eval_results.json",
        eval_config=DEFAULT_EVAL_CONFIG,
        vllm_config=DEFAULT_VLLM_CONFIG,
    )
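
For reviewers, a minimal usage sketch of how this evaluator might be driven end to end. The endpoint URL, paths, and GPU count below are placeholders (not values from this commit), and it assumes an OpenAI-compatible completions server is already serving the model, since run() passes model="local-completions" to lm_eval's simple_evaluate.

# Hypothetical usage sketch only; endpoint, paths, and GPU count are placeholders.
from instructlab.eval.ruler import DEFAULT_EVAL_CONFIG, RulerEvaluator

evaluator = RulerEvaluator(
    model_path="microsoft/phi-4-mini-instruct",            # model the server is hosting
    api_endpoint="http://localhost:8000/v1/completions",   # assumed OpenAI-compatible server
    tasks=["niah_single_1", "niah_single_2"],              # subset of RULER_TASKS
    output_file="/tmp/ruler_results.json",
)

# Kick off the benchmark; requires at least one visible GPU because run()
# validates num_gpus against torch.cuda.device_count().
evaluator.run(
    num_gpus=1,
    eval_config=dict(DEFAULT_EVAL_CONFIG),  # copy so the module defaults stay untouched
    max_length=131072,
)

# Post-process raw lm_eval output (assuming it was saved to disk separately,
# which this commit does not yet do) and persist the aggregated scores.
evaluator.process_lm_eval_results(fpath="/tmp/raw_lm_eval_results.json")
evaluator.save_to_file()

Note that run() currently only prints the raw lm_eval results, so wiring its output into process_lm_eval_results and save_to_file is left for a follow-up.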

src/instructlab/eval/temp_res.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import json


def get_results(results: dict, unique_metrics_dict: dict) -> None:
    # Recursively walk the nested lm_eval results dict and group every non-stderr
    # metric value under its metric name (the part before the first comma).
    for k, v in results.items():
        if isinstance(v, dict):
            get_results(v, unique_metrics_dict)
        elif "stderr" not in k:
            metric = k.split(",")[0]
            unique_metrics_dict.setdefault(metric, []).append(v)


if __name__ == "__main__":
    fpath = "/Users/jrao/eval_results-short.json"
    with open(fpath, "r") as f:
        data = json.load(f)
    unique_metrics_dict = {}
    get_results(data["results"], unique_metrics_dict)

    # If a value is a list of floats, average the list
    unique_float_metrics = {}
    for k, v in unique_metrics_dict.items():
        if isinstance(v, list) and all(isinstance(i, float) for i in v):
            unique_float_metrics[k] = sum(v) / len(v)

    # Find the average of all float values in the dict
    float_values = [v for v in unique_float_metrics.values() if isinstance(v, float)]
    if float_values:
        unique_float_metrics["avg"] = sum(float_values) / len(float_values)
    else:
        unique_float_metrics["avg"] = 0.0

    print(unique_float_metrics)
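
To make the aggregation in this scratch script concrete, here is a small worked example with a fabricated results dict in the nested shape the script expects (task name mapping to "metric,filter" keys plus their stderr counterparts). The keys and numbers are illustrative only, not from a real run, and the snippet assumes get_results above is in scope.

# Illustrative only: a tiny fabricated dict shaped like lm_eval's "results" section.
sample = {
    "results": {
        "niah_single_1": {"recall,none": 0.5, "recall_stderr,none": 0.01},
        "niah_single_2": {"recall,none": 0.75, "recall_stderr,none": 0.02},
    }
}

unique_metrics_dict = {}
get_results(sample["results"], unique_metrics_dict)
print(unique_metrics_dict)  # {'recall': [0.5, 0.75]}

# Averaging the per-metric lists, as the __main__ block above does, would yield:
#   {'recall': 0.625, 'avg': 0.625}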
