
Commit b198986

test run
Signed-off-by: Jaideep Rao <[email protected]>
1 parent cea8acd commit b198986

File tree

2 files changed: +246 −0


src/instructlab/eval/ruler.py

Lines changed: 210 additions & 0 deletions
@@ -0,0 +1,210 @@
from instructlab.eval.evaluator import Evaluator
from lm_eval.evaluator import simple_evaluate

from typing import Any, Dict, Optional, List
import pathlib
import json
import os
from torch import cuda

RULER_TASKS = [
    "niah_single_1",
    "niah_single_2",
    "niah_single_3",
    "niah_multikey_1",
    "niah_multikey_2",
    "niah_multikey_3",
    "niah_multiquery",
    "niah_multivalue",
    "ruler_vt",
    "ruler_cwe",
    "ruler_fwe",
    "ruler_qa_hotpot",
    "ruler_qa_squad",
]

DEFAULT_VLLM_CONFIG = {
    "dtype": "float16",
    "gpu_memory_utilization": 0.8,
    "disable_custom_all_reduce": True,
    "enforce_eager": False,
    "max_model_len": 131072,
}

# Default configuration parameters
DEFAULT_EVAL_CONFIG = {
    "batch_size": "auto",
    "apply_chat_template": True,
    "fewshot_as_multiturn": True,
    "confirm_run_unsafe_code": True,
    "system_instruction": None,
    "cache_requests": False,
}

class RulerEvaluator(Evaluator):
    """
    Class definition for running RULER benchmarking tasks.
    """

    def __init__(
        self,
        model_path: Optional[str] = None,
        output_file: Optional[str] = None,
        tasks: list[str] = RULER_TASKS,
        num_gpus: Optional[int] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        vllm_config: Optional[Dict[str, Any]] = None,
        hf_config: Optional[Dict[str, Any]] = None,
        openai_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.tasks = tasks
        self._results = None
        self.output_file = output_file

        # Store evaluation configurations
        self.eval_config = eval_config or {}
        self.vllm_config = vllm_config or {}
        self.hf_config = hf_config or {}
        self.openai_config = openai_config or {}

        self.api_endpoint = api_endpoint or None
        self.num_gpus = num_gpus

    @property
    def results(self) -> Dict[str, Any] | None:
        """
        Returns the results of the last RULER evaluation, if one has taken place.

        Returns:
            Dict[str, Any] | None: The processed output from `lm_eval.evaluator.simple_evaluate`
        """
        return self._results

    def save_to_file(self, output_file: Optional[str] = None) -> None:
        """Save results to a JSON file"""
        output_file = output_file or self.output_file
        if not output_file:
            raise ValueError("Output file path cannot be empty")

        # only create parent directories when the path actually contains one
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self._results, f, indent=2)

    def process_lm_eval_results(
        self,
        fpath: Optional[pathlib.Path] = None,
        raw_results: Optional[dict] = None,
    ) -> None:
        """
        Process the lm_eval results from the given file path or in-memory dict and
        extract aggregated scores for each metric across context lengths.

        Args:
            fpath (pathlib.Path): The file path to the evaluation results.
            raw_results (dict): Results as returned by `simple_evaluate`, used when no file path is given.
        """
        unique_metrics_dict: dict = {}

        def extract_metrics(results: dict, unique_metrics_dict: dict) -> dict:
            # recursively collect every non-stderr metric value, keyed by metric name
            for k, v in results.items():
                if isinstance(v, dict):
                    extract_metrics(v, unique_metrics_dict)
                else:
                    if "stderr" not in k:
                        metric = k.split(",")[0]
                        if metric not in unique_metrics_dict:
                            unique_metrics_dict[metric] = []
                        unique_metrics_dict[metric].append(v)

            return unique_metrics_dict

        if fpath:
            with open(fpath, "r") as f:
                raw_results = json.load(f)

        extract_metrics(raw_results["results"], unique_metrics_dict)
        unique_float_metrics = {}
        # if a value is a list of floats, average the list
        for k, v in unique_metrics_dict.items():
            if isinstance(v, list) and all(isinstance(i, float) for i in v):
                unique_float_metrics[k] = sum(v) / len(v)

        # find the average of all float values in the dict
        float_values = [v for v in unique_float_metrics.values() if isinstance(v, float)]
        if float_values:
            unique_float_metrics["avg"] = sum(float_values) / len(float_values)
        else:
            unique_float_metrics["avg"] = 0.0

        self._results = unique_float_metrics

    def run(
        self,
        model_path: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        num_gpus: Optional[int] = None,
        output_file: Optional[str] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
        max_length: Optional[int] = None,
    ) -> None:
        """
        Run the RULER evaluation using the specified model and tasks.
        """

        model_path = self.model_path if model_path is None else model_path
        num_gpus = self.num_gpus if not num_gpus else num_gpus
        tasks = self.tasks if not tasks else tasks
        output_file = self.output_file if not output_file else output_file
        eval_config = self.eval_config if eval_config is None else eval_config

        if not num_gpus:
            num_gpus = cuda.device_count()
        if num_gpus <= 0 or num_gpus > cuda.device_count():
            raise ValueError(
                f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
            )

        # Extract system_instruction if provided
        system_instruction = eval_config.get("system_instruction")

        # Prepare model_args
        model_args = {
            "pretrained": model_path,
            "base_url": api_endpoint,
            "max_length": max_length,
        }

        lm_eval_results = simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=tasks,
            system_instruction=system_instruction,
        )
        # aggregate per-metric scores; the result is stored on self._results
        self.process_lm_eval_results(raw_results=lm_eval_results)

        # write results to file
        if output_file:
            self.save_to_file(output_file)

if __name__ == "__main__":
201+
fpath = "/Users/jrao/eval_results-short.json"
202+
RE = RulerEvaluator()
203+
RE.run(
204+
model_path="microsoft/phi-4-mini-instruct",
205+
tasks=["niah_single_1"],
206+
num_gpus=1,
207+
output_file="eval_results.json",
208+
api_endpoint="http://localhost:8000/v1/completions",
209+
max_length=4096,
210+
)
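For a quick sanity check of the aggregation step in isolation, a minimal sketch like the one below can be run once this module is importable. The nested payload is invented; only the top-level "results" key and the "<metric>,<filter>" key format are assumed to mirror what simple_evaluate returns:

from instructlab.eval.ruler import RulerEvaluator

# made-up stand-in for lm_eval output; the numbers are illustrative only
toy_results = {
    "results": {
        "niah_single_1": {"4096": {"recall,none": 1.0, "recall_stderr,none": 0.01}},
        "niah_single_2": {"4096": {"recall,none": 0.5, "recall_stderr,none": 0.02}},
    }
}

evaluator = RulerEvaluator()
evaluator.process_lm_eval_results(raw_results=toy_results)
# "stderr" keys are skipped, each metric is averaged, and an overall "avg" is appended
print(evaluator.results)  # {'recall': 0.75, 'avg': 0.75}

This exercises only the post-processing path, so no GPU, vLLM server, or lm_eval run is needed.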

src/instructlab/eval/temp_res.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import json


def get_results(results: dict, unique_metrics_dict: dict):
    # recursively collect every non-stderr metric value, keyed by metric name
    for k, v in results.items():
        if isinstance(v, dict):
            get_results(v, unique_metrics_dict)
        else:
            if "stderr" not in k:
                metric = k.split(",")[0]
                if metric not in unique_metrics_dict:
                    unique_metrics_dict[metric] = []
                unique_metrics_dict[metric].append(v)


if __name__ == "__main__":
    fpath = "/Users/jrao/eval_results-short.json"
    with open(fpath, "r") as f:
        data = json.load(f)
    unique_metrics_dict = {}
    get_results(data["results"], unique_metrics_dict)

    unique_float_metrics = {}
    # if a value is a list of floats, average the list
    for k, v in unique_metrics_dict.items():
        if isinstance(v, list) and all(isinstance(i, float) for i in v):
            unique_float_metrics[k] = sum(v) / len(v)

    # find the average of all float values in the dict
    float_values = [v for v in unique_float_metrics.values() if isinstance(v, float)]
    if float_values:
        unique_float_metrics["avg"] = sum(float_values) / len(float_values)
    else:
        unique_float_metrics["avg"] = 0.0

    print(unique_float_metrics)
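To sanity-check get_results without an eval_results file on disk, a throwaway snippet along these lines (run in the same module, with an inline payload and invented numbers) shows the flattened per-metric lists it builds:

# hypothetical nested payload mirroring lm_eval's "<metric>,<filter>" key format
sample = {
    "ruler_cwe": {"cwe,none": 0.25, "cwe_stderr,none": 0.01},
    "ruler_fwe": {"fwe,none": 0.75, "fwe_stderr,none": 0.02},
}
collected = {}
get_results(sample, collected)
print(collected)  # {'cwe': [0.25], 'fwe': [0.75]}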
