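"""RULER long-context benchmark evaluator.

Wraps ``lm_eval.evaluator.simple_evaluate`` to run the RULER synthetic
long-context tasks against a model served behind an OpenAI-compatible
endpoint and to aggregate the resulting per-task scores.
"""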

import json
import os
import pathlib
from typing import Any, Dict, List, Optional

from instructlab.eval.evaluator import Evaluator
from lm_eval.evaluator import simple_evaluate
from torch import cuda
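
# RULER task names passed to lm-eval-harness: needle-in-a-haystack retrieval
# variants (niah_*), variable tracking (ruler_vt), common/frequent word
# extraction (ruler_cwe / ruler_fwe), and long-context QA over HotpotQA and SQuAD.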
RULER_TASKS = [
    "niah_single_1",
    "niah_single_2",
    "niah_single_3",
    "niah_multikey_1",
    "niah_multikey_2",
    "niah_multikey_3",
    "niah_multiquery",
    "niah_multivalue",
    "ruler_vt",
    "ruler_cwe",
    "ruler_fwe",
    "ruler_qa_hotpot",
    "ruler_qa_squad",
]
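
# vLLM engine arguments (standard vLLM flags such as dtype, gpu_memory_utilization,
# and max_model_len) intended for serving the model locally.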
DEFAULT_VLLM_CONFIG = {
    "dtype": "float16",
    "gpu_memory_utilization": 0.8,
    "disable_custom_all_reduce": True,
    "enforce_eager": False,
    "max_model_len": 131072,
}

# Default evaluation settings (keyword arguments understood by lm_eval's simple_evaluate)
DEFAULT_EVAL_CONFIG = {
    "batch_size": "auto",
    "apply_chat_template": True,
    "fewshot_as_multiturn": True,
    "confirm_run_unsafe_code": True,
    "system_instruction": None,
    "cache_requests": False,
}


class RulerEvaluator(Evaluator):
    """
    Class definition for running RULER benchmarking tasks.
    """

    def __init__(
        self,
        model_path: Optional[str] = None,
        output_file: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        num_gpus: Optional[int] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        vllm_config: Optional[Dict[str, Any]] = None,
        hf_config: Optional[Dict[str, Any]] = None,
        openai_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.tasks = tasks or RULER_TASKS
        self._results = None
        self.output_file = output_file

        # Store evaluation configurations
        self.eval_config = eval_config or {}
        self.vllm_config = vllm_config or {}
        self.hf_config = hf_config or {}
        self.openai_config = openai_config or {}

        self.api_endpoint = api_endpoint
        self.num_gpus = num_gpus

    @property
    def results(self) -> Dict[str, Any] | None:
        """
        Returns the results of the last RULER evaluation, if one has taken place.

        Returns:
            Dict[str, Any] | None: The processed output from `lm_eval.evaluator.simple_evaluate`
        """
        return self._results

    def save_to_file(
        self,
        output_file: Optional[str] = None,
    ) -> None:
        """Save results to a JSON file."""
        output_file = output_file or self.output_file
        if not output_file:
            raise ValueError("Output file path cannot be empty")

        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self._results, f, indent=2)

    def process_lm_eval_results(
        self,
        fpath: Optional[pathlib.Path] = None,
        raw_results: Optional[dict] = None,
    ) -> None:
        """
        Process lm_eval results (from a results file or an in-memory dict) and
        extract aggregated scores for each metric, plus an overall average.

        Args:
            fpath (pathlib.Path): Path to a JSON file of lm_eval results.
            raw_results (dict): Raw results dict, used when no file path is given.
        """
        unique_metrics_dict: Dict[str, list] = {}

        def extract_metrics(results: dict, unique_metrics_dict: dict) -> dict:
            # Recursively walk the nested results and group metric values by name,
            # skipping stderr entries (keys look like "<metric>,<filter>").
            for k, v in results.items():
                if isinstance(v, dict):
                    extract_metrics(v, unique_metrics_dict)
                elif "stderr" not in k:
                    metric = k.split(",")[0]
                    unique_metrics_dict.setdefault(metric, []).append(v)
            return unique_metrics_dict

        if fpath:
            with open(fpath, "r", encoding="utf-8") as f:
                raw_results = json.load(f)
        if raw_results is None:
            raise ValueError("Either fpath or raw_results must be provided")

        extract_metrics(raw_results["results"], unique_metrics_dict)

        # If a metric's value is a list of floats, average the list
        unique_float_metrics = {}
        for k, v in unique_metrics_dict.items():
            if isinstance(v, list) and all(isinstance(i, float) for i in v):
                unique_float_metrics[k] = sum(v) / len(v)

        # Average across all per-metric averages
        float_values = [v for v in unique_float_metrics.values() if isinstance(v, float)]
        unique_float_metrics["avg"] = (
            sum(float_values) / len(float_values) if float_values else 0.0
        )

        self._results = unique_float_metrics
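
    # Illustrative shape of the processed results (hypothetical metric name and
    # numbers): lm_eval reports per-task entries keyed like "recall,none" and
    # "recall_stderr,none"; extract_metrics skips the stderr keys and groups values
    # by metric name, e.g. {"recall": [0.91, 0.87]}, which is then reduced to
    # {"recall": 0.89, "avg": 0.89}.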

    def run(
        self,
        model_path: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        num_gpus: Optional[int] = None,
        output_file: Optional[str] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        vllm_config: Optional[Dict[str, Any]] = None,
        hf_config: Optional[Dict[str, Any]] = None,
        openai_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
        max_length: Optional[int] = None,
    ) -> None:
        """
        Run the RULER evaluation using the specified model and tasks.
        """
        model_path = self.model_path if model_path is None else model_path
        num_gpus = self.num_gpus if not num_gpus else num_gpus
        tasks = self.tasks if not tasks else tasks
        output_file = self.output_file if not output_file else output_file
        api_endpoint = self.api_endpoint if not api_endpoint else api_endpoint

        # Merge run-time configuration over the instance configuration,
        # with run-time values taking precedence.
        final_eval_config = {**self.eval_config, **(eval_config or {})}

        if not num_gpus:
            num_gpus = cuda.device_count()
        if num_gpus <= 0 or num_gpus > cuda.device_count():
            raise ValueError(
                f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
            )

        # Extract system_instruction if provided
        system_instruction = final_eval_config.pop("system_instruction", None)

        # Prepare model_args for the OpenAI-compatible "local-completions" backend
        model_args = {
            "pretrained": model_path,
            "base_url": api_endpoint,
            "max_length": max_length,
        }

        lm_eval_results = simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=tasks,
            system_instruction=system_instruction,
        )

        print("Results: ", lm_eval_results)

        # Aggregate per-metric scores and persist them if an output file was given
        self.process_lm_eval_results(raw_results=lm_eval_results)
        if output_file:
            self.save_to_file(output_file)


if __name__ == "__main__":
    RE = RulerEvaluator()

    # Post-process an existing lm_eval results file instead of running a new eval:
    #   RE.process_lm_eval_results(fpath="/Users/jrao/eval_results-short.json")
    #   print(RE.results)

    # Run a single RULER task end to end. The "local-completions" backend expects
    # an OpenAI-compatible endpoint; pass api_endpoint=... to point at a served model.
    RE.run(
        model_path="microsoft/phi-4-mini-instruct",
        tasks=["niah_single_1"],
        num_gpus=1,
        output_file="/Users/jrao/eval_results.json",
        eval_config=DEFAULT_EVAL_CONFIG,
        vllm_config=DEFAULT_VLLM_CONFIG,
    )