import json
import os
import pathlib
from typing import Any, Dict, List, Optional

from torch import cuda

from instructlab.eval.evaluator import Evaluator
from lm_eval.evaluator import simple_evaluate

RULER_TASKS = [
    "niah_single_1",
    "niah_single_2",
    "niah_single_3",
    "niah_multikey_1",
    "niah_multikey_2",
    "niah_multikey_3",
    "niah_multiquery",
    "niah_multivalue",
    "ruler_vt",
    "ruler_cwe",
    "ruler_fwe",
    "ruler_qa_hotpot",
    "ruler_qa_squad",
]
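
# Default vLLM engine arguments (keys follow vLLM's EngineArgs)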
DEFAULT_VLLM_CONFIG = {
    "dtype": "float16",
    "gpu_memory_utilization": 0.8,
    "disable_custom_all_reduce": True,
    "enforce_eager": False,
    "max_model_len": 131072,
}

# Default evaluation parameters (keys mirror lm_eval.evaluator.simple_evaluate keyword arguments)
DEFAULT_EVAL_CONFIG = {
    "batch_size": "auto",
    "apply_chat_template": True,
    "fewshot_as_multiturn": True,
    "confirm_run_unsafe_code": True,
    "system_instruction": None,
    "cache_requests": False,
}

class RulerEvaluator(Evaluator):
    """
    Class definition for running RULER benchmarking tasks.
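
    Example (illustrative sketch; the model name, endpoint URL, and file name
    below are placeholders, not defaults of this class):

        evaluator = RulerEvaluator(output_file="ruler_results.json")
        evaluator.run(
            model_path="my-org/my-long-context-model",
            api_endpoint="http://localhost:8000/v1/completions",
            num_gpus=1,
            max_length=4096,
        )
        print(evaluator.results)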
    """

    def __init__(
        self,
        model_path: Optional[str] = None,
        output_file: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        num_gpus: Optional[int] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        vllm_config: Optional[Dict[str, Any]] = None,
        hf_config: Optional[Dict[str, Any]] = None,
        openai_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
    ) -> None:
        self.model_path = model_path
        self.tasks = tasks or RULER_TASKS
        self._results = None
        self.output_file = output_file

        # Store evaluation configurations
        self.eval_config = eval_config or {}
        self.vllm_config = vllm_config or {}
        self.hf_config = hf_config or {}
        self.openai_config = openai_config or {}

        self.api_endpoint = api_endpoint
        self.num_gpus = num_gpus

    @property
    def results(self) -> Optional[Dict[str, Any]]:
        """
        Returns the results of the last RULER evaluation, if one has taken place.

        Returns:
            Dict[str, Any] | None: The processed output from `lm_eval.evaluator.simple_evaluate`
        """
        return self._results

    def save_to_file(
        self,
        output_file: Optional[str] = None,
    ) -> None:
        """Save results to a JSON file"""
        output_file = output_file or self.output_file
        if not output_file:
            raise ValueError("Output file path cannot be empty")

        # Only create the parent directory when the path actually contains one
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(self._results, f, indent=2)

    def process_lm_eval_results(
        self,
        fpath: Optional[pathlib.Path] = None,
        raw_results: Optional[dict] = None,
    ) -> None:
        """
        Process the evaluation results from lm_eval, either from the given file path
        or from an in-memory results dict, and extract aggregated scores for each metric.

        Args:
            fpath (pathlib.Path): The file path to the evaluation results.
            raw_results (dict): Raw output of `lm_eval.evaluator.simple_evaluate`,
                used when no file path is given.
        """
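        # Illustrative example (hypothetical metric names and values): a raw
        # "results" dict such as
        #     {"niah_single_1": {"recall,none": 0.9, "recall_stderr,none": 0.01},
        #      "niah_single_2": {"recall,none": 0.7, "recall_stderr,none": 0.02}}
        # is reduced to {"recall": 0.8, "avg": 0.8}: stderr entries are dropped,
        # each metric is averaged across tasks, and "avg" is the mean of those averages.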
        unique_metrics_dict: Dict[str, list] = {}

        def extract_metrics(results: dict, metrics_dict: dict) -> dict:
            # Walk the nested results dict and collect every non-stderr metric value
            for k, v in results.items():
                if isinstance(v, dict):
                    extract_metrics(v, metrics_dict)
                elif "stderr" not in k:
                    # lm_eval metric keys look like "<metric>,<filter>"; keep only the metric name
                    metric = k.split(",")[0]
                    metrics_dict.setdefault(metric, []).append(v)

            return metrics_dict

        if fpath:
            with open(fpath, "r", encoding="utf-8") as f:
                raw_results = json.load(f)
        if raw_results is None:
            raise ValueError("Either fpath or raw_results must be provided")

        extract_metrics(raw_results["results"], unique_metrics_dict)
        unique_float_metrics = {}
        # if a metric's values are all floats, average them across tasks
        for k, v in unique_metrics_dict.items():
            if isinstance(v, list) and all(isinstance(i, float) for i in v):
                unique_float_metrics[k] = sum(v) / len(v)

        # overall average of the per-metric averages
        float_values = [v for v in unique_float_metrics.values() if isinstance(v, float)]
        if float_values:
            unique_float_metrics["avg"] = sum(float_values) / len(float_values)
        else:
            unique_float_metrics["avg"] = 0.0

        self._results = unique_float_metrics

    def run(
        self,
        model_path: Optional[str] = None,
        tasks: Optional[List[str]] = None,
        num_gpus: Optional[int] = None,
        output_file: Optional[str] = None,
        eval_config: Optional[Dict[str, Any]] = None,
        api_endpoint: Optional[str] = None,
        max_length: Optional[int] = None,
    ) -> None:
        """
        Run the RULER evaluation using the specified model and tasks.
        """

        # Fall back to the values provided at construction time
        model_path = model_path or self.model_path
        tasks = tasks or self.tasks
        output_file = output_file or self.output_file
        api_endpoint = api_endpoint or self.api_endpoint
        num_gpus = num_gpus or self.num_gpus

        if not num_gpus:
            num_gpus = cuda.device_count()
        if num_gpus <= 0 or num_gpus > cuda.device_count():
            raise ValueError(
                f"invalid value for num_gpus, must be between 1 and {cuda.device_count()}; got: {num_gpus}"
            )

        # Work on a copy of the eval config so the stored one is not mutated,
        # and extract system_instruction if provided
        eval_config = dict(eval_config or self.eval_config or {})
        system_instruction = eval_config.pop("system_instruction", None)

        # Prepare model_args for the lm_eval "local-completions" backend
        model_args = {
            "pretrained": model_path,
            "base_url": api_endpoint,
            "max_length": max_length,
        }
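        # Note: "local-completions" is lm_eval's client for OpenAI-compatible
        # completion servers, so base_url is expected to point at a running
        # /v1/completions endpoint rather than loading the model in-process.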

        lm_eval_results = simple_evaluate(
            model="local-completions",
            model_args=model_args,
            tasks=tasks,
            system_instruction=system_instruction,
        )
        # Aggregate the per-task scores and cache them on self._results
        self.process_lm_eval_results(raw_results=lm_eval_results)

        # write results to file (save_to_file also creates any parent directory)
        if output_file:
            self.save_to_file(output_file)

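
# Example invocation; assumes an OpenAI-compatible completions server (e.g. vLLM)
# is already serving the model at http://localhost:8000.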
if __name__ == "__main__":
    RE = RulerEvaluator()
    RE.run(
        model_path="microsoft/phi-4-mini-instruct",
        tasks=["niah_single_1"],
        num_gpus=1,
        output_file="eval_results.json",
        api_endpoint="http://localhost:8000/v1/completions",
        max_length=4096,
    )