11
11
12
12
import numpy as np
13
13
import lm_eval
14
- from lm_eval import evaluator , tasks
14
+ from lm_eval import tasks
15
15
from lm_eval .utils import make_table
16
16
17
17
@@ -73,20 +73,19 @@ def handle_output(args, results, logger):
73
73
74
74
75
75
def load_tasks(args):
    """Resolve which evaluation tasks to run from the parsed CLI arguments.

    Args:
        args: Parsed command-line namespace. Reads
            ``args.open_llm_leaderboard_tasks`` (truthy selects the fixed
            leaderboard suite) and ``args.tasks`` (comma-separated task
            names, or ``None``/empty).

    Returns:
        A ``(task_manager, task_names)`` tuple.

        * When ``args.open_llm_leaderboard_tasks`` is set, ``task_manager``
          is a ``tasks.TaskManager`` whose ``include_path`` is the
          ``open_llm_leaderboard`` directory under the current working
          directory, and ``task_names`` is the fixed leaderboard list.
        * Otherwise ``task_manager`` is ``None`` and ``task_names`` is the
          list parsed from ``args.tasks`` (empty when no tasks were given).
    """
    if args.open_llm_leaderboard_tasks:
        # The leaderboard task configs are looked up relative to the
        # directory the script is launched from, not the script's location.
        config_dir = os.path.join(os.getcwd(), "open_llm_leaderboard")
        task_manager = tasks.TaskManager(include_path=config_dir)
        return task_manager, [
            "arc_challenge_25_shot",
            "hellaswag_10_shot",
            "truthfulqa_mc2",
            "winogrande_5_shot",
            "gsm8k",
            "mmlu",
        ]

    if not args.tasks:
        return None, []

    # Tolerate "a, b" and stray/trailing commas: trim each name and drop
    # empty entries so no blank or padded task name is passed downstream.
    stripped_names = (name.strip() for name in args.tasks.split(","))
    return None, [name for name in stripped_names if name]
90
89
91
90
92
91
def parse_eval_args ():
@@ -190,21 +189,18 @@ def parse_eval_args():
190
189
default = None ,
191
190
help = "Additional path to include if there are external tasks." ,
192
191
)
193
- parser .add_argument (
194
- "--decontamination_ngrams_path" , default = None
195
- ) # Not currently used
196
192
return parser .parse_args ()
197
193
198
194
199
195
def evaluate_model (args ):
200
196
try :
201
- task_list = load_tasks (args )
197
+ task_manager , task_list = load_tasks (args )
202
198
# Customized model such as Quantized model etc.
203
199
# In case you are working with a custom model, you can use the following guide to add it here:
204
200
# https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage
205
201
206
202
# Evaluate
207
- results = evaluator .simple_evaluate (
203
+ results = lm_eval .simple_evaluate (
208
204
model = args .model ,
209
205
model_args = args .model_args ,
210
206
tasks = task_list ,
@@ -214,11 +210,11 @@ def evaluate_model(args):
214
210
device = args .device ,
215
211
use_cache = args .use_cache ,
216
212
limit = args .limit ,
217
- decontamination_ngrams_path = args .decontamination_ngrams_path ,
218
213
check_integrity = args .check_integrity ,
219
214
write_out = args .write_out ,
220
215
log_samples = args .log_samples ,
221
216
gen_kwargs = args .gen_kwargs ,
217
+ task_manager = task_manager ,
222
218
)
223
219
handle_output (args , results , logger )
224
220
0 commit comments