2727
2828import numpy as np
2929import requests
30- from aenum import extend_enum
3130from scipy .optimize import minimize
3231
3332import lighteval .tasks .default_prompts as prompt
34- from lighteval .metrics .metrics import CorpusLevelMetricGrouping , Metrics
33+ from lighteval .metrics .metrics import CorpusLevelMetricGrouping
3534from lighteval .metrics .metrics_corpus import CorpusLevelComputation
3635from lighteval .metrics .metrics_sample import ExactMatches , LoglikelihoodAcc , SampleLevelComputation
3736from lighteval .metrics .normalizations import gsm8k_normalizer
37+ from lighteval .models .model_output import ModelResponse
3838from lighteval .tasks .lighteval_task import LightevalTaskConfig
39- from lighteval .tasks .requests import SamplingMethod
39+ from lighteval .tasks .requests import Doc , SamplingMethod
4040
4141
4242# Utility functions
@@ -96,18 +96,18 @@ def download(self):
9696 with open (path_dld , "wb" ) as file :
9797 file .write (response .content )
9898
99- def compute (self , ** args ) :
99+ def compute (self , doc : Doc , model_response : ModelResponse , ** kwargs ) -> float :
100100 if self .task == "gsm8k" :
101101 res = ExactMatches (
102102 strip_strings = True , normalize_pred = gsm8k_normalizer , normalize_gold = gsm8k_normalizer
103- ).compute (** args )
103+ ).compute (doc , model_response , ** kwargs )
104104 return dict .fromkeys (self .METRICS , res )
105105 else :
106- res = LoglikelihoodAcc ().compute (** args )
106+ res = LoglikelihoodAcc ().compute (doc , model_response , ** kwargs )
107107 return dict .fromkeys (self .METRICS , res )
108108
109- def compute_corpus (self , y_input ):
110- if len (y_input ) == self .num_samples and self .estimates is not None :
109+ def compute_corpus (self , items ):
110+ if len (items ) == self .num_samples and self .estimates is not None :
111111 return self .estimates [self .task ]
112112
113113 # We load the weights for the relevant examples
@@ -144,7 +144,7 @@ def compute_corpus(self, y_input):
144144 # Creating vector y and estimating theta
145145 y = np .zeros (N )
146146 for i , j in enumerate (seen_examples ):
147- y [j ] = y_input [i ]
147+ y [j ] = items [i ]
148148
149149 # Getting estimates
150150 theta = fit_theta (y , seen_examples , A , B )
@@ -170,7 +170,7 @@ def compute_corpus(self, y_input):
170170 estimates [scenario ]["pirt" ] = IRTp
171171 estimates [scenario ]["gpirt" ] = IRTpp
172172
173- self .num_samples = len (y_input )
173+ self .num_samples = len (items )
174174 self .estimates = estimates
175175
176176 return estimates [self .task ]
@@ -233,6 +233,25 @@ def compute_corpus(self, y_input):
233233 # },
234234]
235235
# Metric groupings for the tiny benchmarks, keyed by "tinybench_metric_<task name>".
metrics = {}

for task_param in task_params:
    name = task_param["name"]
    # gsm8k is the only generative task; all others use the log-probability sampling method.
    category = SamplingMethod.GENERATIVE if name == "gsm8k" else SamplingMethod.LOGPROBS

    # Each entry is a one-element tuple holding the grouping for that task.
    metrics[f"tinybench_metric_{name}"] = (
        CorpusLevelMetricGrouping(
            metric_name=TinyCorpusAggregator.METRICS,
            higher_is_better=dict.fromkeys(TinyCorpusAggregator.METRICS, True),
            sample_level_fn=TinyCorpusAggregator(name),
            category=category,
            corpus_level_fn=TinyCorpusAggregator(name),
        ),
    )
254+
236255TASKS_TABLE = []
237256for task in task_params :
238257 name = task ["name" ]
@@ -251,28 +270,8 @@ def compute_corpus(self, y_input):
251270 evaluation_splits = task ["evaluation_split" ],
252271 few_shots_split = None ,
253272 few_shots_select = "random_sampling" ,
254- metrics = [f"tinybench_metric_{ name } " ],
273+ metrics = metrics [f"tinybench_metric_{ name } " ],
255274 generation_size = generation_size ,
256275 stop_sequence = stop_sequence ,
257276 )
258277 TASKS_TABLE .append (task )
259-
260- # CUSTOM METRIC
261- for task_param in task_params :
262- name = task_param ["name" ]
263- if name == "gsm8k" :
264- category = SamplingMethod .GENERATIVE
265- else :
266- category = SamplingMethod .LOGPROBS
267-
268- extend_enum (
269- Metrics ,
270- f"tinybench_metric_{ name } " ,
271- CorpusLevelMetricGrouping (
272- metric_name = TinyCorpusAggregator .METRICS ,
273- higher_is_better = dict .fromkeys (TinyCorpusAggregator .METRICS , True ),
274- sample_level_fn = TinyCorpusAggregator (name ),
275- category = category ,
276- corpus_level_fn = TinyCorpusAggregator (name ),
277- ),
278- )
0 commit comments