
Commit ffa43cc

Commit message: fix
Parent: 75af52d

File tree: 2 files changed, +57 -52 lines


src/lighteval/tasks/extended/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -20,9 +20,12 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import lighteval.tasks.extended.misc.instruct as instruct
 from lighteval.utils.imports import can_load_extended_tasks
 
 
+AVAILABLE_EXTENDED_TASKS_MODULES = [instruct]
+
 if can_load_extended_tasks():
     import lighteval.tasks.extended.hle.main as hle
     import lighteval.tasks.extended.ifeval.main as ifeval
@@ -32,7 +35,4 @@
     import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
 
-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]
-
-else:
-    AVAILABLE_EXTENDED_TASKS_MODULES = []
+    AVAILABLE_EXTENDED_TASKS_MODULES.extend([ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb])
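
Net effect of the two hunks above, reconstructed for readability (a sketch, not part of the diff): the instruct module is now registered unconditionally, and the dependency-gated modules are appended to the same list instead of being assigned in an if/else. The import lines for lcb, mix_eval and mt_bench are not visible in the diff and are assumed to follow the same pattern as the ones shown.

# Post-commit shape of src/lighteval/tasks/extended/__init__.py (reconstruction)
import lighteval.tasks.extended.misc.instruct as instruct

from lighteval.utils.imports import can_load_extended_tasks

# Always available, even without the optional extended-task dependencies.
AVAILABLE_EXTENDED_TASKS_MODULES = [instruct]

if can_load_extended_tasks():
    # These imports only succeed when the extra dependencies are installed.
    import lighteval.tasks.extended.hle.main as hle
    import lighteval.tasks.extended.ifeval.main as ifeval
    import lighteval.tasks.extended.lcb.main as lcb            # assumed path (not shown in the diff)
    import lighteval.tasks.extended.mix_eval.main as mix_eval  # assumed path (not shown in the diff)
    import lighteval.tasks.extended.mt_bench.main as mt_bench  # assumed path (not shown in the diff)
    import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
    import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

    AVAILABLE_EXTENDED_TASKS_MODULES.extend(
        [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench, hle, lcb]
    )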

src/lighteval/tasks/extended/instruct.py renamed to src/lighteval/tasks/extended/misc/instruct.py

Lines changed: 53 additions & 48 deletions
@@ -125,6 +125,7 @@ def belebele_prompt(line, task_name: str = None):
         "spa",
     ]
 ]
+TASKS_TABLE.extend(BELEBELE_TASKS)
 
 
 MMLU_SUBSETS = [
@@ -225,8 +226,8 @@ def prompt(self, line, task_name: str = None):
 GLOBAL_MMLU_TASKS = [
     LightevalTaskConfig(
         name=f"global_mmlu_instruct_{sensitivity_label.lower()}_{language.value}:{subset}",
-        prompt_function=GlobalMMLUPrompt(language).prompt,
-        suite=("extended"),
+        prompt_function=GlobalMMLUPrompt(language.value).prompt,
+        suite=["extended"],
         hf_repo="CohereForAI/Global-MMLU",
         hf_subset=standardize_tag(language.value),
         evaluation_splits=("test",),
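
A note on the suite fix in the hunk above, since it is a common Python gotcha: ("extended") is not a one-element tuple, it is just the string "extended", so the old value handed a bare string where a collection of suite names was presumably expected. A quick plain-Python illustration, independent of lighteval:

suite_old = ("extended")       # parentheses alone do nothing: this is the str "extended"
suite_fixed = ["extended"]     # what the commit uses: a one-element list
suite_tuple = ("extended",)    # a real one-element tuple needs the trailing comma

print(type(suite_old).__name__)    # str
print(type(suite_fixed).__name__)  # list
print(type(suite_tuple).__name__)  # tuple
print(list(suite_old))             # ['e', 'x', 't', 'e', 'n', 'd', 'e', 'd']: iterating a str yields characters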
@@ -240,23 +241,25 @@ def prompt(self, line, task_name: str = None):
             subset,
             sensitivity_label,
         ),
-        metric=SampleLevelMetric(
-            metric_name="pass@1:1_samples",
-            sample_level_fn=PassAtK(
-                k=1,
-                n=1,
-                sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
-                    language=language,
-                    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                    precision=6,
-                ).sample_level_fn([ref], [pred], doc),
-            ).compute,
-            category=MetricCategory.GENERATIVE_SAMPLING,
-            use_case=MetricUseCase.REASONING,
-            corpus_level_fn=np.mean,
-            higher_is_better=True,
-        ),
+        metric=[
+            SampleLevelMetric(
+                metric_name="pass@1:1_samples",
+                sample_level_fn=PassAtK(
+                    k=1,
+                    n=1,
+                    sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                        language=language,
+                        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                        precision=6,
+                    ).sample_level_fn([ref], [pred], doc),
+                ).compute,
+                category=MetricCategory.GENERATIVE_SAMPLING,
+                use_case=MetricUseCase.REASONING,
+                corpus_level_fn=np.mean,
+                higher_is_better=True,
+            )
+        ],
         generation_size=32768,  # needed for reasoning models like R1
         stop_sequence=[],  # no stop sequence, will use eos token
     )
@@ -266,45 +269,47 @@ def prompt(self, line, task_name: str = None):
         Language.ENGLISH,
         Language.SPANISH,
         Language.FRENCH,
-        Language.HEBREW,
+        # Language.HEBREW,
        Language.HINDI,
         Language.INDONESIAN,
         Language.ITALIAN,
         Language.JAPANESE,
-        Language.KOREAN,
-        Language.MALAY,
+        # Language.KOREAN,
+        # Language.MALAY,
         Language.DUTCH,
         Language.NORWEGIAN,
         Language.POLISH,
         Language.PORTUGUESE,
-        Language.ROMANIAN,
+        # Language.ROMANIAN,
         Language.RUSSIAN,
         Language.SERBIAN,
         Language.SWEDISH,
         Language.SWAHILI,
-        Language.TAMIL,
+        # Language.TAMIL,
         Language.TELUGU,
         Language.THAI,
         Language.TURKISH,
         Language.UKRAINIAN,
         Language.URDU,
         Language.VIETNAMESE,
-        Language.YORUBA,
-        Language.ZULU,
+        # Language.YORUBA,
+        # Language.ZULU, missing literals
     ]
     for sensitivity_label in ["ALL", "CA", "CS", "UNK"]
 ]
+TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)
 
 
 def mmlu_pro(line, task_name: str = None):
-    instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {' ,'.join(LETTER_INDICES[: len(line['choices'] - 1)])}, or {LETTER_INDICES[len(line['choices'])]}. Think step by step before answering.\n\n"
+    num_choices = len(line["options"])
+    instruction = f"Given the following question about {line['category']} and answer choices, output the letter corresponding to the correct answer. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of {' ,'.join(LETTER_INDICES[: num_choices - 1])}, or {LETTER_INDICES[num_choices]}. Think step by step before answering.\n\n"
     query = f"{instruction}###\nQuery:\n{line['question']}\n###\nChoices:\n"
-    query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["choices"])])
+    query += "".join([f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["options"])])
 
     return Doc(
         task_name=task_name,
         query=query,
-        choices=LETTER_INDICES[: len(line["choices"])],
+        choices=LETTER_INDICES[:num_choices],
         gold_index=line["answer_index"],
         instruction=instruction,
     )
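
To make the field rename in the hunk above concrete, here is a toy walk-through of the fixed lookups. The row below is a made-up example (the dataset used by this task presumably exposes an "options" column rather than "choices", which is what the fix reads), and LETTER_INDICES is assumed to be the usual A-to-J letter list. The old f-string also computed len(line['choices'] - 1), which subtracts 1 from a list and cannot run; num_choices removes that.

LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]  # assumed definition

line = {  # hypothetical example row, for illustration only
    "category": "physics",
    "question": "What is the SI unit of force?",
    "options": ["newton", "joule", "pascal", "watt"],
    "answer_index": 0,
}

num_choices = len(line["options"])  # 4
choices_block = "".join(f"\n{key}) {choice}" for key, choice in zip(LETTER_INDICES, line["options"]))

print(choices_block)                 # "\nA) newton\nB) joule\nC) pascal\nD) watt"
print(LETTER_INDICES[:num_choices])  # ['A', 'B', 'C', 'D'], the Doc's valid gold letters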
@@ -322,27 +327,27 @@ def mmlu_pro(line, task_name: str = None):
     few_shots_select=None,
     generation_size=32768,  # needed for reasoning models like R1
     stop_sequence=[],  # no stop sequence, will use eos token
-    metric=SampleLevelMetric(
-        metric_name="pass@1:1_samples",
-        sample_level_fn=PassAtK(
-            k=1,
-            n=1,
-            sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
-                language=Language.ENGLISH,
-                gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
-                precision=6,
-            ).sample_level_fn([ref], [pred], doc),
-        ).compute,
-        category=MetricCategory.GENERATIVE_SAMPLING,
-        use_case=MetricUseCase.REASONING,
-        corpus_level_fn=np.mean,
-        higher_is_better=True,
-    ),
+    metric=[
+        SampleLevelMetric(
+            metric_name="pass@1:1_samples",
+            sample_level_fn=PassAtK(
+                k=1,
+                n=1,
+                sample_scoring_function=lambda pred, ref, doc: multilingual_extractive_match_metric(
+                    language=Language.ENGLISH,
+                    gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                    pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+                    precision=6,
+                ).sample_level_fn([ref], [pred], doc),
+            ).compute,
+            category=MetricCategory.GENERATIVE_SAMPLING,
+            use_case=MetricUseCase.REASONING,
+            corpus_level_fn=np.mean,
+            higher_is_better=True,
+        )
+    ],
     trust_dataset=True,
     version=0,
 )
 
-TASKS_TABLE.extend(BELEBELE_TASKS)
-TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)
 TASKS_TABLE.append(mmlu_pro)
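
Besides wrapping each metric in a list, the commit moves the TASKS_TABLE.extend(...) calls for BELEBELE_TASKS and GLOBAL_MMLU_TASKS up next to the lists they register, so only the mmlu_pro entry is still appended at the bottom of the module. A toy sketch of the resulting registration flow, with placeholder strings standing in for the real LightevalTaskConfig objects:

TASKS_TABLE = []  # built up at module import time, as in the file above

BELEBELE_TASKS = ["belebele_acm_instruct", "belebele_spa_instruct"]  # placeholders
TASKS_TABLE.extend(BELEBELE_TASKS)  # now registered right after the list is built

GLOBAL_MMLU_TASKS = [f"global_mmlu_instruct_{label.lower()}" for label in ("ALL", "CA", "CS", "UNK")]  # placeholders
TASKS_TABLE.extend(GLOBAL_MMLU_TASKS)  # likewise moved next to its definition

mmlu_pro = "mmlu_pro_placeholder"  # stands in for the remaining entry
TASKS_TABLE.append(mmlu_pro)  # unchanged: still appended at the end of the module

print(len(TASKS_TABLE))  # 7 placeholder entries in this toy version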
