Skip to content

Commit 936612c

Browse files
add TUMLU-mini benchmark, solves #577 (#811)
* add TUMLU-mini benchmark, solves #577 * add benchmark info for tumlu-mini * Update community_tasks/turkic_evals.py --------- Co-authored-by: Clémentine Fourrier <[email protected]>
1 parent b0c092a commit 936612c

File tree

1 file changed

+146
-0
lines changed

1 file changed

+146
-0
lines changed

community_tasks/turkic_evals.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# MIT License
2+
3+
# Copyright (c) 2024 The HuggingFace Team
4+
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
12+
# The above copyright notice and this permission notice shall be included in all
13+
# copies or substantial portions of the Software.
14+
15+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
# SOFTWARE.
22+
23+
# ruff: noqa: F405, F403, F401
24+
"""
25+
Task to evaluate LLMs on TUMLU-mini benchmark: https://huggingface.co/datasets/jafarisbarov/TUMLU-mini
26+
27+
For more details, see the associated paper:
28+
29+
@misc{isbarov2025tumluunifiednativelanguage,
30+
title={{TUMLU: A Unified and Native Language Understanding Benchmark for Turkic Languages}},
31+
author={Jafar Isbarov and Arofat Akhundjanova and Mammad Hajili and Kavsar Huseynova and Dmitry Gaynullin and Anar Rzayev and Osman Tursun and Ilshat Saetov and Rinat Kharisov and Saule Belginova and Ariana Kenbayeva and Amina Alisheva and Aizirek Turdubaeva and Abdullatif Köksal and Samir Rustamov and Duygu Ataman},
32+
year={2025},
33+
eprint={2502.11020},
34+
archivePrefix={arXiv},
35+
primaryClass={cs.CL},
36+
url={https://arxiv.org/abs/2502.11020},
37+
}
38+
"""
39+
40+
import random
41+
import re
42+
from functools import partial
43+
from typing import Any, Dict, List, Optional, Union
44+
45+
from lighteval.metrics.llm_as_judge import JudgeLM
46+
from lighteval.metrics.metrics import Metric, MetricCategory, Metrics
47+
from lighteval.metrics.utils.metric_utils import MetricUseCase
48+
from lighteval.tasks.default_prompts import LETTER_INDICES
49+
from lighteval.tasks.lighteval_task import LightevalTaskConfig
50+
from lighteval.tasks.requests import Doc
51+
52+
53+
# TUMLU
54+
# fmt: off
55+
# Language subsets of the TUMLU-mini dataset covered by this module.
# Each entry is used verbatim as the `hf_subset` of a task, as the suffix of
# the task name ("tumlu:<subset>"), and as the lookup key into
# INSTRUCTION_BY_LANGUAGE / ANSWER_BY_LANGUAGE when the prompt is built.
# The list order is the order in which the tasks are created.
TUMLU_SUBSETS = [
    "azerbaijani",
    "crimean-tatar",
    "karakalpak",
    "kazakh",
    "tatar",
    "turkish",
    "uyghur",
    "uzbek",
    "kyrgyz"
]
66+
# fmt: on
67+
68+
# Instruction placed at the very start of every prompt, keyed by subset name.
# Each value ends with a blank line ("\n\n") so the question text that follows
# starts on its own line. These strings are part of the prompt sent to the
# model: editing any of them changes what is evaluated.
INSTRUCTION_BY_LANGUAGE = {
    "azerbaijani": "Aşağıdakı sual çoxvariantlı sualdır. Düzgün cavabı seçin:\n\n",
    "crimean-tatar": "Aşağıdaki sual çoqtan-çoq cevaplı sualdir. Doğru cevapnı seçip alıñız:\n\n",
    # NOTE(review): unlike the other entries there is no sentence-ending period
    # between "soraw" and "Tuwrı" -- looks like a typo; confirm with a speaker
    # before changing, since this text is a live prompt.
    "karakalpak": "Tómendegi soraw kóp tańlawlı soraw Tuwrı juwaptı saylań:\n\n",
    "kazakh": "Төмендегі сұрақ көп таңдау мүмкіндігі бар сұрақ. Дұрыс жауапты таңдаңыз:\n\n",
    "tatar": "Түбәндәге сорау - күп сорау. Дөрес җавапны сайлагыз:\n\n",
    "turkish": "Aşağıdaki soru çoktan seçmeli bir sorudur. Doğru cevabı seçin:\n\n",
    "uyghur": "تۆۋەندىكى سوئال كۆپ تاللاش سوئالى. توغرا جاۋابنى تاللاڭ:\n\n",
    "uzbek": "Quyidagi savol tanlovli savoldir. To‘g‘ri javobni tanlang:\n\n",
    "kyrgyz": "Төмөнкү суроо бир нече варианттуу суроо. Туура жоопту тандаңыз:\n\n",
}
79+
80+
# Answer cue ("Answer:") appended as the last piece of every prompt, keyed by
# subset name. The "-cyrillic" / "-latin" keys are alternate-script variants
# that are not listed in TUMLU_SUBSETS, so no task currently reads them.
# Every cue ends with a colon so that all prompts finish the same way,
# whichever entry is selected.
ANSWER_BY_LANGUAGE = {
    "uzbek": "Javob:",
    "uzbek-cyrillic": "Жавоб:",
    "crimean-tatar": "Cevap:",
    "crimean-tatar-cyrillic": "Джевап:",
    "tatar": "Җавап:",
    "kazakh": "Жауап:",
    "kazakh-latin": "Jawap:",
    "karakalpak": "Juwap:",
    "kyrgyz": "Жооп:",
    "turkish": "Cevap:",
    "uyghur": "جاۋاب:",
    "uyghur-latin": "Jawab:",
    "azerbaijani": "Cavab:",
}
95+
96+
97+
def tumlu_pfn(line, task_name: Optional[str] = None, language: Optional[str] = None):
    """Build a multiple-choice ``Doc`` from one TUMLU-mini dataset row.

    The prompt is laid out as: language-specific instruction, the question,
    one "<letter>. <option>" line per option, then the language-specific
    answer cue.

    Args:
        line: Dataset row; its "question", "choices" and "answer" fields are read.
        task_name: Task name, forwarded unchanged to the returned ``Doc``.
        language: Key into ``INSTRUCTION_BY_LANGUAGE`` and ``ANSWER_BY_LANGUAGE``.

    Returns:
        A ``Doc`` whose ``choices`` are the option letters and whose
        ``gold_index`` is the position of the row's answer letter among them.

    Raises:
        KeyError: If ``language`` has no instruction or answer cue, or the row
            has no "question" field.
        ValueError: If the row's answer is not one of the option letters.
    """
    instruction = INSTRUCTION_BY_LANGUAGE[language]

    choices = line.get("choices")
    # One letter per option (A, B, C, ...). Deriving the letters from the number
    # of options, instead of slicing a fixed five-letter list, keeps rows with
    # more than five options intact rather than silently dropping the extras.
    valid_keys = [chr(ord("A") + position) for position in range(len(choices))]

    answer = line.get("answer")
    if answer not in valid_keys:
        raise ValueError(
            f"Answer {answer!r} is not one of the option letters {valid_keys} (task: {task_name!r})"
        )
    answer_index = valid_keys.index(answer)

    # Assemble the prompt: instruction, question, lettered options, answer cue.
    option_lines = "".join(f"{key}. {choice}\n" for key, choice in zip(valid_keys, choices))
    query = f"{instruction}{line['question']}\n{option_lines}{ANSWER_BY_LANGUAGE[language]}"

    return Doc(
        task_name=task_name,
        query=query,
        choices=valid_keys,  # the model is scored on the letters, not the option texts
        gold_index=answer_index,
        instruction=instruction,
    )
118+
119+
120+
class CustomTUMLUTask(LightevalTaskConfig):
    """Task configuration for one language subset of TUMLU-mini."""

    def __init__(self, name, hf_subset):
        """Fill in the base task config for the given subset.

        Args:
            name: Task name passed through to the base config.
            hf_subset: Dataset subset to load; also bound as the ``language``
                argument of the prompt function.
        """
        # The subset name doubles as the language key of the prompt tables.
        prompt_for_subset = partial(tumlu_pfn, language=hf_subset)

        super().__init__(
            # Identity and data source.
            name=name,
            suite=["community"],
            hf_repo="jafarisbarov/TUMLU-mini",
            hf_subset=hf_subset,
            trust_dataset=False,
            version=0,
            # Splits: evaluate on "test", take few-shot examples from "dev".
            hf_avail_splits=["test", "dev"],
            evaluation_splits=["test"],
            few_shots_split=["dev"],
            few_shots_select="sequential",
            # Prompting and scoring.
            prompt_function=prompt_for_subset,
            metric=[Metrics.loglikelihood_acc_norm],
            generation_size=-1,
            stop_sequence=None,
        )
142+
143+
144+
# One task per language subset, named "tumlu:<subset>".
TUMLU_TASKS = [
    CustomTUMLUTask(name=f"tumlu:{language}", hf_subset=language)
    for language in TUMLU_SUBSETS
]

# Module-level list of the tasks defined by this file.
# NOTE(review): presumably the name the evaluation framework looks up when
# loading a custom tasks module -- confirm against the framework docs.
TASKS_TABLE = TUMLU_TASKS

0 commit comments

Comments
 (0)