Skip to content

Commit f3a0b55

Browse files
authored
warning for "chat" pretrained; disable buggy evalita configs (#3127)
* check for chat for warning
* add test
* remove yaml extension from some evalita configs
* move unitxt to own test script
* fix CI test
1 parent ab3acc7 commit f3a0b55

File tree

7 files changed

+61
-47
lines changed

7 files changed

+61
-47
lines changed

lm_eval/evaluator.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,15 +154,23 @@ def simple_evaluate(
154154
"Either 'limit' or 'samples' must be None, but both are not None."
155155
)
156156

157+
_NEEDS_CHAT_TEMPLATE = ("inst", "chat")
157158
if (
158-
(isinstance(model_args, str) and "inst" in model_args.lower())
159+
(
160+
isinstance(model_args, str)
161+
and any(kw in model_args.lower() for kw in _NEEDS_CHAT_TEMPLATE)
162+
)
159163
or (
160164
isinstance(model_args, dict)
161-
and any("inst" in str(v).lower() for v in model_args.values())
165+
and any(
166+
any(kw in str(v).lower() for kw in _NEEDS_CHAT_TEMPLATE)
167+
for v in model_args.values()
168+
)
162169
)
163170
) and not apply_chat_template:
164171
eval_logger.warning(
165-
"Model appears to be an instruct variant but chat template is not applied. Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
172+
"Model appears to be an instruct or chat variant but chat template is not applied. "
173+
"Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`)."
166174
)
167175

168176
if delete_requests_cache:

lm_eval/filters/extraction.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ def __init__(
141141
"""
142142
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
143143
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
144-
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
144+
- step 2 : We parse the choice with regex: r'\s*([A-?])', where ? varies by number of choices.
145145
group_select: Selects the (group_select)th match from the findall result.
146146
ignore_case: Ignores the case during step 1 matching
147147
ignore_punctuation: Remove the punctuation during step 1 matching

tests/test_tasks.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ def limit() -> int:
4646
return 10
4747

4848

49-
# Tests
5049
class BaseTasks:
5150
"""
5251
Base class for testing tasks
@@ -166,45 +165,3 @@ class TestNewTasksElseDefault(BaseTasks):
166165
Test class parameterized with a list of new/modified tasks
167166
(or a set of default tasks if none have been modified)
168167
"""
169-
170-
171-
@pytest.mark.parametrize(
172-
"task_class",
173-
task_class(
174-
["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
175-
),
176-
ids=lambda x: f"{x.config.task}",
177-
)
178-
class TestUnitxtTasks(BaseTasks):
179-
"""
180-
Test class for Unitxt tasks parameterized with a small custom
181-
task as described here:
182-
https://www.unitxt.ai/en/latest/docs/lm_eval.html
183-
"""
184-
185-
def test_check_training_docs(self, task_class: ConfigurableTask):
186-
if task_class.has_training_docs():
187-
assert task_class.dataset["train"] is not None
188-
189-
def test_check_validation_docs(self, task_class):
190-
if task_class.has_validation_docs():
191-
assert task_class.dataset["validation"] is not None
192-
193-
def test_check_test_docs(self, task_class):
194-
task = task_class
195-
if task.has_test_docs():
196-
assert task.dataset["test"] is not None
197-
198-
def test_doc_to_text(self, task_class, limit: int):
199-
task = task_class
200-
arr = (
201-
list(islice(task.test_docs(), limit))
202-
if task.has_test_docs()
203-
else list(islice(task.validation_docs(), limit))
204-
)
205-
_array = [task.doc_to_text(doc) for doc in arr]
206-
if not task.multiple_input:
207-
for x in _array:
208-
assert isinstance(x, str)
209-
else:
210-
pass

tests/test_unitxt_tasks.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from itertools import islice
2+
3+
import pytest
4+
5+
from lm_eval import tasks as tasks
6+
from lm_eval.api.task import ConfigurableTask
7+
from tests.test_tasks import BaseTasks, task_class
8+
9+
10+
@pytest.mark.parametrize(
11+
"task_class",
12+
task_class(
13+
["arc_easy_unitxt"], tasks.TaskManager(include_path="./tests/testconfigs")
14+
),
15+
ids=lambda x: f"{x.config.task}",
16+
)
17+
class TestUnitxtTasks(BaseTasks):
18+
"""
19+
Test class for Unitxt tasks parameterized with a small custom
20+
task as described here:
21+
https://www.unitxt.ai/en/latest/docs/lm_eval.html
22+
"""
23+
24+
def test_check_training_docs(self, task_class: ConfigurableTask):
25+
if task_class.has_training_docs():
26+
assert task_class.dataset["train"] is not None
27+
28+
def test_check_validation_docs(self, task_class):
29+
if task_class.has_validation_docs():
30+
assert task_class.dataset["validation"] is not None
31+
32+
def test_check_test_docs(self, task_class):
33+
task = task_class
34+
if task.has_test_docs():
35+
assert task.dataset["test"] is not None
36+
37+
def test_doc_to_text(self, task_class, limit: int):
38+
task = task_class
39+
arr = (
40+
list(islice(task.test_docs(), limit))
41+
if task.has_test_docs()
42+
else list(islice(task.validation_docs(), limit))
43+
)
44+
_array = [task.doc_to_text(doc) for doc in arr]
45+
if not task.multiple_input:
46+
for x in _array:
47+
assert isinstance(x, str)
48+
else:
49+
pass

0 commit comments

Comments
 (0)