Skip to content

Commit ab664b8

Browse files
committed
feat: allow MMLU to pass system_prompt to lm_eval
Signed-off-by: Oleg S <[email protected]>
1 parent 4cf3e14 commit ab664b8

File tree

4 files changed

+31
-8
lines changed

4 files changed

+31
-8
lines changed

.pylintrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,8 @@ disable=raw-checker-failed,
448448
pointless-statement,
449449
wrong-import-order,
450450
line-too-long,
451-
dangerous-default-value
451+
dangerous-default-value,
452+
too-many-instance-attributes
452453

453454
# Enable the message, report, category or checker with the given id(s). You can
454455
# either give multiple identifier separated by comma (,) or put this option

.spellcheck-en-custom.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,5 @@ TODO
2626
tox
2727
venv
2828
vllm
29+
barebones
30+
LM

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.4.2
2+
3+
* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise, it passes the model a barebones prompt.
4+
15
## 0.4
26

37
* Added ability to specify a custom http client to MT-Bench

src/instructlab/eval/mmlu.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ class AbstractMMLUEvaluator(Evaluator):
102102
few_shots number of examples
103103
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
104104
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
105+
system_prompt system prompt to be used when applying the chat template
105106
"""
106107

107108
def __init__(
@@ -113,8 +114,10 @@ def __init__(
113114
few_shots: int = 5,
114115
batch_size: Optional[Union[int, str]] = "auto",
115116
device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
117+
system_prompt: Optional[str] = None,
116118
) -> None:
117119
self.model_path = model_path
120+
self.system_prompt = system_prompt
118121
self.tasks_dir = tasks_dir
119122
self.tasks = tasks
120123
self.model_dtype = model_dtype
@@ -168,6 +171,7 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
168171
if not os.access(self.tasks_dir, os.R_OK):
169172
raise InvalidTasksDirError(self.tasks_dir)
170173
tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
174+
should_apply_chat_template = self.system_prompt is not None
171175
mmlu_output = self._simple_evaluate_with_error_handling(
172176
model=model,
173177
model_args=model_args,
@@ -176,6 +180,8 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
176180
batch_size=self.batch_size,
177181
device=self.device,
178182
task_manager=tm,
183+
system_instruction=self.system_prompt,
184+
apply_chat_template=should_apply_chat_template,
179185
)
180186
results = mmlu_output["results"]
181187
return results
@@ -213,12 +219,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator):
213219
Evaluator for Massive Multitask Language Understanding (MMLU)
214220
215221
Attributes:
216-
model_path absolute path to or name of a huggingface model
217-
tasks list of tasks for MMLU to test the model with
218-
model_dtype dtype of model when served
219-
few_shots number of examples
220-
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
221-
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
222+
model_path absolute path to or name of a huggingface model
223+
tasks list of tasks for MMLU to test the model with
224+
model_dtype dtype of model when served
225+
few_shots number of examples
226+
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
227+
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
228+
system_prompt system prompt to be used when applying the chat template
222229
"""
223230

224231
name = "mmlu"
@@ -231,9 +238,17 @@ def __init__(
231238
few_shots: int = 5,
232239
batch_size: Optional[Union[int, str]] = "auto",
233240
device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
241+
system_prompt: Optional[str] = None,
234242
) -> None:
235243
super().__init__(
236-
model_path, None, tasks, model_dtype, few_shots, batch_size, device
244+
model_path,
245+
None,
246+
tasks,
247+
model_dtype,
248+
few_shots,
249+
batch_size,
250+
device,
251+
system_prompt=system_prompt,
237252
)
238253

239254

@@ -243,6 +258,7 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
243258
244259
Attributes:
245260
model_path absolute path to or name of a huggingface model
261+
system_prompt system prompt to be used when applying the chat template
246262
tasks_dir path where the <TASK_NAME>.jsonl and <TASK_NAME>_task.yaml files for the branches being evaluated are stored
247263
tasks group name that is shared by all the MMLUBranch tasks
248264
model_dtype dtype of model when served

0 commit comments

Comments
 (0)