Skip to content

Commit ab664b8

Browse files
committed
feat: allow MMLU to pass system_prompt to lm_eval
Signed-off-by: Oleg S <[email protected]>
1 parent 4cf3e14 commit ab664b8

File tree

4 files changed

+31
-8
lines changed

4 files changed

+31
-8
lines changed

.pylintrc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -448,7 +448,8 @@ disable=raw-checker-failed,
448448
pointless-statement,
449449
wrong-import-order,
450450
line-too-long,
451-
dangerous-default-value
451+
dangerous-default-value,
452+
too-many-instance-attributes
452453

453454
# Enable the message, report, category or checker with the given id(s). You can
454455
# either give multiple identifier separated by comma (,) or put this option

.spellcheck-en-custom.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,5 @@ TODO
2626
tox
2727
venv
2828
vllm
29+
barebones
30+
LM

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.4.2
2+
3+
* Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood; otherwise, it passes the model a barebones prompt.
4+
15
## 0.4
26

37
* Added ability to specify a custom http client to MT-Bench

src/instructlab/eval/mmlu.py

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ class AbstractMMLUEvaluator(Evaluator):
102102
few_shots number of examples
103103
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
104104
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
105+
system_prompt system prompt to be used when applying the chat template
105106
"""
106107

107108
def __init__(
@@ -113,8 +114,10 @@ def __init__(
113114
few_shots: int = 5,
114115
batch_size: Optional[Union[int, str]] = "auto",
115116
device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
117+
system_prompt: Optional[str] = None,
116118
) -> None:
117119
self.model_path = model_path
120+
self.system_prompt = system_prompt
118121
self.tasks_dir = tasks_dir
119122
self.tasks = tasks
120123
self.model_dtype = model_dtype
@@ -168,6 +171,7 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
168171
if not os.access(self.tasks_dir, os.R_OK):
169172
raise InvalidTasksDirError(self.tasks_dir)
170173
tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir)
174+
should_apply_chat_template = self.system_prompt is not None
171175
mmlu_output = self._simple_evaluate_with_error_handling(
172176
model=model,
173177
model_args=model_args,
@@ -176,6 +180,8 @@ def _run_mmlu(self, server_url: str | None = None) -> dict:
176180
batch_size=self.batch_size,
177181
device=self.device,
178182
task_manager=tm,
183+
system_instruction=self.system_prompt,
184+
apply_chat_template=should_apply_chat_template,
179185
)
180186
results = mmlu_output["results"]
181187
return results
@@ -213,12 +219,13 @@ class MMLUEvaluator(AbstractMMLUEvaluator):
213219
Evaluator for Massive Multitask Language Understanding (MMLU)
214220
215221
Attributes:
216-
model_path absolute path to or name of a huggingface model
217-
tasks list of tasks for MMLU to test the model with
218-
model_dtype dtype of model when served
219-
few_shots number of examples
220-
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
221-
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
222+
model_path absolute path to or name of a huggingface model
223+
tasks list of tasks for MMLU to test the model with
224+
model_dtype dtype of model when served
225+
few_shots number of examples
226+
batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'.
227+
device PyTorch device (e.g. "cpu" or "cuda:0") for running models
228+
system_prompt system prompt to be used when applying the chat template
222229
"""
223230

224231
name = "mmlu"
@@ -231,9 +238,17 @@ def __init__(
231238
few_shots: int = 5,
232239
batch_size: Optional[Union[int, str]] = "auto",
233240
device: str = ("cuda" if torch.cuda.is_available() else "cpu"),
241+
system_prompt: Optional[str] = None,
234242
) -> None:
235243
super().__init__(
236-
model_path, None, tasks, model_dtype, few_shots, batch_size, device
244+
model_path,
245+
None,
246+
tasks,
247+
model_dtype,
248+
few_shots,
249+
batch_size,
250+
device,
251+
system_prompt=system_prompt,
237252
)
238253

239254

@@ -243,6 +258,7 @@ class MMLUBranchEvaluator(AbstractMMLUEvaluator):
243258
244259
Attributes:
245260
model_path absolute path to or name of a huggingface model
261+
system_prompt system prompt to be used when applying the chat template
246262
tasks_dir path where the <TASK_NAME>.jsonl and <TASK_NAME>_task.yaml files for the branches being evaluated are stored
247263
tasks group name that is shared by all the MMLUBranch tasks
248264
model_dtype dtype of model when served

0 commit comments

Comments
 (0)