Merge pull request #19 from alimaredia/mmlu-branch

nathan-weinberg · web-flow · commit 8494a51728e3 · 2024-06-26T14:30:32.000-04:00
mmlu branch run() complete
diff --git a/src/instructlab/eval/mmlu.py b/src/instructlab/eval/mmlu.py
@@ -5,6 +5,7 @@
 
 # Third Party
 from lm_eval.evaluator import simple_evaluate  # type: ignore
+from lm_eval.tasks import TaskManager  # type: ignore
 
 # First Party
 from instructlab.eval.evaluator import Evaluator
@@ -78,8 +79,8 @@ class MMLUBranchEvaluator(Evaluator):
 
     Attributes:
         model_path  absolute path to or name of a huggingface model
-        sdg_path    path where all the MMLUBranch tasks are stored
-        task        group name that is shared by all the MMLUBranch tasks
+        sdg_path    path where the <TASK_NAME>.jsonl and <TASK_NAME>_task.yaml files for the branches being evaluated are stored
+        tasks       list of MMLUBranch task names to evaluate; each is scored individually and averaged
         few_shots   number of examples
         batch_size  number of GPUs
     """
@@ -88,13 +89,15 @@ def __init__(
         self,
         model_path,
         sdg_path: str,
-        task: str = "mmlu_pr",
+        tasks: list[str],
+        model_dtype="bfloat16",
         few_shots: int = 2,
         batch_size: int = 5,
     ) -> None:
         self.model_path = model_path
         self.sdg_path = sdg_path
-        self.task = task
+        self.tasks = tasks
+        self.model_dtype = model_dtype
         self.few_shots = few_shots
         self.batch_size = batch_size
 
@@ -103,11 +106,34 @@ def run(self) -> tuple:
         Runs MMLUBranch evaluation
 
         Returns:
-            overall_score       MMLUBranch score for the overall model evaluation
-            individual_scores   Individual MMLUBranch scores for each task
-            qa_pairs            Question and answer pairs from the evaluation
+            overall_score       Average MMLUBranch score for the task group
+            individual_scores   Individual MMLUBranch scores for each task in the task group
         """
-        individual_scores: dict[str, float] = {}
-        overall_score: float = 0.0
-        qa_pairs: list[tuple] = []
-        return overall_score, individual_scores, qa_pairs
+        # TODO: make this a parameter for class?
+        os.environ["TOKENIZERS_PARALLELISM"] = "true"
+
+        individual_scores: dict = {}
+        agg_score: float = 0.0
+        model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
+
+        tm = TaskManager(verbosity="DEBUG", include_path=self.sdg_path)
+
+        mmlu_output = simple_evaluate(
+            model="hf",
+            model_args=model_args,
+            tasks=self.tasks,
+            num_fewshot=self.few_shots,
+            batch_size=self.batch_size,
+            task_manager=tm,
+        )
+        results = mmlu_output["results"]
+
+        for task in self.tasks:
+            mmlu_res = results[task]
+            agg_score += float(mmlu_res["acc,none"])
+            individual_scores[task] = {}
+            individual_scores[task]["score"] = float(mmlu_res["acc,none"])
+            individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
+
+        overall_score = float(agg_score / len(self.tasks))
+        return overall_score, individual_scores