Skip to content

Commit 8494a51

Browse files
Merge pull request #19 from alimaredia/mmlu-branch
mmlu branch run() complete
2 parents fb97c33 + fc1aa98 commit 8494a51

File tree

1 file changed

+37
-11
lines changed

1 file changed

+37
-11
lines changed

src/instructlab/eval/mmlu.py

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
# Third Party
77
from lm_eval.evaluator import simple_evaluate # type: ignore
8+
from lm_eval.tasks import TaskManager # type: ignore
89

910
# First Party
1011
from instructlab.eval.evaluator import Evaluator
@@ -78,8 +79,8 @@ class MMLUBranchEvaluator(Evaluator):
7879
7980
Attributes:
8081
model_path absolute path to or name of a huggingface model
81-
sdg_path path where all the MMLUBranch tasks are stored
82-
task group name that is shared by all the MMLUBranch tasks
82+
sdg_path path where the <TASK_NAME>.jsonl and <TASK_NAME>_task.yaml files for the branches being evaluated are stored
83+
tasks list of the names of the MMLUBranch tasks to be evaluated
8384
few_shots number of examples
8485
batch_size number of GPUs
8586
"""
@@ -88,13 +89,15 @@ def __init__(
8889
self,
8990
model_path,
9091
sdg_path: str,
91-
task: str = "mmlu_pr",
92+
tasks: list[str],
93+
model_dtype="bfloat16",
9294
few_shots: int = 2,
9395
batch_size: int = 5,
9496
) -> None:
9597
self.model_path = model_path
9698
self.sdg_path = sdg_path
97-
self.task = task
99+
self.tasks = tasks
100+
self.model_dtype = model_dtype
98101
self.few_shots = few_shots
99102
self.batch_size = batch_size
100103

@@ -103,11 +106,34 @@ def run(self) -> tuple:
103106
Runs MMLUBranch evaluation
104107
105108
Returns:
106-
overall_score MMLUBranch score for the overall model evaluation
107-
individual_scores Individual MMLUBranch scores for each task
108-
qa_pairs Question and answer pairs from the evaluation
109+
overall_score Average MMLUBranch score for the task group
110+
individual_scores Individual MMLUBranch scores for each task in the task group
109111
"""
110-
individual_scores: dict[str, float] = {}
111-
overall_score: float = 0.0
112-
qa_pairs: list[tuple] = []
113-
return overall_score, individual_scores, qa_pairs
112+
# TODO: make this a parameter for class?
113+
os.environ["TOKENIZERS_PARALLELISM"] = "true"
114+
115+
individual_scores: dict = {}
116+
agg_score: float = 0.0
117+
model_args = f"pretrained={self.model_path},dtype={self.model_dtype}"
118+
119+
tm = TaskManager(verbosity="DEBUG", include_path=self.sdg_path)
120+
121+
mmlu_output = simple_evaluate(
122+
model="hf",
123+
model_args=model_args,
124+
tasks=self.tasks,
125+
num_fewshot=self.few_shots,
126+
batch_size=self.batch_size,
127+
task_manager=tm,
128+
)
129+
results = mmlu_output["results"]
130+
131+
for task in self.tasks:
132+
mmlu_res = results[task]
133+
agg_score += float(mmlu_res["acc,none"])
134+
individual_scores[task] = {}
135+
individual_scores[task]["score"] = float(mmlu_res["acc,none"])
136+
individual_scores[task]["stderr"] = float(mmlu_res["acc_stderr,none"])
137+
138+
overall_score = float(agg_score / len(self.tasks))
139+
return overall_score, individual_scores

0 commit comments

Comments
 (0)