55
66# Third Party
77from lm_eval .evaluator import simple_evaluate # type: ignore
8+ from lm_eval .tasks import TaskManager # type: ignore
89
910# First Party
1011from instructlab .eval .evaluator import Evaluator
@@ -78,8 +79,8 @@ class MMLUBranchEvaluator(Evaluator):
7879
7980 Attributes:
8081 model_path absolute path to or name of a huggingface model
81- sdg_path path where all the MMLUBranch tasks are stored
82- task group name that is shared by all the MMLUBranch tasks
82+ sdg_path path where the <TASK_NAME>.jsonl and <TASK_NAME>_task.yaml files for the branches being evaluated are stored
83+ tasks list of MMLUBranch task names to evaluate
8384 few_shots number of examples
8485 batch_size batch size passed to lm_eval's simple_evaluate
8586 """
@@ -88,13 +89,15 @@ def __init__(
8889 self ,
8990 model_path ,
9091 sdg_path : str ,
91- task : str = "mmlu_pr" ,
92+ tasks : list [str ],
93+ model_dtype = "bfloat16" ,
9294 few_shots : int = 2 ,
9395 batch_size : int = 5 ,
9496 ) -> None :
9597 self .model_path = model_path
9698 self .sdg_path = sdg_path
97- self .task = task
99+ self .tasks = tasks
100+ self .model_dtype = model_dtype
98101 self .few_shots = few_shots
99102 self .batch_size = batch_size
100103
@@ -103,11 +106,34 @@ def run(self) -> tuple:
103106 Runs MMLUBranch evaluation
104107
105108 Returns:
106- overall_score MMLUBranch score for the overall model evaluation
107- individual_scores Individual MMLUBranch scores for each task
108- qa_pairs Question and answer pairs from the evaluation
109+ overall_score Average MMLUBranch score for the task group
110+ individual_scores Individual MMLUBranch scores for each task in the task group
109111 """
110- individual_scores : dict [str , float ] = {}
111- overall_score : float = 0.0
112- qa_pairs : list [tuple ] = []
113- return overall_score , individual_scores , qa_pairs
112+ # TODO: make this a parameter for class?
113+ os .environ ["TOKENIZERS_PARALLELISM" ] = "true"
114+
115+ individual_scores : dict = {}
116+ agg_score : float = 0.0
117+ model_args = f"pretrained={ self .model_path } ,dtype={ self .model_dtype } "
118+
119+ tm = TaskManager (verbosity = "DEBUG" , include_path = self .sdg_path )
120+
121+ mmlu_output = simple_evaluate (
122+ model = "hf" ,
123+ model_args = model_args ,
124+ tasks = self .tasks ,
125+ num_fewshot = self .few_shots ,
126+ batch_size = self .batch_size ,
127+ task_manager = tm ,
128+ )
129+ results = mmlu_output ["results" ]
130+
131+ for task in self .tasks :
132+ mmlu_res = results [task ]
133+ agg_score += float (mmlu_res ["acc,none" ])
134+ individual_scores [task ] = {}
135+ individual_scores [task ]["score" ] = float (mmlu_res ["acc,none" ])
136+ individual_scores [task ]["stderr" ] = float (mmlu_res ["acc_stderr,none" ])
137+
138+ overall_score = float (agg_score / len (self .tasks ))
139+ return overall_score , individual_scores
0 commit comments