diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index cffc5144c..068db147f 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -469,7 +469,8 @@ def _loglikelihood_tokens(
             tokenized_contexts_batch.append(tokenized_context)
         # Left truncate the inputs to the maximum length
-        inputs = [input[-self.max_length :] for input in inputs]
+        if self.max_length:  # can be None if the model is initialized with ray
+            inputs = [input[-self.max_length :] for input in inputs]
 
         outputs = self._generate(inputs, generate=False)
 
         flat_index = 0
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 7f1544e98..385a5a407 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -1862,6 +1862,27 @@ def mmlu_helm(line, task_name: str = None):
     )
 
 
+def mmlu_redux_2(line, topic, task_name: str = None):
+    """
+    Ref: https://arxiv.org/abs/2406.04127
+    """
+    query = f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n"
+    query += line["question"] + "\n"
+    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
+    query += "Answer: "
+
+    # Handle answer format - MMLU-Redux-2 uses integer indices directly
+    gold_ix = line["answer"] if isinstance(line["answer"], int) else int(line["answer"])
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=LETTER_INDICES[: len(line["choices"])],
+        gold_index=gold_ix,
+        instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
+    )
+
+
 def mmlu_qa_abstract_algebra(line, task_name: str = None):
     return mmlu_qa(line, "abstract_algebra", task_name)
 
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
index d054060b1..72a8ad399 100644
--- a/src/lighteval/tasks/default_tasks.py
+++ b/src/lighteval/tasks/default_tasks.py
@@ -21921,3 +21921,142 @@
     trust_dataset=True,
     version=0,
 )
+
+# MMLU-Redux-2 Tasks
+_MMLU_REDUX_2_SUBSETS = [
+    "abstract_algebra",
+    "anatomy",
+    "astronomy",
+    "business_ethics",
+    "clinical_knowledge",
+    "college_biology",
+    "college_chemistry",
+    "college_computer_science",
+    "college_mathematics",
+    "college_medicine",
+    "college_physics",
+    "computer_security",
+    "conceptual_physics",
+    "econometrics",
+    "electrical_engineering",
+    "elementary_mathematics",
+    "formal_logic",
+    "global_facts",
+    "high_school_biology",
+    "high_school_chemistry",
+    "high_school_computer_science",
+    "high_school_european_history",
+    "high_school_geography",
+    "high_school_government_and_politics",
+    "high_school_macroeconomics",
+    "high_school_mathematics",
+    "high_school_microeconomics",
+    "high_school_physics",
+    "high_school_psychology",
+    "high_school_statistics",
+    "high_school_us_history",
+    "high_school_world_history",
+    "human_aging",
+    "human_sexuality",
+    "international_law",
+    "jurisprudence",
+    "logical_fallacies",
+    "machine_learning",
+    "management",
+    "marketing",
+    "medical_genetics",
+    "miscellaneous",
+    "moral_disputes",
+    "moral_scenarios",
+    "nutrition",
+    "philosophy",
+    "prehistory",
+    "professional_accounting",
+    "professional_law",
+    "professional_medicine",
+    "professional_psychology",
+    "public_relations",
+    "security_studies",
+    "sociology",
+    "us_foreign_policy",
+    "virology",
+    "world_religions",
+]
+
+_mmlu_redux_2_tasks = {
+    subset: LightevalTaskConfig(
+        name=f"mmlu_redux_2:{subset}",
+        suite=["lighteval"],
+        prompt_function=lambda line, task_name=None, s=subset: prompt.mmlu_redux_2(line, s, task_name),
+        hf_repo="edinburgh-dawg/mmlu-redux-2.0",
+        hf_subset=subset,
+        hf_avail_splits=["test"],
+        evaluation_splits=["test"],
+        few_shots_split=None,
+        few_shots_select=None,
+        generation_size=1,
+        metrics=[Metrics.loglikelihood_acc],
+        stop_sequence=["\n"],
+        trust_dataset=True,
+        version=0,
+    )
+    for subset in _MMLU_REDUX_2_SUBSETS
+}
+
+mmlu_redux_2_abstract_algebra = _mmlu_redux_2_tasks["abstract_algebra"]
+mmlu_redux_2_anatomy = _mmlu_redux_2_tasks["anatomy"]
+mmlu_redux_2_astronomy = _mmlu_redux_2_tasks["astronomy"]
+mmlu_redux_2_business_ethics = _mmlu_redux_2_tasks["business_ethics"]
+mmlu_redux_2_clinical_knowledge = _mmlu_redux_2_tasks["clinical_knowledge"]
+mmlu_redux_2_college_biology = _mmlu_redux_2_tasks["college_biology"]
+mmlu_redux_2_college_chemistry = _mmlu_redux_2_tasks["college_chemistry"]
+mmlu_redux_2_college_computer_science = _mmlu_redux_2_tasks["college_computer_science"]
+mmlu_redux_2_college_mathematics = _mmlu_redux_2_tasks["college_mathematics"]
+mmlu_redux_2_college_medicine = _mmlu_redux_2_tasks["college_medicine"]
+mmlu_redux_2_college_physics = _mmlu_redux_2_tasks["college_physics"]
+mmlu_redux_2_computer_security = _mmlu_redux_2_tasks["computer_security"]
+mmlu_redux_2_conceptual_physics = _mmlu_redux_2_tasks["conceptual_physics"]
+mmlu_redux_2_econometrics = _mmlu_redux_2_tasks["econometrics"]
+mmlu_redux_2_electrical_engineering = _mmlu_redux_2_tasks["electrical_engineering"]
+mmlu_redux_2_elementary_mathematics = _mmlu_redux_2_tasks["elementary_mathematics"]
+mmlu_redux_2_formal_logic = _mmlu_redux_2_tasks["formal_logic"]
+mmlu_redux_2_global_facts = _mmlu_redux_2_tasks["global_facts"]
+mmlu_redux_2_high_school_biology = _mmlu_redux_2_tasks["high_school_biology"]
+mmlu_redux_2_high_school_chemistry = _mmlu_redux_2_tasks["high_school_chemistry"]
+mmlu_redux_2_high_school_computer_science = _mmlu_redux_2_tasks["high_school_computer_science"]
+mmlu_redux_2_high_school_european_history = _mmlu_redux_2_tasks["high_school_european_history"]
+mmlu_redux_2_high_school_geography = _mmlu_redux_2_tasks["high_school_geography"]
+mmlu_redux_2_high_school_government_and_politics = _mmlu_redux_2_tasks["high_school_government_and_politics"]
+mmlu_redux_2_high_school_macroeconomics = _mmlu_redux_2_tasks["high_school_macroeconomics"]
+mmlu_redux_2_high_school_mathematics = _mmlu_redux_2_tasks["high_school_mathematics"]
+mmlu_redux_2_high_school_microeconomics = _mmlu_redux_2_tasks["high_school_microeconomics"]
+mmlu_redux_2_high_school_physics = _mmlu_redux_2_tasks["high_school_physics"]
+mmlu_redux_2_high_school_psychology = _mmlu_redux_2_tasks["high_school_psychology"]
+mmlu_redux_2_high_school_statistics = _mmlu_redux_2_tasks["high_school_statistics"]
+mmlu_redux_2_high_school_us_history = _mmlu_redux_2_tasks["high_school_us_history"]
+mmlu_redux_2_high_school_world_history = _mmlu_redux_2_tasks["high_school_world_history"]
+mmlu_redux_2_human_aging = _mmlu_redux_2_tasks["human_aging"]
+mmlu_redux_2_human_sexuality = _mmlu_redux_2_tasks["human_sexuality"]
+mmlu_redux_2_international_law = _mmlu_redux_2_tasks["international_law"]
+mmlu_redux_2_jurisprudence = _mmlu_redux_2_tasks["jurisprudence"]
+mmlu_redux_2_logical_fallacies = _mmlu_redux_2_tasks["logical_fallacies"]
+mmlu_redux_2_machine_learning = _mmlu_redux_2_tasks["machine_learning"]
+mmlu_redux_2_management = _mmlu_redux_2_tasks["management"]
+mmlu_redux_2_marketing = _mmlu_redux_2_tasks["marketing"]
+mmlu_redux_2_medical_genetics = _mmlu_redux_2_tasks["medical_genetics"]
+mmlu_redux_2_miscellaneous = _mmlu_redux_2_tasks["miscellaneous"]
+mmlu_redux_2_moral_disputes = _mmlu_redux_2_tasks["moral_disputes"]
+mmlu_redux_2_moral_scenarios = _mmlu_redux_2_tasks["moral_scenarios"]
+mmlu_redux_2_nutrition = _mmlu_redux_2_tasks["nutrition"]
+mmlu_redux_2_philosophy = _mmlu_redux_2_tasks["philosophy"]
+mmlu_redux_2_prehistory = _mmlu_redux_2_tasks["prehistory"]
+mmlu_redux_2_professional_accounting = _mmlu_redux_2_tasks["professional_accounting"]
+mmlu_redux_2_professional_law = _mmlu_redux_2_tasks["professional_law"]
+mmlu_redux_2_professional_medicine = _mmlu_redux_2_tasks["professional_medicine"]
+mmlu_redux_2_professional_psychology = _mmlu_redux_2_tasks["professional_psychology"]
+mmlu_redux_2_public_relations = _mmlu_redux_2_tasks["public_relations"]
+mmlu_redux_2_security_studies = _mmlu_redux_2_tasks["security_studies"]
+mmlu_redux_2_sociology = _mmlu_redux_2_tasks["sociology"]
+mmlu_redux_2_us_foreign_policy = _mmlu_redux_2_tasks["us_foreign_policy"]
+mmlu_redux_2_virology = _mmlu_redux_2_tasks["virology"]
+mmlu_redux_2_world_religions = _mmlu_redux_2_tasks["world_religions"]
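Note (not part of the diff): a minimal sketch of what the new mmlu_redux_2 prompt function produces once the patch is applied. The sample row below is made up; only the "question"/"choices"/"answer" field names follow the dataset schema the prompt function assumes.

    from lighteval.tasks.default_prompts import mmlu_redux_2

    # Hypothetical MMLU-Redux-2 row for illustration only
    line = {
        "question": "What is the order of the group Z_4 x Z_2?",
        "choices": ["2", "4", "8", "12"],
        "answer": 2,  # integer index into choices, handled directly as gold_ix
    }
    doc = mmlu_redux_2(line, "abstract_algebra")
    # doc.query renders as:
    #   The following are multiple choice questions (with answers) about abstract algebra.
    #
    #   What is the order of the group Z_4 x Z_2?
    #   A. 2
    #   B. 4
    #   C. 8
    #   D. 12
    #   Answer:
    # doc.choices == ["A", "B", "C", "D"]; doc.gold_index == 2 (i.e. letter "C"),
    # which is what Metrics.loglikelihood_acc scores against.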