diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md
index 7c72a2cd565..73a351aadd8 100644
--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -148,6 +148,7 @@ provided to the individual README.md files for each subfolder.
 | [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
 | [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
 | [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
+| [pisa](pisa/README.md) | Multilingual, multimodal tasks derived from the PISA tests, involving reading comprehension and math challenges. | English, German, French, Spanish, Italian, Chinese |
 | [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
 | [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
 | [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
diff --git a/lm_eval/tasks/pisa/README.md b/lm_eval/tasks/pisa/README.md
new file mode 100644
index 00000000000..2f0d29dac5d
--- /dev/null
+++ b/lm_eval/tasks/pisa/README.md
@@ -0,0 +1,62 @@
+# PisaBench
+
+### Paper
+
+Title: PISA-Bench: The PISA Index as a Multilingual and Multimodal Metric for the Evaluation of Vision-Language Models
+
+Abstract: https://arxiv.org/abs/2510.24792
+
+Vision-language models (VLMs) have demonstrated remarkable progress in multimodal reasoning. However, existing benchmarks remain limited in terms of high-quality, human-verified examples. Many current datasets rely on synthetically generated content by large language models (LLMs). Furthermore, most datasets are limited to English, as manual quality assurance of translated samples is time-consuming and costly. To fill this gap, we introduce PISA-Bench, a multilingual benchmark derived from English examples of the expert-created PISA tests, a unified framework for the assessment of student competencies in over eighty countries. Each example consists of human-extracted instructions, questions, answer options, and images, enriched with question type categories, and has been translated from English into five additional languages (Spanish, German, Chinese, French, and Italian), resulting in a fully parallel corpus covering six languages. We evaluate state-of-the-art vision-language models on PISA-Bench and find that especially small models (<20B parameters) fail to achieve high test scores. We further find substantial performance degradation on non-English splits as well as high error-rates when models are tasked with spatial and geometric reasoning. By releasing the dataset and evaluation framework, we provide a resource for advancing research on multilingual multimodal reasoning.
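+
+For illustration, a single record can be loaded and inspected as follows. This is a minimal sketch: it assumes the Hugging Face `datasets` library, and the split and field names mirror the task configs in this folder.
+
+```python
+from datasets import load_dataset
+
+# Language splits are "en", "de", "es", "fr", "it", and "ch" (Chinese).
+ds = load_dataset("PisaBench/pisa-bench", split="en")
+
+sample = ds[0]
+print(sample["question"])  # question text
+print(sample["choices"])   # answer options; parsed with ast.literal_eval in utils.py
+print(sample["answer"])    # index of the correct option
+print(sample["image"])     # accompanying image (may be None)
+```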
+
+HuggingFace Dataset: https://huggingface.co/datasets/PisaBench/pisa-bench
+
+### Citation
+
+```
+@misc{haller2025pisabenchpisaindexmultilingual,
+      title={PISA-Bench: The PISA Index as a Multilingual and Multimodal Metric for the Evaluation of Vision-Language Models},
+      author={Patrick Haller and Fabio Barth and Jonas Golde and Georg Rehm and Alan Akbik},
+      year={2025},
+      eprint={2510.24792},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV},
+      url={https://arxiv.org/abs/2510.24792},
+}
+```
+
+### Groups, Tags, and Tasks
+
+#### Groups
+
+* `pisa`: Evaluates all language splits, scoring answers by substring matching.
+* `pisa_llm_judged`: Evaluates all language splits, scoring answers with an LLM judge (requires an OpenAI API key).
+
+#### Tags
+
+None.
+
+#### Tasks
+
+* `pisa_en`
+* `pisa_de`
+* `pisa_es`
+* `pisa_fr`
+* `pisa_it`
+* `pisa_ch`
+* `pisa_en_llm_judged`
+* `pisa_de_llm_judged`
+* `pisa_es_llm_judged`
+* `pisa_fr_llm_judged`
+* `pisa_it_llm_judged`
+* `pisa_ch_llm_judged`
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
diff --git a/lm_eval/tasks/pisa/_pisa.yaml b/lm_eval/tasks/pisa/_pisa.yaml
new file mode 100644
index 00000000000..93777a5a7de
--- /dev/null
+++ b/lm_eval/tasks/pisa/_pisa.yaml
@@ -0,0 +1,12 @@
+group: pisa
+task:
+  - pisa_de
+  - pisa_fr
+  - pisa_it
+  - pisa_en
+  - pisa_es
+  - pisa_ch
+
+aggregate_metric_list:
+  - metric: acc # or acc_norm, ppl, etc.
+    weight_by_size: false
diff --git a/lm_eval/tasks/pisa/_pisa_llm_judged.yaml b/lm_eval/tasks/pisa/_pisa_llm_judged.yaml
new file mode 100644
index 00000000000..fd073ea6918
--- /dev/null
+++ b/lm_eval/tasks/pisa/_pisa_llm_judged.yaml
@@ -0,0 +1,12 @@
+group: pisa_llm_judged
+task:
+  - pisa_de_llm_judged
+  - pisa_fr_llm_judged
+  - pisa_it_llm_judged
+  - pisa_en_llm_judged
+  - pisa_es_llm_judged
+  - pisa_ch_llm_judged
+
+aggregate_metric_list:
+  - metric: acc # or acc_norm, ppl, etc.
+    weight_by_size: false
diff --git a/lm_eval/tasks/pisa/_template_yaml b/lm_eval/tasks/pisa/_template_yaml
new file mode 100644
index 00000000000..4d5a5adfb09
--- /dev/null
+++ b/lm_eval/tasks/pisa/_template_yaml
@@ -0,0 +1,16 @@
+dataset_path: PisaBench/pisa-bench
+output_type: generate_until
+doc_to_text: !function utils.pisa_doc_to_text
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: answer
+process_results: !function utils.pisa_process_results
+doc_to_image: !function utils.pisa_doc_to_visual
+
+generation_kwargs:
+  until:
+    - "<|endoftext|>"
+
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
diff --git a/lm_eval/tasks/pisa/pisa_ch.yaml b/lm_eval/tasks/pisa/pisa_ch.yaml
new file mode 100644
index 00000000000..b4e3971261d
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_ch.yaml
@@ -0,0 +1,4 @@
+task: pisa_ch
+include: _template_yaml
+task_alias: pisa_ch
+test_split: ch
diff --git a/lm_eval/tasks/pisa/pisa_ch_llm_judged.yaml b/lm_eval/tasks/pisa/pisa_ch_llm_judged.yaml
new file mode 100644
index 00000000000..67a08ca44a0
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_ch_llm_judged.yaml
@@ -0,0 +1,5 @@
+task: pisa_ch_llm_judged
+include: _template_yaml
+task_alias: pisa_ch
+test_split: ch
+process_results: !function utils.pisa_process_results_llm_judged
diff --git a/lm_eval/tasks/pisa/pisa_de.yaml b/lm_eval/tasks/pisa/pisa_de.yaml
new file mode 100644
index 00000000000..cbd73691060
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_de.yaml
@@ -0,0 +1,4 @@
+task: pisa_de
+include: _template_yaml
+task_alias: pisa_de
+test_split: de
diff --git a/lm_eval/tasks/pisa/pisa_de_llm_judged.yaml b/lm_eval/tasks/pisa/pisa_de_llm_judged.yaml
new file mode 100644
index 00000000000..1b70a7ff6ee
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_de_llm_judged.yaml
@@ -0,0 +1,5 @@
+task: pisa_de_llm_judged
+include: _template_yaml
+task_alias: pisa_de
+test_split: de
+process_results: !function utils.pisa_process_results_llm_judged
diff --git a/lm_eval/tasks/pisa/pisa_en.yaml b/lm_eval/tasks/pisa/pisa_en.yaml
new file mode 100644
index 00000000000..f4000d5c857
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_en.yaml
@@ -0,0 +1,4 @@
+task: pisa_en
+include: _template_yaml
+task_alias: pisa_en
+test_split: en
diff --git a/lm_eval/tasks/pisa/pisa_en_llm_judged.yaml b/lm_eval/tasks/pisa/pisa_en_llm_judged.yaml
new file mode 100644
index 00000000000..4662a3e98c4
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_en_llm_judged.yaml
@@ -0,0 +1,5 @@
+task: pisa_en_llm_judged
+include: _template_yaml
+task_alias: pisa_en
+test_split: en
+process_results: !function utils.pisa_process_results_llm_judged
diff --git a/lm_eval/tasks/pisa/pisa_es.yaml b/lm_eval/tasks/pisa/pisa_es.yaml
new file mode 100644
index 00000000000..bc537758870
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_es.yaml
@@ -0,0 +1,4 @@
+task: pisa_es
+include: _template_yaml
+task_alias: pisa_es
+test_split: es
diff --git a/lm_eval/tasks/pisa/pisa_es_llm_judged.yaml b/lm_eval/tasks/pisa/pisa_es_llm_judged.yaml
new file mode 100644
index 00000000000..81b703fd8a6
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_es_llm_judged.yaml
@@ -0,0 +1,5 @@
+task: pisa_es_llm_judged
+include: _template_yaml
+task_alias: pisa_es
+test_split: es
+process_results: !function utils.pisa_process_results_llm_judged
diff --git a/lm_eval/tasks/pisa/pisa_fr.yaml b/lm_eval/tasks/pisa/pisa_fr.yaml
new file mode 100644
index 00000000000..b582d33a0d4
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_fr.yaml
@@ -0,0 +1,4 @@
+task: pisa_fr
+include: _template_yaml
+task_alias: pisa_fr
+test_split: fr
diff --git a/lm_eval/tasks/pisa/pisa_fr_llm_judged.yaml b/lm_eval/tasks/pisa/pisa_fr_llm_judged.yaml
new file mode 100644
index 00000000000..1bfcd073692
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_fr_llm_judged.yaml
@@ -0,0 +1,5 @@
+task: pisa_fr_llm_judged
+include: _template_yaml
+task_alias: pisa_fr
+test_split: fr
+process_results: !function utils.pisa_process_results_llm_judged
diff --git a/lm_eval/tasks/pisa/pisa_it.yaml b/lm_eval/tasks/pisa/pisa_it.yaml
new file mode 100644
index 00000000000..a84dc597d93
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_it.yaml
@@ -0,0 +1,4 @@
+task: pisa_it
+include: _template_yaml
+task_alias: pisa_it
+test_split: it
diff --git a/lm_eval/tasks/pisa/pisa_it_llm_judged.yaml b/lm_eval/tasks/pisa/pisa_it_llm_judged.yaml
new file mode 100644
index 00000000000..10492dfb23a
--- /dev/null
+++ b/lm_eval/tasks/pisa/pisa_it_llm_judged.yaml
@@ -0,0 +1,5 @@
+task: pisa_it_llm_judged
+include: _template_yaml
+task_alias: pisa_it
+test_split: it
+process_results: !function utils.pisa_process_results_llm_judged
diff --git a/lm_eval/tasks/pisa/utils.py b/lm_eval/tasks/pisa/utils.py
new file mode 100644
index 00000000000..abef6ffb96d
--- /dev/null
+++ b/lm_eval/tasks/pisa/utils.py
@@ -0,0 +1,329 @@
+import ast
+import os
+import random
+import re
+from typing import List
+
+import numpy as np
+
+
+try:
+    from openai import OpenAI
+except ImportError:
+    pass
+
+API_TYPE = os.getenv("API_TYPE", "openai")
+MODEL_VERSION = os.getenv("MODEL_VERSION", "gpt-4.1-mini")
+
+SYSTEM_PROMPT = """You are an impartial grader for multiple-choice questions.
+You are given:
+1) the model's free-form output (student_answer),
+2) the available options (each with a letter and text),
+3) the correct answer (by letter and/or text).
+
+Your job:
+- Extract which single option the student intended (by letter if present, otherwise by best semantic match to the option text).
+- Compare that choice to the correct answer.
+- Output only a single character: 1 if correct, 0 if incorrect.
+No explanation. No extra characters. Just 1 or 0."""
+
+
+def replace_images_tokens(input_string):
+    for i in range(1, 8):
+        question_text = f"<image {i}>"
+        query_text = "<image>"
+        if question_text in input_string:
+            input_string = input_string.replace(question_text, query_text)
+    return input_string
+
+
+def parse_options(options):
+    option_letters = [chr(ord("A") + i) for i in range(len(options))]
+    choices_str = "\n".join(
+        [
+            f"{option_letter}. {option}"
{option}" + for option_letter, option in zip(option_letters, options) + ] + ) + return choices_str + + +def construct_prompt(doc, mc_prompt=""): + question = doc["question"] + parsed_options = parse_options(ast.literal_eval(f"{doc['choices']}")) + question = f"Given the provided image , answer following questions:\n{question}\n{parsed_options}\n\n{mc_prompt}" + return question + + +def pisa_doc_to_text(doc): + question = construct_prompt(doc) + return question + + +def pisa_doc_to_visual(doc): + image_key = "image" + if doc[image_key] is None: + return None + return [doc[image_key]] + + +def pisa_process_results(doc, results, **kwargs): + """Default evaluation of answers based on substring matching.""" + index2ans, all_choices = get_multi_choice_info( + ast.literal_eval(f"{doc['choices']}") + ) + parsed_pred = parse_multi_choice_response(results[0], all_choices, index2ans) + gold_i = doc["answer"] + pred_i = all_choices.index(parsed_pred) if parsed_pred in all_choices else None + is_correct = gold_i == pred_i if pred_i is not None else False + + return { + "acc": float(is_correct), + } + + +def pisa_process_results_llm_judged(doc, results, **kwargs): + """Evaluation of answers based on LLM as a judge.""" + assert os.getenv("OPENAI_API_KEY") is not None, ( + "OPENAI_API_KEY environment variable is not set." + ) + try: + from openai import OpenAI + except ImportError: + raise ImportError("Please install openai package to use LLM judging.") + + index2ans, all_choices = get_multi_choice_info( + ast.literal_eval(f"{doc['choices']}") + ) + gold_i = doc["answer"] + correct_answer = index2ans[all_choices[gold_i]] + is_correct = ( + judge_mcq( + results[0], + [f"{k}) {v}" for k, v in index2ans.items()], + f"{chr(ord('A') + gold_i)}) {correct_answer}", + ) + == 1 + ) + + return { + "acc": float(is_correct), + } + + +def eval_multi_choice(gold_i, pred_i): + """Evaluate a multiple choice instance.""" + correct = False + # only they are exactly the same, we consider it as correct + if isinstance(gold_i, list): + for answer in gold_i: + if answer == pred_i: + correct = True + break + else: # gold_i is a string + if gold_i == pred_i: + correct = True + return correct + + +def eval_open(gold_i, pred_i): + """ + Evaluate an open question instance + https://github.com/pisa-Benchmark/pisa/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L191 + """ + correct = False + if isinstance(gold_i, list): + # use float to avoid trivial matches + norm_answers = [] + for answer in gold_i: + norm_answers.extend(normalize_str(answer)) + else: + norm_answers = normalize_str(gold_i) + for pred in pred_i: # pred is already normalized in parse response phase + if isinstance(pred, str): # if it's a string, then find if ans in the pred_i + for norm_ans in norm_answers: + # only see if the string answer in the string pred + if isinstance(norm_ans, str) and norm_ans in pred: + if not correct: + correct = True + break + else: # it's a float number + if pred in norm_answers: + if not correct: + correct = True + break + return correct + + +def parse_multi_choice_response(response, all_choices, index2ans): + """ + Parse the prediction from the generated response. + Return the predicted index e.g., A, B, C, D. 
+    https://github.com/pisa-Benchmark/pisa/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10
+    """
+    for char in [",", ".", "!", "?", ";", ":", "'"]:
+        response = response.strip(char)
+    response = " " + response + " "  # add space to avoid partial match
+
+    index_ans = True
+    ans_with_brack = False
+    candidates = []
+    for choice in all_choices:  # e.g., (A) (B) (C) (D)
+        if f"({choice})" in response:
+            candidates.append(choice)
+            ans_with_brack = True
+
+    if len(candidates) == 0:
+        for choice in all_choices:  # e.g., A B C D
+            if f"{choice} " in response:
+                candidates.append(choice)
+
+    if len(candidates) == 0:
+        for choice in all_choices:  # e.g., A. B. C. D.
+            if f"{choice}." in response:
+                candidates.append(choice)
+
+    # if all above doesn't get candidates, check if the content is larger than 5 tokens and try to parse the example
+    if len(candidates) == 0 and len(response.split()) > 5:
+        for index, ans in index2ans.items():
+            if ans.lower() in response.lower():
+                candidates.append(index)
+                index_ans = False  # it's content ans.
+
+    if len(candidates) == 0:  # still not get answer, randomly choose one.
+        pred_index = random.choice(all_choices)
+    elif len(candidates) > 1:
+        start_indexes = []
+        if index_ans:
+            if ans_with_brack:
+                for can in candidates:
+                    index = response.rfind(f"({can})")
+                    start_indexes.append(index)  # -1 will be ignored anyway
+                # start_indexes = [generated_response.index(f'({can})') for can in candidates]
+            else:
+                for can in candidates:
+                    index = response.rfind(f" {can} ")
+                    start_indexes.append(index)
+        else:
+            for can in candidates:
+                index = response.lower().rfind(index2ans[can].lower())
+                start_indexes.append(index)
+        # get the last one
+        pred_index = candidates[np.argmax(start_indexes)]
+    else:  # if only one candidate, use it.
+        pred_index = candidates[0]
+
+    return pred_index
+
+
+def extract_numbers(string):
+    """
+    Extract all forms of numbers from a string with regex.
+    """
+    # Pattern for numbers with commas
+    pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b"
+    # Pattern for scientific notation
+    pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
+    # Pattern for simple numbers without commas
+    pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])"
+
+    # Extract numbers with commas
+    numbers_with_commas = re.findall(pattern_commas, string)
+    # Extract numbers in scientific notation
+    numbers_scientific = re.findall(pattern_scientific, string)
+    # Extract simple numbers without commas
+    numbers_simple = re.findall(pattern_simple, string)
+
+    # Combine all extracted numbers
+    all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
+    return all_numbers
+
+
+def check_is_number(string):
+    try:
+        float(string.replace(",", ""))
+        return True
+    except ValueError:
+        # check if there's comma inside
+        return False
+
+
+def normalize_str(string):
+    """Normalize the str to lower case and make them float numbers if possible."""
+    # check if characters in the string
+
+    # if number, numerize it.
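+    # e.g. "1,234" -> [1234.0]; "Paris" -> ["paris"]; "a" -> [" a", "a "]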
+    string = string.strip()
+
+    is_number = check_is_number(string)
+
+    if is_number:
+        string = string.replace(",", "")
+        string = float(string)
+        # leave 2 decimal
+        string = round(string, 2)
+        return [string]
+    else:  # it's likely to be a string
+        # lower it
+        string = string.lower()
+        if len(string) == 1:
+            return [" " + string, string + " "]  # avoid trivial matches
+        return [string]
+
+
+def get_multi_choice_info(options):
+    """
+    Given the list of options for multiple choice question
+    Return the index2ans and all_choices
+    """
+
+    start_chr = "A"
+    all_choices = []
+    index2ans = {}
+    for i, option in enumerate(options):
+        index2ans[chr(ord(start_chr) + i)] = option
+        all_choices.append(chr(ord(start_chr) + i))
+
+    return index2ans, all_choices
+
+
+# LLM as a judge utils
+def build_user_prompt(student_answer: str, options: List[str], correct: str) -> str:
+    """
+    options: like ["A) red", "B) blue", "C) green", "D) yellow"]
+    correct: either a letter like "B" or the full option text. Both are provided to help you.
+    """
+    return f"""Student Answer:
+{student_answer.strip()}
+
+Options:
+{chr(10).join(options)}
+
+Correct Answer (letter and/or text):
+{correct.strip()}
+
+Instructions:
+- If student gives multiple letters, pick the *final* one.
+- If no clear letter, pick the best-matching option by meaning.
+- Output only 1 or 0.
+"""
+
+
+def judge_mcq(pred: str, options: List[str], correct: str) -> int:
+    client = OpenAI(
+        api_key=os.getenv("OPENAI_API_KEY"),
+    )
+
+    user_prompt = build_user_prompt(pred, options, correct)
+
+    resp = client.chat.completions.create(
+        model=MODEL_VERSION,
+        temperature=0,
+        max_tokens=1,
+        messages=[
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": user_prompt},
+        ],
+    )
+    raw = resp.choices[0].message.content.strip()
+    return 1 if raw == "1" else 0
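
For reference, the substring-matching path used by `pisa_process_results` can be exercised in isolation. The sketch below is illustrative only (run it from `lm_eval/tasks/pisa/` so that `utils` is importable; the option texts are made up):

```python
# Illustrative check of the default (non-LLM-judged) scoring path.
from utils import get_multi_choice_info, parse_multi_choice_response

index2ans, all_choices = get_multi_choice_info(["red", "blue", "green", "yellow"])
# index2ans == {"A": "red", "B": "blue", "C": "green", "D": "yellow"}

# A lettered answer is matched directly...
print(parse_multi_choice_response("I think the answer is (B).", all_choices, index2ans))  # -> "B"

# ...while a free-form answer falls back to matching the option text.
print(parse_multi_choice_response("The picture is mostly green, so green.", all_choices, index2ans))  # -> "C"
```

The `*_llm_judged` variants replace this parser with `judge_mcq`, which sends the raw model output, the lettered options, and the gold answer to an OpenAI chat model (`MODEL_VERSION`, default `gpt-4.1-mini`) and scores 1/0 from the judge's single-character reply.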