import re

from loguru import logger as eval_logger
from PIL import Image

# Raise PIL's decompression-bomb pixel limit so very large remote-sensing images
# (up to 1e9 pixels) can be loaded without a DecompressionBombWarning.
Image.MAX_IMAGE_PIXELS = 1_000_000_000

TASK_PAIRs = [
    "Complex reasoning/Anomaly Detection and Interpretation",
    "Complex reasoning/Environmental condition reasoning",
    "Complex reasoning/Route planning",
    "Counting/Counting with changing detection",
    "Counting/Counting with complex reasoning",
    "Counting/Overall counting",
    "Counting/Regional counting",
    "Land use classification/Overall Land use classification",
    "Land use classification/Regional Land use classification",
    "Object properties/Object classification",
    "Object properties/Object color",
    "Object properties/Object motion state",
    "Object spatial relationship/Object spatial relationship",
]


def xlrs_doc_to_visual(doc):
    # Each doc carries a list of PIL images; convert them all to RGB.
    return [img.convert("RGB") for img in doc["image"]]


def xlrs_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    question = doc["question"]
    option_prompt = "The choices are listed below:\n" + "\n".join(doc["multi-choice options"]) + "\n"
    # Optional pre-prompt from the task config; the post-prompt is fixed per task type below.
    pre_prompt = (lmms_eval_specific_kwargs or {}).get("pre_prompt", "")
    assert doc["category"] in TASK_PAIRs, f"Unknown task: {doc['category']}"
    if doc["category"] == "Land use classification/Overall Land use classification":
        # Multi-answer task: more than one option may be correct.
        post_prompt = "\nSelect the best answer(s) for the multiple-choice question based on the image. There may be more than one correct option. Only respond with the letter(s) corresponding to the correct answer(s) (A, B, C, D), with multiple choices separated by spaces. The answer(s) is(are):"
    else:
        post_prompt = "\nSelect the best answer for the multiple-choice question based on the image. Only respond with the letter corresponding to the correct answer (A, B, C, D).\nThe answer is:"
    return pre_prompt + question + option_prompt + post_prompt


# [Image] [Question] The choices are listed below:
# (A) [Choice A]
# (B) [Choice B]
# (C) [Choice C]
# (D) [Choice D]
# (E) [Choice E]
# Select the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option.
# The best answer is:
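# Note: for "Land use classification/Overall Land use classification" the post-prompt built in
# xlrs_doc_to_text instead allows multiple letters separated by spaces, so a response such as
# "A C" would be valid there (that concrete response is an illustrative assumption, not taken
# from the dataset).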


def extract_characters_regex(s, choices=["(A)", "(B)", "(C)", "(D)", "(E)"]):
    # Some callers pass a non-string payload (dict) instead of text; treat that as an empty prediction.
    if isinstance(s, dict):
        s = ""
    s = s.strip()
    answer_prefixes = [
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "The best option is",
        "The correct option is",
        "Best answer:",
        "Best option:",
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, "")

    if not re.search("[ABCDE]", s):
        return ""
    # Prefer explicitly parenthesised options, then standalone letters, then any A-E character.
    matches = re.findall(r"\(([a-eA-E])\)", s)
    if len(matches) == 0:
        matches = re.findall(r"\b([a-eA-E])\b", s)
    if len(matches) == 0:
        matches = re.findall(r"[a-eA-E]", s)
    if len(matches) == 0:
        return ""
    # Deduplicate, uppercase and sort so multi-answer predictions compare deterministically.
    return "".join(sorted(set(mat.upper() for mat in matches)))
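
# Illustrative behaviour of the fallbacks above (assumed model outputs, not dataset content):
#   extract_characters_regex("The answer is (B).")  -> "B"
#   extract_characters_regex("A C")                 -> "AC"   (multi-answer, space separated)
#   extract_characters_regex("d")                   -> ""     (no uppercase A-E present)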


def xlrs_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case xlrs_micro_score), value: metric value
    """
    pred = results[0]
    pred_ans = extract_characters_regex(pred)
    category, sub_category = doc["category"].split("/")[:2]
    task_category = doc["l2-category"]
    data_dict = {
        "question_id": doc["index"],
        "category": category,
        "sub_category": sub_category,
        "task_category": task_category,
        "pred_answer": pred_ans,
        "answer": doc["answer"],
    }

    return {"xlrs_micro_score": data_dict}
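
# Illustrative shape of one per-doc record handed to the aggregator (values are hypothetical):
# {
#     "question_id": 1024,
#     "category": "Counting",
#     "sub_category": "Overall counting",
#     "task_category": "counting",
#     "pred_answer": "B",
#     "answer": "B",
# }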


def xlrs_aggregate_results(results):
    """
    Args:
        results: a list of per-doc records returned by xlrs_process_results
    Returns:
        The overall micro-averaged accuracy (exact-match count / total items)
    """

    # Pre-build the task -> subtask -> {category: counts} accumulator from TASK_PAIRs.
    metrics = {}
    for task_pair in TASK_PAIRs:
        task, subtask = task_pair.split("/")
        metrics.setdefault(task, {})[subtask] = {}

    for result in results:
        task = result["category"]
        subtask = result["sub_category"]
        category = result["task_category"].lower()
        if "attribute" in category:
            category = category.split("/")[0] + "/attribute"
        # Exact match: predictions and gold answers are compared as unordered sets of letters,
        # so multi-answer questions score 1 only when all and only the correct letters appear.
        cnt = 1 if set(result["pred_answer"]) == set(result["answer"]) else 0
        bucket = metrics[task][subtask].setdefault(category, {"true": 0, "false": 0})
        bucket["true"] += cnt
        bucket["false"] += 1 - cnt

    sum_all, succ_all = 0, 0
    for task, task_values in metrics.items():
        eval_logger.info("*" * 32 + f"{task} (Task Start)")
        cnt_task, sum_task = 0, 0
        for subtask, subtask_value in task_values.items():
            eval_logger.info("+" * 16 + f"{subtask} (Subtask Start)")
            cnt_subtask, sum_subtask = 0, 0
            for category, category_dict in subtask_value.items():
                cnt_subtask += category_dict["true"]
                sum_subtask += category_dict["false"] + category_dict["true"]
                acc = category_dict["true"] / (category_dict["false"] + category_dict["true"])
                eval_logger.info("-" * 4 + "\t" + "Acc " + "{:.4f}".format(acc) + f"\t{category.capitalize()} ({category_dict['false'] + category_dict['true']} items)")

            if sum_subtask == 0:
                acc_subtasks = 0
            else:
                acc_subtasks = cnt_subtask / sum_subtask
            eval_logger.info("+" * 16 + "\t Acc " + "{:.4f}".format(acc_subtasks) + f"\t{subtask} ({sum_subtask} items)")
            cnt_task += cnt_subtask
            sum_task += sum_subtask

        if sum_task == 0:
            acc_task = 0
        else:
            acc_task = cnt_task / sum_task
        succ_all += cnt_task
        sum_all += sum_task
        eval_logger.info("*" * 32 + "Acc " + "{:.4f}".format(acc_task) + f"\t{task} ({sum_task} items)\n")
    eval_logger.info("*" * 32 + "Overall Acc " + "{:.4f}".format(succ_all / sum_all))
    return succ_all / sum_all
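
# Minimal smoke test (illustrative sketch only; in practice these hooks are called by the
# lmms-eval task runner, and the two records below are hypothetical):
if __name__ == "__main__":
    fake_results = [
        {
            "question_id": 0,
            "category": "Counting",
            "sub_category": "Overall counting",
            "task_category": "counting",
            "pred_answer": extract_characters_regex("The answer is (B)."),
            "answer": "B",
        },
        {
            "question_id": 1,
            "category": "Land use classification",
            "sub_category": "Overall Land use classification",
            "task_category": "land use classification",
            "pred_answer": extract_characters_regex("A C"),
            "answer": "AC",
        },
    ]
    print(xlrs_aggregate_results(fake_results))  # expected: 1.0 (both predictions match exactly)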