
Commit 7de55df

[Task] Add new task: XLRS-Bench-lite (#684)
* [Task] Add new task: XLRS-Bench-lite
* [Task] Add new task: XLRS-Bench-lite
1 parent 4036555 commit 7de55df

2 files changed (+200, -0 lines)

lmms_eval/tasks/xlrs/XLRS-lite.yaml

Lines changed: 26 additions & 0 deletions

dataset_path: initiacms/XLRS-Bench-lite
task: xlrs-lite
test_split: train
output_type: generate_until
doc_to_visual: !function mcq_utils.xlrs_doc_to_visual
doc_to_text: !function mcq_utils.xlrs_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 64
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# The return value of process_results will be used by metrics
process_results: !function mcq_utils.xlrs_process_results
# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
  - metric: xlrs_micro_score
    aggregation: !function mcq_utils.xlrs_aggregate_results
    higher_is_better: true
lmms_eval_specific_kwargs:
  default:
    pre_prompt: ""
    post_prompt: ""
metadata:
  - version: 0.0
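
Not part of the commit: as a quick way to sanity-check the config above, the sketch below parses it with plain PyYAML, registering a pass-through constructor for the harness's `!function` tag (lmms_eval itself resolves that tag to the callables in `mcq_utils.py`). The file path is assumed to be relative to the repository root.

```python
# Sanity-check sketch for XLRS-lite.yaml (illustrative only, not part of this commit).
# Assumes only PyYAML is installed; the "!function" values are kept as raw strings
# so that safe_load() accepts the file.
import yaml

yaml.SafeLoader.add_constructor("!function", lambda loader, node: loader.construct_scalar(node))

with open("lmms_eval/tasks/xlrs/XLRS-lite.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["task"])                       # xlrs-lite
print(cfg["generation_kwargs"])          # greedy decoding: temperature 0, do_sample false
print(cfg["metric_list"][0]["metric"])   # xlrs_micro_score
```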

lmms_eval/tasks/xlrs/mcq_utils.py

Lines changed: 174 additions & 0 deletions

import re

from loguru import logger as eval_logger
from PIL import Image

# Raise Pillow's decompression-bomb pixel limit: the benchmark's images are very large.
Image.MAX_IMAGE_PIXELS = 1_000_000_000

TASK_PAIRs = [
    "Complex reasoning/Anomaly Detection and Interpretation",
    "Complex reasoning/Environmental condition reasoning",
    "Complex reasoning/Route planning",
    "Counting/Counting with changing detection",
    "Counting/Counting with complex reasoning",
    "Counting/Overall counting",
    "Counting/Regional counting",
    "Land use classification/Overall Land use classification",
    "Land use classification/Regional Land use classification",
    "Object properties/Object classification",
    "Object properties/Object color",
    "Object properties/Object motion state",
    "Object spatial relationship/Object spatial relationship",
]


def xlrs_doc_to_visual(doc):
    return [img.convert("RGB") for img in doc["image"]]


def xlrs_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    question = doc["question"]
    option_prompt = "The choices are listed below:\n" + "\n".join(doc["multi-choice options"]) + "\n"
    # pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
    # post_prompt = lmms_eval_specific_kwargs["post_prompt"]
    pre_prompt = ""
    assert doc["category"] in TASK_PAIRs, f"Unknown task: {doc['category']}"
    if doc["category"] == "Land use classification/Overall Land use classification":
        post_prompt = "\nSelect the best answer(s) for the multiple-choice question based on the image. There may be more than one correct option. Only respond with the letter(s) corresponding to the correct answer(s) (A, B, C, D), with multiple choices separated by spaces. The answer(s) is(are):"
    else:
        post_prompt = "\nSelect the best answer for the multiple-choice question based on the image. Only respond with the letter corresponding to the correct answer (A, B, C, D).\nThe answer is:"
    question += pre_prompt + option_prompt + post_prompt
    return question


# [Image] [Question] The choices are listed below:
# (A) [Choice A]
# (B) [Choice B]
# (C) [Choice C]
# (D) [Choice D]
# (E) [Choice E]
# Select the best answer to the above multiple-choice question based on the image. Respond with only the letter (A, B, C, D, or E) of the correct option.
# The best answer is:


def extract_characters_regex(s, choices=["(A)", "(B)", "(C)", "(D)", "(E)"]):
    if type(s) is dict:
        s = ""
    s = s.strip()
    answer_prefixes = [
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "The best option is",
        "The correct option is",
        "Best answer:",
        "Best option:",
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, "")

    if not re.search("[ABCDE]", s):
        return ""
    # Prefer bracketed options like "(B)", then bare letters delimited by whitespace or
    # punctuation, and finally any option letter anywhere in the string.
    matches = re.findall(r"\(([a-eA-E])\)", s)
    if len(matches) == 0:
        matches = re.findall(r"(?:^|\s)?([a-eA-E])(?:$|[\s,.])?", s)
    if len(matches) == 0:
        matches = re.findall(r"[a-eA-E]", s)
    if len(matches) == 0:
        return ""
    else:
        # Deduplicate; multi-answer questions are scored as letter sets, so ordering does not matter.
        matches = set(mat.upper() for mat in matches)
    return "".join(matches)


def xlrs_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (in this case xlrs_micro_score), value: metric value
    """
    pred = results[0]
    pred_ans = extract_characters_regex(pred)
    category, sub_category = doc["category"].split("/")[:2]
    task_category = doc["l2-category"]
    data_dict = {
        "question_id": doc["index"],
        "category": category,
        "sub_category": sub_category,
        "task_category": task_category,
        "pred_answer": pred_ans,
        "answer": doc["answer"],
    }

    return {"xlrs_micro_score": data_dict}


def xlrs_aggregate_results(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        A score
    """
    metrics = {}
    for task_pair in TASK_PAIRs:
        task, subtask = task_pair.split("/")
        if task not in metrics:
            metrics[task] = {}
        metrics[task][subtask] = {}

    for result in results:
        Task = result["category"]
        Subtask = result["sub_category"]
        Category = result["task_category"].lower()
        if "attribute" in Category:
            Category = Category.split("/")[0] + "/attribute"
        # exact match: the prediction counts only if its letter set equals the gold letter set
        cnt = 1 if set(result["pred_answer"]) == set(result["answer"]) else 0
        if Category not in metrics[Task][Subtask]:
            metrics[Task][Subtask][Category] = {
                "true": cnt,
                "false": 1 - cnt,
            }
        else:
            metrics[Task][Subtask][Category]["true"] += cnt
            metrics[Task][Subtask][Category]["false"] += 1 - cnt

    sum_all, succ_all = 0, 0
    for task, task_values in metrics.items():
        eval_logger.info("*" * 32 + f"{task} (Task Start)")
        cnt_task, sum_task = 0, 0
        for subtask, subtask_value in task_values.items():
            eval_logger.info("+" * 16 + f"{subtask} (Subtask Start)")
            cnt_subtask, sum_subtask = 0, 0
            for category, category_dict in subtask_value.items():
                cnt_subtask += category_dict["true"]
                sum_subtask += category_dict["false"] + category_dict["true"]
                acc = category_dict["true"] / (category_dict["false"] + category_dict["true"])
                eval_logger.info("-" * 4 + "\t" + "Acc " + "{:.4f}".format(acc) + f"\t{category.capitalize()} ({category_dict['false'] + category_dict['true']} items)")

            if sum_subtask == 0:
                acc_subtask = 0
            else:
                acc_subtask = cnt_subtask / sum_subtask
            eval_logger.info("+" * 16 + "\t Acc " + "{:.4f}".format(acc_subtask) + f"\t{subtask} ({sum_subtask} items)")
            cnt_task += cnt_subtask
            sum_task += sum_subtask

        if sum_task == 0:
            acc_task = 0
        else:
            acc_task = cnt_task / sum_task
        succ_all += cnt_task
        sum_all += sum_task
        eval_logger.info("*" * 32 + "Acc " + "{:.4f}".format(acc_task) + f"\t{task} ({sum_task} items)\n")
    eval_logger.info("*" * 32 + "Overall Acc " + "{:.4f}".format(succ_all / sum_all))
    return succ_all / sum_all
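
Also not part of the commit: a minimal sketch of how the three hooks fit together, using two invented docs and model outputs, and assuming lmms_eval is installed from source with this commit so the new module path imports. Raw generations are parsed by `extract_characters_regex` inside `xlrs_process_results`, and the per-item dicts are then micro-averaged by `xlrs_aggregate_results`.

```python
# Illustrative sketch (not part of this commit): the docs and outputs below are invented.
from lmms_eval.tasks.xlrs.mcq_utils import xlrs_aggregate_results, xlrs_process_results

docs = [
    {"index": 0, "category": "Counting/Overall counting", "l2-category": "overall counting", "answer": "B"},
    {"index": 1, "category": "Counting/Overall counting", "l2-category": "overall counting", "answer": "C"},
]
outputs = ["The answer is (B).", "A"]  # the second prediction is wrong on purpose

results = [xlrs_process_results(doc, [out])["xlrs_micro_score"] for doc, out in zip(docs, outputs)]
print(results[0]["pred_answer"])  # "B" -- letter extracted from the raw generation

score = xlrs_aggregate_results(results)  # logs per-task/subtask accuracy via loguru
print(score)  # 0.5: one exact set match out of two items
```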
