
Commit f64dfa5

Authored by chancharikmitra, Luodian, and coderabbitai[bot]

Add CameraBench_VQA (#725)

* Added CameraBench_VQA
* Apply suggestions from code review

Co-authored-by: Li Bo <[email protected]>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

1 parent 86a93f9 commit f64dfa5

File tree: 2 files changed, +330 −0 lines

Lines changed: 32 additions & 0 deletions — task YAML config
@@ -0,0 +1,32 @@
dataset_path: chancharikm/camerabench_vqa_lmms_eval # The name of the dataset as listed on the HF datasets Hub.
dataset_kwargs:
  token: True
  cache_dir: camerabench_vqa
  video: True
task: "camerabench_vqa" # The task name registered with the task manager. If registration succeeds, you can run lmms_eval with `--tasks camerabench_vqa`.
test_split: test # The dataset split to use as the test split.
output_type: generate_until # The type of model output for this task. Options are `generate_until`, `loglikelihood`, and `multiple_choice`.
doc_to_visual: !function utils.cambench_doc_to_visual # Processes a sample into the visual input for the model.
doc_to_text: !function utils.cambench_doc_to_text # Processes a sample into the text prompt for the model.
doc_to_target: "answer" # The field (or function) giving the target answer; for `multiple_choice` tasks this would be the list of string choices.
generation_kwargs: # Auxiliary arguments for the `generate` function of the HF transformers library, consumed by the individual model files.
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# The return value of process_results is consumed by the metrics below.
process_results: !function utils.cambench_process_results
# A metric name can be either a registered metric function (as for GQA) or a key returned by process_results.
# For example, MME defines a custom `mme_perception_score`: `mme_process_results` returns `{"mme_perception_score": {sub_k: sub_v, ...}}`,
# and `mme_aggregate_results` then receives `{sub_k: sub_v, ...}` and uses it to compute the final accuracy.
metric_list:
  - metric: cambench_Q_ACC # Metric name; process_results returns `{metric_name: result}`, and the aggregation function turns those results into the final score.
    aggregation: !function utils.cambench_aggregate_results_Q_ACC # Aggregation function used for evaluation.
    higher_is_better: true # Whether a higher value is better.
  - metric: cambench_ACC # Metric name; process_results returns `{metric_name: result}`, and the aggregation function turns those results into the final score.
    aggregation: !function utils.cambench_aggregate_results_ACC # Aggregation function used for evaluation.
    higher_is_better: true # Whether a higher value is better.
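
As a quick orientation before the utilities file, here is a minimal launch sketch. It assumes lmms_eval's standard CLI flags (`--model`, `--tasks`, `--batch_size`, `--output_path`, `--log_samples`); the `llava_onevision` model name is only a placeholder, so swap in whatever backend you are evaluating.

# Minimal launch sketch (illustrative; adjust the model and paths to your setup).
import subprocess

subprocess.run(
    [
        "python", "-m", "lmms_eval",
        "--model", "llava_onevision",   # placeholder model backend
        "--tasks", "camerabench_vqa",   # task name registered by the YAML above
        "--batch_size", "1",
        "--output_path", "./logs",
        "--log_samples",
    ],
    check=True,
)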

Lines changed: 298 additions & 0 deletions — task utilities module (imported by the config as `utils`)
@@ -0,0 +1,298 @@
import os
import re

from loguru import logger as eval_logger  # logger used below for warnings and error reporting

dir_name = os.path.dirname(os.path.abspath(__file__))

SUFFIX_FOR_VQA = {
    "yes_no": "Please answer Yes or No.",
    "multiple_choice": "Please output the letter corresponding to the correct option.",
}


def get_scores(scores):
    """
    Calculate various scores based on the given results.

    Args:
        scores (dict or list): A dictionary or list containing results, where each result can be:
            - dict: {id: {"q0_i0": 1 or 0, "q0_i1": 1 or 0, "q1_i0": 1 or 0, "q1_i1": 1 or 0}, ...}
            - list: [[q0_i0 (1 or 0), q0_i1 (1 or 0), q1_i0 (1 or 0), q1_i1 (1 or 0)], ...]

            The keys "q0_i0", "q0_i1", "q1_i0", "q1_i1" represent combinations of questions and images:
            - "q0_i0" means question_0 on image_0
            - "q0_i1" means question_0 on image_1
            - "q1_i0" means question_1 on image_0
            - "q1_i1" means question_1 on image_1

    Returns:
        dict: A dictionary containing the calculated scores:
            - 'Acc': average binary VQA accuracy
            - 'Q_Acc': average question accuracy
            - 'I_Acc': average image accuracy
            - 'G_Acc': average group accuracy
    """
    Q_Acc = 0.0
    I_Acc = 0.0
    Acc = 0.0
    G_Acc = 0.0

    num_samples = len(scores)

    def calculate_image_score(result):
        # +1 for each image whose two questions follow the expected pattern
        # (Yes to q0 / No to q1 on image_0; Yes to q1 / No to q0 on image_1).
        image_correct = 0
        if isinstance(result, dict):
            if result["q0_i0"] == 1.0 and result["q1_i0"] == 0.0:
                image_correct += 1
            if result["q1_i1"] == 1.0 and result["q0_i1"] == 0.0:
                image_correct += 1
        elif isinstance(result, list):
            if result[0] == 1.0 and result[2] == 0.0:
                image_correct += 1
            if result[3] == 1.0 and result[1] == 0.0:
                image_correct += 1
        return image_correct

    def calculate_question_score(result):
        # +1 for each question answered as expected on both images
        # (q0: Yes on image_0, No on image_1; q1: the reverse).
        text_correct = 0
        if isinstance(result, dict):
            if result["q0_i0"] == 1.0 and result["q0_i1"] == 0.0:
                text_correct += 1
            if result["q1_i1"] == 1.0 and result["q1_i0"] == 0.0:
                text_correct += 1
        else:
            if result[0] == 1.0 and result[1] == 0.0:
                text_correct += 1
            if result[3] == 1.0 and result[2] == 0.0:
                text_correct += 1
        return text_correct

    def calculate_binary_score(result):
        # +1 for each of the four (question, image) pairs whose answer matches the expected Yes/No pattern.
        binary_score_correct = 0
        if isinstance(result, dict):
            binary_score_correct += 1 if result["q0_i0"] == 1.0 else 0
            binary_score_correct += 1 if result["q0_i1"] == 0.0 else 0
            binary_score_correct += 1 if result["q1_i0"] == 0.0 else 0
            binary_score_correct += 1 if result["q1_i1"] == 1.0 else 0
        else:
            binary_score_correct += 1 if result[0] == 1.0 else 0
            binary_score_correct += 1 if result[1] == 0.0 else 0
            binary_score_correct += 1 if result[2] == 0.0 else 0
            binary_score_correct += 1 if result[3] == 1.0 else 0

        return binary_score_correct

    def calculate_group_score(result):
        # 1 only if both questions and both images are fully consistent with the expected pattern.
        group_correct = 0
        if calculate_question_score(result) == 2 and calculate_image_score(result) == 2:
            group_correct += 1

        return group_correct

    if isinstance(scores, dict):
        for _, result in scores.items():
            Q_Acc += calculate_question_score(result)
            I_Acc += calculate_image_score(result)
            Acc += calculate_binary_score(result)
            G_Acc += calculate_group_score(result)
    else:
        for result in scores:
            Q_Acc += calculate_question_score(result)
            I_Acc += calculate_image_score(result)
            Acc += calculate_binary_score(result)
            G_Acc += calculate_group_score(result)

    results = {
        "Q_Acc": Q_Acc / float(num_samples * 2),
        "I_Acc": I_Acc / float(num_samples * 2),
        "Acc": Acc / float(num_samples * 4),
        "G_Acc": G_Acc / num_samples,
    }

    return results


def extract_answer(output_string, task_type="yes_no"):
    """
    Extract the answer from the model's output string based on the task type.

    Parameters:
        output_string (str): The output string.
        task_type (str): The type of task. Must be "yes_no", as CameraBench does not have "multiple_choice" questions.

    Returns:
        int: 1 if "yes" (or "A"), 0 if "no" (or "B"), and -1 if no relevant answer is found.

    Raises:
        ValueError: If an unsupported task_type is provided.
    """

    def find_word_position(string, word):
        # Index of the first whole-word, case-insensitive match, or -1 if absent.
        pattern = r"\b" + re.escape(word) + r"\b"
        match = re.search(pattern, string, re.IGNORECASE)
        if match:
            return match.start()
        return -1

    if task_type != "yes_no":
        raise ValueError("Task type not supported. Must be 'yes_no'; CameraBench VQA only has 'yes_no' questions.")

    # if task_type == "yes_no":
    position_yes_and_a = find_word_position(output_string, "yes")
    position_no_and_b = find_word_position(output_string, "no")
    # elif task_type == "multiple_choice":
    #     position_yes_and_a = find_word_position(output_string, "A")
    #     position_no_and_b = find_word_position(output_string, "B")

    if position_yes_and_a == -1 and position_no_and_b == -1:
        eval_logger.warning(f"No answer found in the output string: {output_string}.")
        return -1
    elif position_yes_and_a != -1 and position_no_and_b != -1:
        # If both appear, the earlier occurrence wins.
        return 1 if position_yes_and_a < position_no_and_b else 0
    else:
        return 0 if position_yes_and_a == -1 else 1


def cambench_doc_to_visual(doc):
    try:
        default_path = os.path.join(os.getenv("HOME"), ".cache/huggingface")
        load_path = os.path.expanduser(
            os.path.join(
                os.getenv("HF_HOME", default_path),
                "camerabench_vqa/datasets--chancharikm--camerabench_vqa_lmms_eval/snapshots",
            )
        )

        if not os.path.exists(load_path):
            raise FileNotFoundError(f"Dataset path not found: {load_path}")

        snapshots = os.listdir(load_path)
        if not snapshots:
            raise FileNotFoundError(f"No snapshots found in: {load_path}")

        snapshot_path = os.path.join(load_path, snapshots[0])
        video_path = os.path.join(snapshot_path, doc["Video"])

        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Video file not found: {video_path}")

        return [video_path]
    except Exception as e:
        eval_logger.error(f"Error constructing video path: {e}")
        raise


def cambench_doc_to_text(doc):
    question = doc["Question"]
    question = question + " " + SUFFIX_FOR_VQA["yes_no"]
    # if doc["Question_Type"] == "yes_no":
    #     question = question + " " + SUFFIX_FOR_VQA["yes_no"]
    # elif doc["Question_Type"] == "multiple_choice":
    #     question = question + " " + SUFFIX_FOR_VQA["multiple_choice"]
    return question


def cambench_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary mapping each metric name (here, the CameraBench accuracy variants) to its per-sample value
    """
    pred = results[0]
    # type = doc["Question_Type"]
    pred_ans = extract_answer(pred, task_type="yes_no")
    return {
        "cambench_G_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_Q_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_I_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_ACC": {"id": doc["Index"], "score": pred_ans},
    }


def _cambench_group_results(results):
    """
    Group the flat per-question results into one four-way entry per sample.

    Each sample contributes four consecutive results with ids 4i, 4i+1, 4i+2, 4i+3,
    stored as {"q0_i0", "q0_i1", "q1_i0", "q1_i1"} in the order expected by `get_scores`.
    """
    assert len(results) == 1900 * 4
    answers = {}
    number_answered_samples = len(results) // 4
    for i in range(number_answered_samples):
        assert int(results[i * 4]["id"]) == i * 4
        assert int(results[i * 4 + 1]["id"]) == i * 4 + 1
        assert int(results[i * 4 + 2]["id"]) == i * 4 + 2
        assert int(results[i * 4 + 3]["id"]) == i * 4 + 3
        answers[i] = {
            "q0_i0": results[i * 4]["score"],
            "q0_i1": results[i * 4 + 1]["score"],
            "q1_i0": results[i * 4 + 2]["score"],
            "q1_i1": results[i * 4 + 3]["score"],
        }
    return answers


def cambench_aggregate_results_G_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The group accuracy (G_Acc).
    """
    scores = get_scores(_cambench_group_results(results))
    # eval_logger.info(f"G_Acc: {scores['G_Acc']:.2f}")
    return scores["G_Acc"]


def cambench_aggregate_results_Q_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The question accuracy (Q_Acc).
    """
    scores = get_scores(_cambench_group_results(results))
    # eval_logger.info(f"Q_Acc: {scores['Q_Acc']:.2f}")
    return scores["Q_Acc"]


def cambench_aggregate_results_I_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The image accuracy (I_Acc).
    """
    scores = get_scores(_cambench_group_results(results))
    # eval_logger.info(f"I_Acc: {scores['I_Acc']:.2f}")
    return scores["I_Acc"]


def cambench_aggregate_results_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The binary accuracy (Acc).
    """
    scores = get_scores(_cambench_group_results(results))
    # eval_logger.info(f"Acc: {scores['Acc']:.2f}")
    return scores["Acc"]
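
To make the per-sample flow concrete, the sketch below walks one hypothetical document through `cambench_doc_to_text`, a stand-in model generation, and `cambench_process_results`. The field names (`Index`, `Question`, `Video`) mirror the ones the utilities read; the values are invented for illustration, and the import assumes this module is importable as `utils`.

# Illustrative per-sample flow (hypothetical document values).
from utils import cambench_doc_to_text, cambench_process_results

doc = {"Index": 0, "Question": "Does the camera zoom in?", "Video": "clips/000.mp4"}

prompt = cambench_doc_to_text(doc)
print(prompt)  # Does the camera zoom in? Please answer Yes or No.

model_output = "Yes, it zooms in slowly."  # stand-in for a model generation
per_sample = cambench_process_results(doc, [model_output])
print(per_sample["cambench_ACC"])  # {'id': 0, 'score': 1}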
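
To check the arithmetic in `get_scores`, here is a small worked example: two groups of four binary answers in `[q0_i0, q0_i1, q1_i0, q1_i1]` order, one matching the expected Yes/No pattern exactly and one off by a single answer. The import again assumes the module is importable as `utils`.

# Worked scoring example for extract_answer and get_scores (toy data, list form).
from utils import extract_answer, get_scores

# extract_answer maps free-form generations to the binary values used below.
assert extract_answer("Yes, the camera pans left.") == 1
assert extract_answer("No.") == 0

toy = [
    [1.0, 0.0, 0.0, 1.0],  # expected pattern: +2 question, +2 image, +4 binary, +1 group
    [1.0, 1.0, 0.0, 1.0],  # "Yes" to q0 on both videos: +1 question, +1 image, +3 binary, +0 group
]
print(get_scores(toy))
# {'Q_Acc': 0.75, 'I_Acc': 0.75, 'Acc': 0.875, 'G_Acc': 0.5}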
