import os
import re

# lmms-eval's shared logger; this module uses it in cambench_doc_to_visual.
from loguru import logger as eval_logger

dir_name = os.path.dirname(os.path.abspath(__file__))

SUFFIX_FOR_VQA = {
    "yes_no": "Please answer Yes or No.",
    "multiple_choice": "Please output the letter corresponding to the correct option.",
}


def get_scores(scores):
    """
    Calculate the CameraBench VQA metrics from per-question binary results.

    Args:
        scores (dict or list): A dictionary or list of results, where each result is either:
            - dict: {id: {"q0_i0": 1 or 0, "q0_i1": 1 or 0, "q1_i0": 1 or 0, "q1_i1": 1 or 0}, ...}
            - list: [[q0_i0 (1 or 0), q0_i1 (1 or 0), q1_i0 (1 or 0), q1_i1 (1 or 0)], ...]

            The keys "q0_i0", "q0_i1", "q1_i0", "q1_i1" represent combinations of
            questions and images (videos, in CameraBench's case):
            - "q0_i0" means question_0 on image_0
            - "q0_i1" means question_0 on image_1
            - "q1_i0" means question_1 on image_0
            - "q1_i1" means question_1 on image_1

    Returns:
        dict: A dictionary containing the calculated scores:
            - 'Acc': average accuracy over all individual binary VQA answers
            - 'Q_Acc': average question accuracy (a question counts only if answered correctly on both images)
            - 'I_Acc': average image accuracy (an image counts only if both of its questions are answered correctly)
            - 'G_Acc': average group accuracy (all four answers in the group must be correct)
    """
    Q_Acc = 0.0
    I_Acc = 0.0
    Acc = 0.0
    G_Acc = 0.0

    num_samples = len(scores)

    def calculate_image_score(result):
        # An image is counted as correct only if both questions about it are
        # answered correctly (the expected pattern is yes/no on image_0 and
        # no/yes on image_1).
        image_correct = 0
        if isinstance(result, dict):
            if result["q0_i0"] == 1.0 and result["q1_i0"] == 0.0:
                image_correct += 1
            if result["q1_i1"] == 1.0 and result["q0_i1"] == 0.0:
                image_correct += 1
        elif isinstance(result, list):
            if result[0] == 1.0 and result[2] == 0.0:
                image_correct += 1
            if result[3] == 1.0 and result[1] == 0.0:
                image_correct += 1
        return image_correct

    def calculate_question_score(result):
        # A question is counted as correct only if it is answered correctly on
        # both images (yes on one, no on the other).
        text_correct = 0
        if isinstance(result, dict):
            if result["q0_i0"] == 1.0 and result["q0_i1"] == 0.0:
                text_correct += 1
            if result["q1_i1"] == 1.0 and result["q1_i0"] == 0.0:
                text_correct += 1
        else:
            if result[0] == 1.0 and result[1] == 0.0:
                text_correct += 1
            if result[3] == 1.0 and result[2] == 0.0:
                text_correct += 1
        return text_correct

    def calculate_binary_score(result):
        # Each of the four answers is scored independently against the
        # expected yes/no pattern.
        binary_score_correct = 0
        if isinstance(result, dict):
            binary_score_correct += 1 if result["q0_i0"] == 1.0 else 0
            binary_score_correct += 1 if result["q0_i1"] == 0.0 else 0
            binary_score_correct += 1 if result["q1_i0"] == 0.0 else 0
            binary_score_correct += 1 if result["q1_i1"] == 1.0 else 0
        else:
            binary_score_correct += 1 if result[0] == 1.0 else 0
            binary_score_correct += 1 if result[1] == 0.0 else 0
            binary_score_correct += 1 if result[2] == 0.0 else 0
            binary_score_correct += 1 if result[3] == 1.0 else 0
        return binary_score_correct

    def calculate_group_score(result):
        # A group is correct only when both questions and both images are
        # fully correct, i.e. all four answers match the expected pattern.
        group_correct = 0
        if calculate_question_score(result) == 2 and calculate_image_score(result) == 2:
            group_correct += 1
        return group_correct

    # Iterate over the results whether they come as a dict keyed by id or as a list.
    results_iter = scores.values() if isinstance(scores, dict) else scores
    for result in results_iter:
        Q_Acc += calculate_question_score(result)
        I_Acc += calculate_image_score(result)
        Acc += calculate_binary_score(result)
        G_Acc += calculate_group_score(result)

    results = {
        "Q_Acc": Q_Acc / float(num_samples * 2),
        "I_Acc": I_Acc / float(num_samples * 2),
        "Acc": Acc / float(num_samples * 4),
        "G_Acc": G_Acc / num_samples,
    }

    return results
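
# A minimal sanity check of the metric definitions (hypothetical inputs, not
# part of the benchmark). A group answered in the expected yes/no/no/yes
# pattern is perfect on all four metrics:
#
#     get_scores([[1, 0, 0, 1]])
#     # -> {"Q_Acc": 1.0, "I_Acc": 1.0, "Acc": 1.0, "G_Acc": 1.0}
#
# while a model that answers "yes" to everything gets half the binary answers
# right but none of the paired metrics:
#
#     get_scores([[1, 1, 1, 1]])
#     # -> {"Q_Acc": 0.0, "I_Acc": 0.0, "Acc": 0.5, "G_Acc": 0.0}

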
def extract_answer(output_string, task_type="yes_no"):
    """
    Extracts the answer from the output string based on the task type.

    Parameters:
        output_string (str): The model output to parse.
        task_type (str): The type of task. Must be "yes_no", since CameraBench
            does not have "multiple_choice" questions.

    Returns:
        int:
            1 if "yes" appears first in the output,
            0 if "no" appears first,
            -1 if neither word is found.

    Raises:
        ValueError: If an unsupported task_type is provided.
    """

    def find_word_position(string, word):
        # Match the word on its own (case-insensitive), not as a substring of
        # another word, and return the position of the first occurrence.
        pattern = r"\b" + re.escape(word) + r"\b"
        match = re.search(pattern, string, re.IGNORECASE)
        if match:
            return match.start()
        return -1
    if task_type != "yes_no":
        raise ValueError("Task type not supported. Must be 'yes_no'; CameraBench VQA only has 'yes_no' questions.")

    position_yes = find_word_position(output_string, "yes")
    position_no = find_word_position(output_string, "no")
    # CameraBench has no "multiple_choice" questions; for such tasks the
    # lookups would instead target the option letters "A" and "B".

    if position_yes == -1 and position_no == -1:
        eval_logger.warning(f"No answer found in the output string: {output_string}.")
        return -1
    elif position_yes != -1 and position_no != -1:
        # Both words appear: take whichever occurs first.
        return 1 if position_yes < position_no else 0
    else:
        return 0 if position_yes == -1 else 1
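
# Example behavior (hypothetical model outputs):
#
#     extract_answer("Yes, the camera pans left.")   # -> 1
#     extract_answer("No.")                          # -> 0
#     extract_answer("The motion is unclear.")       # -> -1 (and logs a warning)
#
# Matching is on whole words, so "noon" or "yesterday" would not count.

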
def cambench_doc_to_visual(doc):
    """Resolve the on-disk path of the video referenced by a dataset document."""
    try:
        default_path = os.path.join(os.getenv("HOME"), ".cache/huggingface")
        load_path = os.path.expanduser(
            os.path.join(
                os.getenv("HF_HOME", default_path),
                "camerabench_vqa/datasets--chancharikm--camerabench_vqa_lmms_eval/snapshots",
            )
        )

        if not os.path.exists(load_path):
            raise FileNotFoundError(f"Dataset path not found: {load_path}")

        snapshots = os.listdir(load_path)
        if not snapshots:
            raise FileNotFoundError(f"No snapshots found in: {load_path}")

        # Use the first snapshot found; the cache is expected to contain only one.
        snapshot_path = os.path.join(load_path, snapshots[0])
        video_path = os.path.join(snapshot_path, doc["Video"])

        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Video file not found: {video_path}")

        return [video_path]
    except Exception as e:
        eval_logger.error(f"Error constructing video path: {e}")
        raise
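
# For reference, this assumes the Hugging Face hub cache layout, with HF_HOME
# falling back to ~/.cache/huggingface (the snapshot hash below is a
# hypothetical placeholder):
#
#   $HF_HOME/camerabench_vqa/datasets--chancharikm--camerabench_vqa_lmms_eval/
#       snapshots/<snapshot_hash>/<doc["Video"]>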


def cambench_doc_to_text(doc):
    question = doc["Question"]
    # All CameraBench VQA questions are yes/no, so the yes/no suffix is always
    # appended; the "multiple_choice" suffix in SUFFIX_FOR_VQA is unused here.
    question = question + " " + SUFFIX_FOR_VQA["yes_no"]
    return question
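
# For example (hypothetical question text):
#
#     cambench_doc_to_text({"Question": "Does the camera zoom in?"})
#     # -> "Does the camera zoom in? Please answer Yes or No."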


def cambench_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary with key: metric name (the CameraBench accuracy variants), value: metric value
    """
    pred = results[0]
    # The same extracted answer is recorded once per metric; the per-metric
    # grouping and scoring happen later in the aggregation functions.
    pred_ans = extract_answer(pred, task_type="yes_no")
    return {
        "cambench_G_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_Q_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_I_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_ACC": {"id": doc["Index"], "score": pred_ans},
    }
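
# Shape of one processed result (hypothetical doc and prediction):
#
#     cambench_process_results({"Index": 0}, ["Yes, it pans."])
#     # -> {"cambench_G_ACC": {"id": 0, "score": 1},
#     #     "cambench_Q_ACC": {"id": 0, "score": 1},
#     #     "cambench_I_ACC": {"id": 0, "score": 1},
#     #     "cambench_ACC":   {"id": 0, "score": 1}}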


def _cambench_aggregate_answers(results):
    """
    Group the flat, index-ordered list of per-question results into samples of
    four and compute all CameraBench scores via get_scores.

    Args:
        results: a list of values returned by process_results
    Returns:
        The dict of scores produced by get_scores.
    """
    assert len(results) == 1900 * 4  # CameraBench VQA: 1900 groups of 4 questions
    answers = {}
    number_answered_samples = len(results) // 4
    for i in range(number_answered_samples):
        # Results must be in index order so that ids 4*i .. 4*i + 3 form one group.
        assert int(results[i * 4]["id"]) == i * 4
        assert int(results[i * 4 + 1]["id"]) == i * 4 + 1
        assert int(results[i * 4 + 2]["id"]) == i * 4 + 2
        assert int(results[i * 4 + 3]["id"]) == i * 4 + 3
        answers[i] = {
            "q0_i0": results[i * 4]["score"],
            "q0_i1": results[i * 4 + 1]["score"],
            "q1_i0": results[i * 4 + 2]["score"],
            "q1_i1": results[i * 4 + 3]["score"],
        }
    return get_scores(answers)


def cambench_aggregate_results_G_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The group accuracy (G_Acc).
    """
    return _cambench_aggregate_answers(results)["G_Acc"]


def cambench_aggregate_results_Q_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The question accuracy (Q_Acc).
    """
    return _cambench_aggregate_answers(results)["Q_Acc"]


def cambench_aggregate_results_I_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The image accuracy (I_Acc).
    """
    return _cambench_aggregate_answers(results)["I_Acc"]


def cambench_aggregate_results_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The binary accuracy (Acc).
    """
    return _cambench_aggregate_answers(results)["Acc"]
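
# The aggregators assume index-ordered results where ids 4*i .. 4*i + 3 carry
# the scores for (q0_i0, q0_i1, q1_i0, q1_i1) of group i. For example
# (hypothetical scores), a first group of
#
#     [{"id": 0, "score": 1}, {"id": 1, "score": 0},
#      {"id": 2, "score": 0}, {"id": 3, "score": 1}]
#
# would count toward all four accuracies.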