
Commit f64dfa5

Authored by chancharikmitra, Luodian, and coderabbitai[bot]

Add CameraBench_VQA (#725)

* Added CameraBench_VQA
* Apply suggestions from code review

Co-authored-by: Li Bo <[email protected]>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

1 parent 86a93f9 commit f64dfa5

File tree: 2 files changed, +330 −0 lines

Lines changed: 32 additions & 0 deletions — task YAML config
@@ -0,0 +1,32 @@
dataset_path: chancharikm/camerabench_vqa_lmms_eval # The name of the dataset as listed on the HF datasets Hub.
dataset_kwargs:
  token: True
  cache_dir: camerabench_vqa
  video: True
task: "camerabench_vqa" # The task name registered with the task manager. If registration succeeds, you can run lmms_eval with `--tasks camerabench_vqa`.
test_split: test # The dataset split to use as the test split.
output_type: generate_until # The type of model output for this task. Options are `generate_until`, `loglikelihood`, and `multiple_choice`.
doc_to_visual: !function utils.cambench_doc_to_visual # Processes a sample into the visual input for the model.
doc_to_text: !function utils.cambench_doc_to_text # Processes a sample into the text prompt for the model.
doc_to_target: "answer" # The field (or function) giving the target answer; for `multiple_choice` tasks this would be the list of string choices.
generation_kwargs: # Auxiliary arguments for the `generate` function of the HF transformers library, consumed by the individual model files.
  max_new_tokens: 16
  temperature: 0
  top_p: 1.0
  num_beams: 1
  do_sample: false
# The return value of process_results is consumed by the metrics below.
process_results: !function utils.cambench_process_results
# A metric name can be either a registered metric function (as for GQA) or a key returned by process_results.
# For example, MME defines a custom `mme_perception_score`: `mme_process_results` returns `{"mme_perception_score": {sub_k: sub_v, ...}}`,
# and `mme_aggregate_results` then receives `{sub_k: sub_v, ...}` and uses it to compute the final accuracy.
metric_list:
  - metric: cambench_Q_ACC # Metric name; process_results returns `{metric_name: result}`, and the aggregation function turns those results into the final score.
    aggregation: !function utils.cambench_aggregate_results_Q_ACC # Aggregation function used for evaluation.
    higher_is_better: true # Whether a higher value is better.
  - metric: cambench_ACC # Metric name; process_results returns `{metric_name: result}`, and the aggregation function turns those results into the final score.
    aggregation: !function utils.cambench_aggregate_results_ACC # Aggregation function used for evaluation.
    higher_is_better: true # Whether a higher value is better.
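
As a quick orientation before the utilities file, here is a minimal launch sketch. It assumes lmms_eval's standard CLI flags (`--model`, `--tasks`, `--batch_size`, `--output_path`, `--log_samples`); the `llava_onevision` model name is only a placeholder, so swap in whatever backend you are evaluating.

# Minimal launch sketch (illustrative; adjust the model and paths to your setup).
import subprocess

subprocess.run(
    [
        "python", "-m", "lmms_eval",
        "--model", "llava_onevision",   # placeholder model backend
        "--tasks", "camerabench_vqa",   # task name registered by the YAML above
        "--batch_size", "1",
        "--output_path", "./logs",
        "--log_samples",
    ],
    check=True,
)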

Lines changed: 298 additions & 0 deletions — task utilities module (imported by the config as `utils`)
@@ -0,0 +1,298 @@
import os
import re

from loguru import logger as eval_logger  # logger used below for warnings and error reporting

dir_name = os.path.dirname(os.path.abspath(__file__))

SUFFIX_FOR_VQA = {
    "yes_no": "Please answer Yes or No.",
    "multiple_choice": "Please output the letter corresponding to the correct option.",
}


def get_scores(scores):
    """
    Calculate various scores based on the given results.

    Args:
        scores (dict or list): A dictionary or list containing results, where each result can be:
            - dict: {id: {"q0_i0": 1 or 0, "q0_i1": 1 or 0, "q1_i0": 1 or 0, "q1_i1": 1 or 0}, ...}
            - list: [[q0_i0 (1 or 0), q0_i1 (1 or 0), q1_i0 (1 or 0), q1_i1 (1 or 0)], ...]

            The keys "q0_i0", "q0_i1", "q1_i0", "q1_i1" represent combinations of questions and images:
            - "q0_i0" means question_0 on image_0
            - "q0_i1" means question_0 on image_1
            - "q1_i0" means question_1 on image_0
            - "q1_i1" means question_1 on image_1

    Returns:
        dict: A dictionary containing the calculated scores:
            - 'Acc': average binary VQA accuracy
            - 'Q_Acc': average question accuracy
            - 'I_Acc': average image accuracy
            - 'G_Acc': average group accuracy
    """
    Q_Acc = 0.0
    I_Acc = 0.0
    Acc = 0.0
    G_Acc = 0.0

    num_samples = len(scores)

    def calculate_image_score(result):
        # +1 for each image whose two questions follow the expected pattern
        # (Yes to q0 / No to q1 on image_0; Yes to q1 / No to q0 on image_1).
        image_correct = 0
        if isinstance(result, dict):
            if result["q0_i0"] == 1.0 and result["q1_i0"] == 0.0:
                image_correct += 1
            if result["q1_i1"] == 1.0 and result["q0_i1"] == 0.0:
                image_correct += 1
        elif isinstance(result, list):
            if result[0] == 1.0 and result[2] == 0.0:
                image_correct += 1
            if result[3] == 1.0 and result[1] == 0.0:
                image_correct += 1
        return image_correct

    def calculate_question_score(result):
        # +1 for each question answered as expected on both images
        # (q0: Yes on image_0, No on image_1; q1: the reverse).
        text_correct = 0
        if isinstance(result, dict):
            if result["q0_i0"] == 1.0 and result["q0_i1"] == 0.0:
                text_correct += 1
            if result["q1_i1"] == 1.0 and result["q1_i0"] == 0.0:
                text_correct += 1
        else:
            if result[0] == 1.0 and result[1] == 0.0:
                text_correct += 1
            if result[3] == 1.0 and result[2] == 0.0:
                text_correct += 1
        return text_correct

    def calculate_binary_score(result):
        # +1 for each of the four (question, image) pairs whose answer matches the expected Yes/No pattern.
        binary_score_correct = 0
        if isinstance(result, dict):
            binary_score_correct += 1 if result["q0_i0"] == 1.0 else 0
            binary_score_correct += 1 if result["q0_i1"] == 0.0 else 0
            binary_score_correct += 1 if result["q1_i0"] == 0.0 else 0
            binary_score_correct += 1 if result["q1_i1"] == 1.0 else 0
        else:
            binary_score_correct += 1 if result[0] == 1.0 else 0
            binary_score_correct += 1 if result[1] == 0.0 else 0
            binary_score_correct += 1 if result[2] == 0.0 else 0
            binary_score_correct += 1 if result[3] == 1.0 else 0

        return binary_score_correct

    def calculate_group_score(result):
        # 1 only if both questions and both images are fully consistent with the expected pattern.
        group_correct = 0
        if calculate_question_score(result) == 2 and calculate_image_score(result) == 2:
            group_correct += 1

        return group_correct

    if isinstance(scores, dict):
        for _, result in scores.items():
            Q_Acc += calculate_question_score(result)
            I_Acc += calculate_image_score(result)
            Acc += calculate_binary_score(result)
            G_Acc += calculate_group_score(result)
    else:
        for result in scores:
            Q_Acc += calculate_question_score(result)
            I_Acc += calculate_image_score(result)
            Acc += calculate_binary_score(result)
            G_Acc += calculate_group_score(result)

    results = {
        "Q_Acc": Q_Acc / float(num_samples * 2),
        "I_Acc": I_Acc / float(num_samples * 2),
        "Acc": Acc / float(num_samples * 4),
        "G_Acc": G_Acc / num_samples,
    }

    return results


def extract_answer(output_string, task_type="yes_no"):
    """
    Extract the answer from the model's output string based on the task type.

    Parameters:
        output_string (str): The output string.
        task_type (str): The type of task. Must be "yes_no", as CameraBench does not have "multiple_choice" questions.

    Returns:
        int: 1 if "yes" (or "A"), 0 if "no" (or "B"), and -1 if no relevant answer is found.

    Raises:
        ValueError: If an unsupported task_type is provided.
    """

    def find_word_position(string, word):
        # Index of the first whole-word, case-insensitive match, or -1 if absent.
        pattern = r"\b" + re.escape(word) + r"\b"
        match = re.search(pattern, string, re.IGNORECASE)
        if match:
            return match.start()
        return -1

    if task_type != "yes_no":
        raise ValueError("Task type not supported. Must be 'yes_no'; CameraBench VQA only has 'yes_no' questions.")

    # if task_type == "yes_no":
    position_yes_and_a = find_word_position(output_string, "yes")
    position_no_and_b = find_word_position(output_string, "no")
    # elif task_type == "multiple_choice":
    #     position_yes_and_a = find_word_position(output_string, "A")
    #     position_no_and_b = find_word_position(output_string, "B")

    if position_yes_and_a == -1 and position_no_and_b == -1:
        eval_logger.warning(f"No answer found in the output string: {output_string}.")
        return -1
    elif position_yes_and_a != -1 and position_no_and_b != -1:
        # If both appear, the earlier occurrence wins.
        return 1 if position_yes_and_a < position_no_and_b else 0
    else:
        return 0 if position_yes_and_a == -1 else 1


def cambench_doc_to_visual(doc):
    try:
        default_path = os.path.join(os.getenv("HOME"), ".cache/huggingface")
        load_path = os.path.expanduser(
            os.path.join(
                os.getenv("HF_HOME", default_path),
                "camerabench_vqa/datasets--chancharikm--camerabench_vqa_lmms_eval/snapshots",
            )
        )

        if not os.path.exists(load_path):
            raise FileNotFoundError(f"Dataset path not found: {load_path}")

        snapshots = os.listdir(load_path)
        if not snapshots:
            raise FileNotFoundError(f"No snapshots found in: {load_path}")

        snapshot_path = os.path.join(load_path, snapshots[0])
        video_path = os.path.join(snapshot_path, doc["Video"])

        if not os.path.exists(video_path):
            raise FileNotFoundError(f"Video file not found: {video_path}")

        return [video_path]
    except Exception as e:
        eval_logger.error(f"Error constructing video path: {e}")
        raise


def cambench_doc_to_text(doc):
    question = doc["Question"]
    question = question + " " + SUFFIX_FOR_VQA["yes_no"]
    # if doc["Question_Type"] == "yes_no":
    #     question = question + " " + SUFFIX_FOR_VQA["yes_no"]
    # elif doc["Question_Type"] == "multiple_choice":
    #     question = question + " " + SUFFIX_FOR_VQA["multiple_choice"]
    return question


def cambench_process_results(doc, results):
    """
    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary mapping each metric name (here, the CameraBench accuracy variants) to its per-sample value
    """
    pred = results[0]
    # type = doc["Question_Type"]
    pred_ans = extract_answer(pred, task_type="yes_no")
    return {
        "cambench_G_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_Q_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_I_ACC": {"id": doc["Index"], "score": pred_ans},
        "cambench_ACC": {"id": doc["Index"], "score": pred_ans},
    }


def _cambench_group_results(results):
    """
    Group the flat per-question results into one four-way entry per sample.

    Each sample contributes four consecutive results with ids 4i, 4i+1, 4i+2, 4i+3,
    stored as {"q0_i0", "q0_i1", "q1_i0", "q1_i1"} in the order expected by `get_scores`.
    """
    assert len(results) == 1900 * 4
    answers = {}
    number_answered_samples = len(results) // 4
    for i in range(number_answered_samples):
        assert int(results[i * 4]["id"]) == i * 4
        assert int(results[i * 4 + 1]["id"]) == i * 4 + 1
        assert int(results[i * 4 + 2]["id"]) == i * 4 + 2
        assert int(results[i * 4 + 3]["id"]) == i * 4 + 3
        answers[i] = {
            "q0_i0": results[i * 4]["score"],
            "q0_i1": results[i * 4 + 1]["score"],
            "q1_i0": results[i * 4 + 2]["score"],
            "q1_i1": results[i * 4 + 3]["score"],
        }
    return answers


def cambench_aggregate_results_G_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The group accuracy (G_Acc).
    """
    scores = get_scores(_cambench_group_results(results))
    # eval_logger.info(f"G_Acc: {scores['G_Acc']:.2f}")
    return scores["G_Acc"]


def cambench_aggregate_results_Q_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The question accuracy (Q_Acc).
    """
    scores = get_scores(_cambench_group_results(results))
    # eval_logger.info(f"Q_Acc: {scores['Q_Acc']:.2f}")
    return scores["Q_Acc"]


def cambench_aggregate_results_I_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The image accuracy (I_Acc).
    """
    scores = get_scores(_cambench_group_results(results))
    # eval_logger.info(f"I_Acc: {scores['I_Acc']:.2f}")
    return scores["I_Acc"]


def cambench_aggregate_results_ACC(results):
    """
    Args:
        results: a list of values returned by process_results
    Returns:
        The binary accuracy (Acc).
    """
    scores = get_scores(_cambench_group_results(results))
    # eval_logger.info(f"Acc: {scores['Acc']:.2f}")
    return scores["Acc"]
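
To make the per-sample flow concrete, the sketch below walks one hypothetical document through `cambench_doc_to_text`, a stand-in model generation, and `cambench_process_results`. The field names (`Index`, `Question`, `Video`) mirror the ones the utilities read; the values are invented for illustration, and the import assumes this module is importable as `utils`.

# Illustrative per-sample flow (hypothetical document values).
from utils import cambench_doc_to_text, cambench_process_results

doc = {"Index": 0, "Question": "Does the camera zoom in?", "Video": "clips/000.mp4"}

prompt = cambench_doc_to_text(doc)
print(prompt)  # Does the camera zoom in? Please answer Yes or No.

model_output = "Yes, it zooms in slowly."  # stand-in for a model generation
per_sample = cambench_process_results(doc, [model_output])
print(per_sample["cambench_ACC"])  # {'id': 0, 'score': 1}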
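
To check the arithmetic in `get_scores`, here is a small worked example: two groups of four binary answers in `[q0_i0, q0_i1, q1_i0, q1_i1]` order, one matching the expected Yes/No pattern exactly and one off by a single answer. The import again assumes the module is importable as `utils`.

# Worked scoring example for extract_answer and get_scores (toy data, list form).
from utils import extract_answer, get_scores

# extract_answer maps free-form generations to the binary values used below.
assert extract_answer("Yes, the camera pans left.") == 1
assert extract_answer("No.") == 0

toy = [
    [1.0, 0.0, 0.0, 1.0],  # expected pattern: +2 question, +2 image, +4 binary, +1 group
    [1.0, 1.0, 0.0, 1.0],  # "Yes" to q0 on both videos: +1 question, +1 image, +3 binary, +0 group
]
print(get_scores(toy))
# {'Q_Acc': 0.75, 'I_Acc': 0.75, 'Acc': 0.875, 'G_Acc': 0.5}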
