diff --git a/lmms_eval/tasks/vsibench/_default_template_yaml b/lmms_eval/tasks/vsibench/_default_template_yaml
new file mode 100644
index 000000000..d7688d7b6
--- /dev/null
+++ b/lmms_eval/tasks/vsibench/_default_template_yaml
@@ -0,0 +1,37 @@
+# Shared settings for the VSI-Bench task variants. Each variant YAML sets its
+# own task and dataset_name, then pulls this file in through `include`.
+dataset_path: nyu-visionx/VSI-Bench
+
+output_type: generate_until
+process_docs: !function utils.process_docs
+doc_to_visual: !function utils.vsibench_doc_to_visual
+doc_to_text: !function utils.vsibench_doc_to_text
+doc_to_target: "ground_truth"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.vsibench_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: vsibench_score
+    aggregation: !function utils.vsibench_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    mca_post_prompt: "Answer with the option's letter from the given choices directly."
+    na_post_prompt: "Please answer the question using a single word or phrase."
+  gemini_api:
+    pre_prompt: ""
+    mca_post_prompt: "Answer with the option's letter from the given choices directly."
+    na_post_prompt: "Do not response anything other than a single number!"
+  gpt4v:
+    pre_prompt: ""
+    mca_post_prompt: "Answer with the option's letter from the given choices directly."
+    na_post_prompt: "Do not response anything other than a single number!"
+metadata:
+  - version: 0.0
diff --git a/lmms_eval/tasks/vsibench/vsibench.yaml b/lmms_eval/tasks/vsibench/vsibench.yaml
index a1743cb65..c04896104 100644
--- a/lmms_eval/tasks/vsibench/vsibench.yaml
+++ b/lmms_eval/tasks/vsibench/vsibench.yaml
@@ -1,40 +1,8 @@
-dataset_path: nyu-visionx/VSI-Bench
+dataset_name: full
+test_split: test
+task: "vsibench"
 dataset_kwargs:
   token: True
   cache_dir: vsibench
   video: True
-task: vsibench
-test_split: test
-output_type: generate_until
-process_docs: !function utils.process_docs
-doc_to_visual: !function utils.vsibench_doc_to_visual
-doc_to_text: !function utils.vsibench_doc_to_text
-doc_to_target: "ground_truth"
-generation_kwargs:
-  max_new_tokens: 16
-  temperature: 0
-  top_p: 1.0
-  num_beams: 1
-  do_sample: false
-# The return value of process_results will be used by metrics
-process_results: !function utils.vsibench_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-metric_list:
-  - metric: vsibench_score
-    aggregation: !function utils.vsibench_aggregate_results
-    higher_is_better: true
-lmms_eval_specific_kwargs:
-  default:
-    pre_prompt: ""
-    mca_post_prompt: "Answer with the option's letter from the given choices directly."
-    na_post_prompt: "Please answer the question using a single word or phrase."
-  gemini_api:
-    pre_prompt: ""
-    mca_post_prompt: "Answer with the option's letter from the given choices directly."
-    na_post_prompt: "Do not response anything other than a single number!"
-  gpt4v:
-    pre_prompt: ""
-    mca_post_prompt: "Answer with the option's letter from the given choices directly."
-    na_post_prompt: "Do not response anything other than a single number!"
-metadata:
-  - version: 0.0
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/vsibench/vsibench_debiased.yaml b/lmms_eval/tasks/vsibench/vsibench_debiased.yaml
new file mode 100644
index 000000000..6ae8dde39
--- /dev/null
+++ b/lmms_eval/tasks/vsibench/vsibench_debiased.yaml
@@ -0,0 +1,8 @@
+dataset_name: debiased
+test_split: test
+task: "vsibench_debiased"
+dataset_kwargs:
+  token: True
+  cache_dir: vsibench
+  video: True
+include: _default_template_yaml
diff --git a/lmms_eval/tasks/vsibench/vsibench_pruned.yaml b/lmms_eval/tasks/vsibench/vsibench_pruned.yaml
new file mode 100644
index 000000000..c76a7783e
--- /dev/null
+++ b/lmms_eval/tasks/vsibench/vsibench_pruned.yaml
@@ -0,0 +1,10 @@
+# NOTE(review): dataset_name is "full", the same value vsibench.yaml uses, so
+# this config differs from vsibench only by task name; confirm this is intended.
+dataset_name: full
+test_split: test
+task: "vsibench_pruned"
+dataset_kwargs:
+  token: True
+  cache_dir: vsibench
+  video: True
+include: _default_template_yaml