Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/current_tasks.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,13 @@
- WildVision 0617 (wildvision_0617)
- WildVision 0630 (wildvision_0630)
- [SeedBench 2 Plus](https://huggingface.co/datasets/AILab-CVC/SEED-Bench-2-plus) (seedbench_2_plus)
- [SalBench](https://salbench.github.io/)
  - p3
  - p3_box
  - p3_box_img
  - o3
  - o3_box
  - o3_box_img

## 2. Multi-image tasks:

Expand Down
123 changes: 123 additions & 0 deletions lmms_eval/tasks/salbench/o3.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# SalBench "o3" task configuration.
# Reads the `O3` config of the salbench-vlm/salbench dataset and evaluates its `test` split.
dataset_path: salbench-vlm/salbench
dataset_name: O3
dataset_kwargs:
  # NOTE(review): presumably forwarded to the dataset loader as an auth-token flag — confirm.
  token: true
task: "o3"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.p3o3_doc_to_visual
doc_to_text: !function utils.p3o3_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 128
  # Sampling options below are intentionally left commented out.
  # temperature: 0
  # top_p: 0
  # num_beams: 1
  # do_sample: false
process_results: !function utils.o3_process_results
metric_list:
  # Metrics aggregated with utils.aggregate_per_sample_score.
  - metric: exact_match
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_precision
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_recall
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_f1
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true

  # Metrics aggregated with the o3-specific all-category functions.
  - metric: all_cat_precision
    aggregation: !function utils.o3_aggregate_all_category_precision
    higher_is_better: true
  - metric: all_cat_recall
    aggregation: !function utils.o3_aggregate_all_category_recall
    higher_is_better: true
  - metric: all_cat_f1
    aggregation: !function utils.o3_aggregate_all_category_f1
    higher_is_better: true

  # Per-category precision / recall / F1; each category is listed exactly once.
  - metric: orientation_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: orientation_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: orientation_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: color_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: color_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: color_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: focus_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: focus_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: focus_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: shape_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: shape_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: shape_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: size_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: size_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: size_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: location_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: location_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: location_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: pattern_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: pattern_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: pattern_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

metadata:
  - version: 0.0
123 changes: 123 additions & 0 deletions lmms_eval/tasks/salbench/o3_box.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# SalBench "o3_box" task configuration.
# Reads the `O3_box` config of the salbench-vlm/salbench dataset and evaluates its `test` split.
dataset_path: salbench-vlm/salbench
dataset_name: O3_box
dataset_kwargs:
  # NOTE(review): presumably forwarded to the dataset loader as an auth-token flag — confirm.
  token: true
task: "o3_box"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.p3o3_doc_to_visual
doc_to_text: !function utils.p3o3_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 128
  # Sampling options below are intentionally left commented out.
  # temperature: 0
  # top_p: 0
  # num_beams: 1
  # do_sample: false
process_results: !function utils.o3_process_results
metric_list:
  # Metrics aggregated with utils.aggregate_per_sample_score.
  - metric: exact_match
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_precision
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_recall
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_f1
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true

  # Metrics aggregated with the o3-specific all-category functions.
  - metric: all_cat_precision
    aggregation: !function utils.o3_aggregate_all_category_precision
    higher_is_better: true
  - metric: all_cat_recall
    aggregation: !function utils.o3_aggregate_all_category_recall
    higher_is_better: true
  - metric: all_cat_f1
    aggregation: !function utils.o3_aggregate_all_category_f1
    higher_is_better: true

  # Per-category precision / recall / F1; each category is listed exactly once.
  - metric: orientation_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: orientation_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: orientation_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: color_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: color_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: color_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: focus_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: focus_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: focus_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: shape_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: shape_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: shape_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: size_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: size_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: size_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: location_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: location_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: location_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: pattern_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: pattern_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: pattern_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

metadata:
  - version: 0.0
123 changes: 123 additions & 0 deletions lmms_eval/tasks/salbench/o3_box_img.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# SalBench "o3_box_img" task configuration.
# Reads the `O3_box_img` config of the salbench-vlm/salbench dataset and evaluates its `test` split.
dataset_path: salbench-vlm/salbench
dataset_name: O3_box_img
dataset_kwargs:
  # NOTE(review): presumably forwarded to the dataset loader as an auth-token flag — confirm.
  token: true
task: "o3_box_img"
test_split: test
output_type: generate_until
doc_to_visual: !function utils.p3o3_doc_to_visual
doc_to_text: !function utils.p3o3_doc_to_text
doc_to_target: "answer"
generation_kwargs:
  max_new_tokens: 128
  # Sampling options below are intentionally left commented out.
  # temperature: 0
  # top_p: 0
  # num_beams: 1
  # do_sample: false
process_results: !function utils.o3_process_results
metric_list:
  # Metrics aggregated with utils.aggregate_per_sample_score.
  - metric: exact_match
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_precision
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_recall
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_f1
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true

  # Metrics aggregated with the o3-specific all-category functions.
  - metric: all_cat_precision
    aggregation: !function utils.o3_aggregate_all_category_precision
    higher_is_better: true
  - metric: all_cat_recall
    aggregation: !function utils.o3_aggregate_all_category_recall
    higher_is_better: true
  - metric: all_cat_f1
    aggregation: !function utils.o3_aggregate_all_category_f1
    higher_is_better: true

  # Per-category precision / recall / F1; each category is listed exactly once.
  - metric: orientation_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: orientation_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: orientation_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: color_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: color_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: color_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: focus_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: focus_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: focus_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: shape_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: shape_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: shape_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: size_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: size_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: size_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: location_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: location_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: location_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: pattern_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: pattern_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: pattern_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

metadata:
  - version: 0.0
Loading
Loading