# Dataset to evaluate on. The value has the shape of a Hugging Face Hub
# repo ID (NOTE(review): presumably resolved by the harness loader - confirm).
dataset_path: salbench-vlm/salbench
dataset_kwargs:
  # Canonical lowercase boolean; loads to the same value as `True`.
  # NOTE(review): presumably forwards the local Hub auth token - confirm.
  token: true
4+
# Split to evaluate and the harness output mode.
test_split: test
output_type: generate_until
# Prompt-building hooks, referenced through the custom `!function` tag.
doc_to_visual: !function utils.p3o3_doc_to_visual
doc_to_text: !function utils.p3o3_doc_to_text
# Quoted so the value is always read as a string.
# NOTE(review): presumably the document field holding the gold answer.
doc_to_target: "answer"
# Generation settings; only the token budget is set explicitly.
generation_kwargs:
  max_new_tokens: 128
  # The decoding options below are currently disabled (commented out).
  # temperature: 0
  # top_p: 0
  # num_beams: 1
  # do_sample: false
16+
# Hook that post-processes each model response.
# NOTE(review): this hook uses the `o3_` prefix while the doc_to_* hooks use
# `p3o3_` and the all-category aggregations use `p3_` - confirm this is the
# intended pairing.
process_results: !function utils.o3_process_results

# Every entry is a mapping of metric name, aggregation hook and direction.
# All metrics are declared as higher-is-better.
metric_list:
  # Per-sample metrics; all four share one aggregation hook.
  - metric: exact_match
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_precision
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_recall
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true
  - metric: sample_f1
    aggregation: !function utils.aggregate_per_sample_score
    higher_is_better: true

  # Metrics aggregated over all categories.
  - metric: all_cat_precision
    aggregation: !function utils.p3_aggregate_all_category_precision
    higher_is_better: true
  - metric: all_cat_recall
    aggregation: !function utils.p3_aggregate_all_category_recall
    higher_is_better: true
  - metric: all_cat_f1
    aggregation: !function utils.p3_aggregate_all_category_f1
    higher_is_better: true

  # Per-category metrics. The orientation, color and size groups reference
  # the same three aggregation hooks.
  # NOTE(review): presumably each metric receives category-specific results
  # from process_results - verify in utils.
  - metric: orientation_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: orientation_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: orientation_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: color_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: color_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: color_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true

  - metric: size_precision
    aggregation: !function utils.aggregate_per_category_precision
    higher_is_better: true
  - metric: size_recall
    aggregation: !function utils.aggregate_per_category_recall
    higher_is_better: true
  - metric: size_f1
    aggregation: !function utils.aggregate_per_category_f1
    higher_is_better: true
71+
# Task metadata, kept as a one-item list holding the config version.
metadata:
  - version: 0.0