New tasks supported: EMMA (#790)

Devininthelab · web-flow · commit 9a895c4424b7 · 2025-08-07T09:30:45.000+08:00
* init emma

* remove log files

* Update .gitignore

* add minor changes
diff --git a/.gitignore b/.gitignore
@@ -49,4 +49,4 @@ outputs/
 span.log
 uv.lock
 workspace/*
-.claude/*
+.claude/*
diff --git a/lmms_eval/tasks/emma/emma_all.yaml b/lmms_eval/tasks/emma/emma_all.yaml
@@ -0,0 +1,27 @@
+dataset_path: lmms-lab/EMMA
+dataset_name: All  # Options available are: "All" for all data, "Chemistry" for chemistry only, "Physics" for physics only, "Coding" for code only and "Math" for math only
+dataset_kwargs:
+  token: True
+  cache_dir: EMMA
+  force_download: true
+task: "emma"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.emma_doc_to_visual
+doc_to_text: !function utils.emma_doc_to_text
+doc_to_target: utils.emma_doc_to_target  # NOTE(review): no `!function` tag here, unlike the sibling doc_to_* keys, so this value is a plain string - confirm this is intended
+doc_to_messages: !function utils.emma_doc_to_messages
+generation_kwargs:
+  max_new_tokens: 4096
+  temperature: 0.7
+# The return value of process_results will be used by metrics
+process_results: !function utils.emma_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: emma_score
+    aggregation: !function utils.emma_aggregate_results
+    higher_is_better: true
+metadata:
+  strategy: CoT
+  interleaved_format: True
+  use_lmms_judge: True
diff --git a/lmms_eval/tasks/emma/emma_mini_all.yaml b/lmms_eval/tasks/emma/emma_mini_all.yaml
@@ -0,0 +1,26 @@
+dataset_path: lmms-lab/EMMA-mini
+dataset_name: All # Options available are: "All" for all data, "Chemistry" for chemistry only, "Physics" for physics only, "Coding" for code only and "Math" for math only
+dataset_kwargs:
+  token: True
+  cache_dir: EMMA-mini
+  force_download: true
+task: "emma-mini"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.emma_doc_to_visual
+doc_to_text: !function utils.emma_doc_to_text
+doc_to_target: utils.emma_doc_to_target  # NOTE(review): no `!function` tag here, unlike the sibling doc_to_* keys, so this value is a plain string - confirm this is intended
+generation_kwargs:
+  max_new_tokens: 4096
+  temperature: 0.7
+# The return value of process_results will be used by metrics
+process_results: !function utils.emma_process_results
+# Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
+metric_list:
+  - metric: emma_score
+    aggregation: !function utils.emma_aggregate_results
+    higher_is_better: true
+metadata:
+  strategy: CoT
+  interleaved_format: False
+  use_lmms_judge: True
diff --git a/lmms_eval/tasks/emma/utils.py b/lmms_eval/tasks/emma/utils.py