1+ from typing import Any , Dict , List
2+ from datasets import load_dataset
3+ from scieval import *
4+ from ...smp import *
5+ from ..text_base import TextBaseDataset
6+ from ..utils .judge_util import *
7+
8+
def extract_final_answer(answer_with_thinking: str, start_tag='<answer>', end_tag='</answer>'):
    """Pull the final tagged answer out of a model response.

    The response is coerced to ``str`` first, so non-string inputs are
    searched via their string form. The search anchors on the LAST
    occurrence of ``start_tag`` and closes the span at the first
    ``end_tag`` found from that position onward.

    Args:
        answer_with_thinking: Raw model output, possibly containing
            reasoning text around the tagged answer.
        start_tag: Marker that opens the answer span.
        end_tag: Marker that closes the answer span.

    Returns:
        The text between the two markers with surrounding whitespace
        stripped, or ``None`` when the start marker is absent or no end
        marker follows the last start marker.
    """
    text = str(answer_with_thinking)

    # Guard clause: no opening marker anywhere in the response.
    open_pos = text.rfind(start_tag)
    if open_pos == -1:
        return None

    # Guard clause: the last opening marker is never closed.
    close_pos = text.find(end_tag, open_pos)
    if close_pos == -1:
        return None

    payload_start = open_pos + len(start_tag)
    return text[payload_start:close_pos].strip()
17+
18+
class SGI_Bench_Deep_Research(TextBaseDataset):
    """Text QA dataset adapter for ``InternScience/SGI-DeepResearch``.

    Questions are sent with an instruction asking the model to wrap its
    final answer in ``<answer>...</answer>``; evaluation is an exact string
    match between that extracted answer and the reference answer.
    """

    TYPE = 'QA'

    # Instruction appended to every question. Written as explicit string
    # pieces so the prompt text does not depend on source indentation.
    ANSWER_FORMAT_PROMPT = (
        "\n"
        "You can reason step by step before giving the final answer. "
        "The final answer should be enclosed by <answer> and </answer>.\n"
        "\n"
        "Example:\n"
        "Step 1. ...\n"
        "Step 2. ...\n"
        "...\n"
        "<answer>1.00</answer>\n"
    )

    @classmethod
    def supported_datasets(cls):
        """Return the dataset names this class can serve."""
        return ["SGI-DeepResearch"]

    def load_data(self, dataset):
        """Load the ``test`` split and return it as a DataFrame.

        Args:
            dataset: Requested dataset name. Not consulted here; the
                Hugging Face repository and split are fixed.

        Returns:
            A ``pandas.DataFrame`` with one row per problem. ``index`` is a
            0-based running counter; ``id`` carries the problem's own
            ``idx`` field; the remaining columns are copied through.
        """
        hf = load_dataset("InternScience/SGI-DeepResearch", split="test")

        rows: List[Dict[str, Any]] = [
            {
                "index": idx,
                "id": prob["idx"],
                "question": prob["question"],
                "steps": prob["steps"],
                "answer": prob["answer"],
                "discipline": prob["discipline"],
                "direction": prob["direction"],
                "type": prob["type"],
            }
            for idx, prob in enumerate(hf)
        ]
        return pd.DataFrame(rows)

    def build_prompt(self, line):
        """Build the message list for one sample.

        Args:
            line: Either a row (anything supporting ``line['question']``)
                or an ``int`` position into ``self.data``.

        Returns:
            A single-element list holding one text message: the question
            followed by the answer-format instruction.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]
        question = line['question'] + self.ANSWER_FORMAT_PROMPT
        return [{'type': 'text', 'value': question}]

    @staticmethod
    def _is_exact_match(prediction, answer):
        """Return True when the tagged answer in ``prediction`` equals ``answer``.

        The extracted answer is always a stripped string, so the reference
        is coerced to ``str`` and stripped as well before comparing.
        Without this, a reference that is read back from the eval file as a
        non-string value, or that carries surrounding whitespace, could
        never match. A prediction with no extractable answer is never a
        match.
        """
        extracted = extract_final_answer(prediction)
        if extracted is None:
            return False
        return extracted == str(answer).strip()

    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions by exact match and write the result files.

        Adds an ``exact_match`` column (1 or 0) to the loaded predictions,
        dumps the per-sample table to the ``_score`` CSV and the aggregate
        to the ``_result`` JSON, both derived from ``eval_file``.

        Args:
            eval_file: Path of the prediction file; it must provide
                ``prediction`` and ``answer`` columns.
            **judge_kwargs: Accepted for interface compatibility; unused.

        Returns:
            ``{"Exact_Match": accuracy}`` where accuracy is the fraction of
            matching rows, or ``0.0`` when there are no rows.
        """
        data = pd.DataFrame(load(eval_file))
        total = len(data)

        if total == 0:
            # Nothing to score; avoid touching columns that may not exist.
            matches = []
            data['exact_match'] = 0
        else:
            matches = [
                self._is_exact_match(prediction, answer)
                for prediction, answer in zip(data['prediction'], data['answer'])
            ]
            # Positional assignment: correct even if index labels repeat.
            data['exact_match'] = [int(hit) for hit in matches]

        # Guard the division so an empty eval file yields 0.0, not a crash.
        accuracy = sum(matches) / total if total else 0.0
        result = {"Exact_Match": accuracy}

        score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
        result_file = get_intermediate_file_path(eval_file, '_result', 'json')
        dump(data, score_file)
        dump(result, result_file)
        return result
0 commit comments