Skip to content

Commit 172c83f

Browse files
black-yt committed
add SGI-Bench 1.0 (deep research, wet experiment, dry experiment)
Co-authored-by: black-yt <[email protected]>
1 parent 78b4ab0 commit 172c83f

File tree

5 files changed

+948
-1
lines changed

5 files changed

+948
-1
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
from typing import Any, Dict, List
2+
from datasets import load_dataset
3+
from scieval import *
4+
from ...smp import *
5+
from ..text_base import TextBaseDataset
6+
from ..utils.judge_util import *
7+
8+
9+
def extract_final_answer(answer_with_thinking: str, start_tag='<answer>', end_tag='</answer>'):
    """Extract the text wrapped by the LAST start_tag and its following end_tag.

    The input is coerced to ``str`` first, so non-string predictions are
    handled gracefully. Returns the stripped inner text, or ``None`` when no
    complete tag pair exists after the final occurrence of ``start_tag``.
    """
    text = str(answer_with_thinking)
    opening = text.rfind(start_tag)
    if opening == -1:
        # No opening tag at all -> no extractable answer.
        return None
    closing = text.find(end_tag, opening)
    if closing == -1:
        # Opening tag never closed -> treat as malformed, no answer.
        return None
    return text[opening + len(start_tag):closing].strip()
17+
18+
19+
class SGI_Bench_Deep_Research(TextBaseDataset):
    """SGI-Bench deep-research split: free-form QA scored by exact match.

    Loads the ``InternScience/SGI-DeepResearch`` test split from Hugging Face,
    prompts the model to emit its final answer inside ``<answer>...</answer>``
    tags, and evaluates by exact string match on the extracted answer.
    """

    TYPE = 'QA'

    @classmethod
    def supported_datasets(cls):
        """Dataset names this class can serve."""
        return ["SGI-DeepResearch"]

    def load_data(self, dataset):
        """Load the HF test split into a DataFrame, one row per problem.

        The ``dataset`` argument is accepted for interface compatibility;
        the HF repo/split is fixed.
        """
        hf = load_dataset("InternScience/SGI-DeepResearch", split="test")

        rows: List[Dict[str, Any]] = [
            {
                "index": idx,           # sequential index used by the framework
                "id": prob["idx"],      # dataset-native identifier
                "question": prob["question"],
                "steps": prob["steps"],
                "answer": prob["answer"],
                "discipline": prob["discipline"],
                "direction": prob["direction"],
                "type": prob["type"],
            }
            for idx, prob in enumerate(hf)
        ]
        return pd.DataFrame(rows)

    def build_prompt(self, line):
        """Build the text prompt for one problem.

        ``line`` may be an integer row index into ``self.data`` or a
        row/dict with a ``'question'`` field. Returns a single-message list
        in the framework's ``{'type', 'value'}`` format.
        """
        if isinstance(line, int):
            line = self.data.iloc[line]
        question = line['question'] + """
You can reason step by step before giving the final answer. The final answer should be enclosed by <answer> and </answer>.

Example:
Step 1. ...
Step 2. ...
...
<answer>1.00</answer>
"""

        msgs = [{'type': 'text', 'value': question}]
        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        """Score predictions in ``eval_file`` by exact match.

        Writes a per-row score CSV and an aggregate JSON next to
        ``eval_file`` and returns the aggregate ``{"Exact_Match": ...}``.
        """
        data = load(eval_file)
        data = pd.DataFrame(data)

        data['exact_match'] = 0
        exact_match_count = 0
        for index, row in data.iterrows():
            # Coerce the gold answer to a stripped string: the extracted
            # prediction is always a stripped str, so non-string answers
            # (ints/floats from the dataset) could otherwise never match.
            if extract_final_answer(row['prediction']) == str(row['answer']).strip():
                data.loc[index, 'exact_match'] = 1
                exact_match_count += 1

        score_file = get_intermediate_file_path(eval_file, '_score', 'csv')
        # Guard the division so an empty eval file reports 0.0 instead of
        # raising ZeroDivisionError.
        result = {"Exact_Match": exact_match_count / len(data) if len(data) else 0.0}
        result_file = get_intermediate_file_path(eval_file, '_result', 'json')
        dump(data, score_file)
        dump(result, result_file)
        return result

0 commit comments

Comments
 (0)