
Commit d8f3340

Author: root

Add SGI-Bench-1.0 (deep research, dry experiment, wet experiment, experimental reasoning)

1 parent 124921c commit d8f3340

File tree

10 files changed: +1296 -2 lines changed


docs/en/Quickstart.md

Lines changed: 2 additions & 0 deletions

@@ -193,6 +193,8 @@ Some datasets have specific requirements during evaluation:
* **SciCode:**
  * **Environment Dependencies:** Before running, you need to download the runtime dependency file `test_data.h5` according to the [official instructions](https://github.com/scicode-bench/SciCode) and place it in the `scieval/dataset/SciCode/eval/data` directory.
  * **Evaluation Files:** By default, the framework stores the model's inference results in an `xlsx` file for easy viewing. However, for SciCode, the output length of some models, such as `deepseek-R1`, may exceed the `xlsx` cell length limit. In this case, set the environment variable `PRED_FORMAT` to `json` or `tsv` (only `xlsx`, `json`, and `tsv` are currently supported); see the sketch after this diff.
+ * **SGI-Bench-1.0:**
+   * **Instructions for use:** See `scieval/dataset/SGI_Bench_1_0/readme.md` for details.

### Default Judge Models
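As a quick illustration of the `PRED_FORMAT` note in the diff above, here is a minimal sketch that switches the prediction format before launching a run. It assumes the framework reads `PRED_FORMAT` from the process environment and that `python run.py --config config.json` (the entry point shown in the Quickstart) is used; `config.json` is a placeholder name.

```python
import os
import subprocess

# Minimal sketch: store predictions as json instead of the default xlsx,
# e.g. for models whose outputs exceed the xlsx cell length limit.
os.environ["PRED_FORMAT"] = "json"  # supported values: xlsx (default), json, tsv

# Launch the evaluation as in the Quickstart; the child process inherits the environment.
subprocess.run(["python", "run.py", "--config", "config.json"], check=True)
```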

docs/zh-CN/Quickstart.md

Lines changed: 3 additions & 0 deletions

@@ -193,6 +193,9 @@ python run.py --config config.json
* **SciCode:**
  * **Environment Dependencies:** Before running, download the runtime dependency file `test_data.h5` according to the [official instructions](https://github.com/scicode-bench/SciCode) and place it in the `scieval/dataset/SciCode/eval/data` directory.
  * **Evaluation Files:** By default, the framework stores the model's inference results in an `xlsx` file for easy viewing. However, for SciCode the output of some models, such as `deepseek-R1`, may exceed the `xlsx` cell length limit; in that case, set the environment variable `PRED_FORMAT` to `json` or `tsv` (only `xlsx`, `json`, and `tsv` are currently supported).
+ * **SGI-Bench-1.0:**
+   * **Usage Notes:** See `scieval/dataset/SGI_Bench_1_0/readme.md` for details.
+

### Default Judge Models

The following datasets use specific models as the Judge by default during the evaluation stage:

scieval/dataset/SGI_Bench_1_0/__init__.py

Whitespace-only changes.
Lines changed: 191 additions & 0 deletions

@@ -0,0 +1,191 @@
from typing import Any, Dict, List
from datasets import load_dataset
from ...smp import *
from ..text_base import TextBaseDataset
from ..utils.judge_util import *
from ...smp.file import dump, load, get_intermediate_file_path
from openai import OpenAI
import concurrent.futures
from tqdm import tqdm
from json_repair import repair_json
import os

def extract_final_answer(answer_with_thinking: str, start_tag='<answer>', end_tag='</answer>'):
    # Return the text between the last start_tag and its matching end_tag, or None if the tags are absent.
    answer_with_thinking = str(answer_with_thinking)
    start_index = answer_with_thinking.rfind(start_tag)
    if start_index != -1:
        end_index = answer_with_thinking.find(end_tag, start_index)
        if end_index != -1:
            return answer_with_thinking[start_index + len(start_tag):end_index].strip()
    return None

class LLM:
    def __init__(self, model='gpt-4.1', **kwargs):
        self.api_key = kwargs.get('api_key', os.environ.get('OPENAI_API_KEY'))  # export OPENAI_API_KEY="xxxxx"
        self.base_url = kwargs.get('base_url', os.environ.get('OPENAI_API_BASE'))  # export OPENAI_API_BASE="xxxxx"
        # Strip the fixed-length 17-character suffix (e.g. a trailing '/chat/completions')
        # so the OpenAI client is given the bare base URL.
        self.base_url = self.base_url[:-17]
        self.model = model
        if not self.api_key:
            raise ValueError("API key is required.")
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def __call__(self, query=None, **kwargs):
        system_prompt = kwargs.get('system_prompt', 'You are a helpful assistant.')
        max_tokens = kwargs.get('max_tokens', None)
        temperature = kwargs.get('temperature', 0)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ]

        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        assistant_response = response.choices[0].message.content
        return assistant_response

def multi_process(inp_list, function, max_workers=40):
    # Run function(**item) for every item in inp_list in a process pool,
    # returning results in the original input order.
    results = [None] * len(inp_list)
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(function, **item): index
            for index, item in enumerate(inp_list)
        }

        for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(future_to_index)):
            index = future_to_index[future]
            try:
                result = future.result()
                results[index] = result
            except Exception as e:
                print(f"Error processing item {inp_list[index]}: {str(e)}")

    return results


# LLM judge used by eval_model_output below.
judge = LLM('o4-mini')

def eval_model_output(ques_dict):
    newline = '\n'
    prompt = f"""
You are an expert in systematically validating and evaluating LLM-generated solutions. Your task is to rigorously analyze the correctness of a provided solution by comparing it step-by-step against the reference solution, and output **only** a structured verification list—with no additional text.
## Instructions
1. Break down the given LLM solution into individual steps and evaluate each one against the corresponding reference solution steps.
2. For each step, include the following three components:
- **solution_step**: The specific part of the LLM solution being evaluated.
- **reason**: A clear, critical explanation of whether the step contains errors, omissions, or deviations from the reference approach. Be stringent in your assessment.
- **judge**: Your verdict: either `"correct"` or `"incorrect"`.
3. If the final LLM answer is incorrect, you must identify at least one step in your analysis as incorrect.
4. Justify your judgments rigorously, pointing out even minor inaccuracies or logical flaws.
5. Do not attempt to answer the original question—your role is strictly to evaluate.
6. Output **only** a list of dictionaries in the exact format provided below. Do not include any other text or comments.
## Question
{ques_dict['question']}
## Reference Solution Steps
{newline.join(ques_dict['steps'])}
## Reference Answer
{ques_dict['answer']}
## LLM Solution Steps
{ques_dict['prediction']}
## LLM Answer
{extract_final_answer(ques_dict['prediction'])}
## Output Example
[
{{"solution_step": "step content", "reason": "reason of the judgement", "judge": "correct or incorrect"}},
{{"solution_step": "step content", "reason": "reason of the judgement", "judge": "correct or incorrect"}},
]
"""

    # Default so a failed judge call still yields a usable record instead of an undefined variable.
    step_level_acc = 0
    try:
        llm_judge = judge(prompt)
        # Keep only the JSON list in the judge response, repair it, and parse it.
        start_index = llm_judge.find('[')
        end_index = llm_judge.rfind(']') + 1
        llm_judge = eval(repair_json(llm_judge[start_index:end_index]))
        correct_step_count = 0
        for step in llm_judge:
            if step["judge"] == "correct":
                correct_step_count += 1
        step_level_acc = correct_step_count / len(llm_judge)
    except Exception:
        llm_judge = None

    ques_dict['exact_match'] = 1 if (
        ques_dict['answer'] == ques_dict['prediction'] or ques_dict['answer'] == extract_final_answer(
            ques_dict['prediction'])) else 0
    ques_dict['llm_judge'] = llm_judge
    ques_dict['step_level_acc'] = step_level_acc
    return ques_dict
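To make the judging protocol in the prompt above concrete, here is a small, hypothetical example of the list the judge is asked to return and how `eval_model_output` turns it into a step-level score. The step contents are invented; only the format follows the prompt's instructions.

```python
# Hypothetical judge output in the format requested by the prompt (invented content).
llm_judge = [
    {"solution_step": "Step 1. Convert all quantities to SI units.", "reason": "Matches the reference conversion.", "judge": "correct"},
    {"solution_step": "Step 2. Apply the governing equation.", "reason": "Uses the wrong temperature value.", "judge": "incorrect"},
    {"solution_step": "Step 3. Report the final value.", "reason": "Propagates the error from Step 2.", "judge": "incorrect"},
]

# Step-level accuracy is the fraction of steps judged "correct", as computed above: 1 / 3.
step_level_acc = sum(step["judge"] == "correct" for step in llm_judge) / len(llm_judge)
print(round(step_level_acc, 2))  # 0.33
```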

class SGI_Bench_Deep_Research(TextBaseDataset):
    TYPE = 'QA'

    @classmethod
    def supported_datasets(cls):
        return ["SGI-DeepResearch"]

    def load_data(self, dataset):
        hf = load_dataset("InternScience/SGI-DeepResearch", split="test")

        rows: List[Dict[str, Any]] = []
        idx = 0
        for prob in hf:
            rows.append(
                {
                    "index": idx,
                    "id": prob["idx"],
                    "question": prob["question"],
                    "steps": prob["steps"],
                    "answer": prob["answer"],
                    "discipline": prob["discipline"],
                    "direction": prob["direction"],
                    "type": prob["type"]
                }
            )
            idx += 1
        return pd.DataFrame(rows)

    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]
        question = line['question'] + """
You can reason step by step before giving the final answer. The final answer should be enclosed by <answer> and </answer>.
Example:
Step 1. ...
Step 2. ...
...
<answer>1.00</answer>
"""

        msgs = [{'type': 'text', 'value': question}]
        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)
        data = pd.DataFrame(data)

        inp_list = [{"ques_dict": item} for item in data.to_dict(orient="records")]
        out_list = multi_process(inp_list, eval_model_output, 48)
        # Drop records whose judging raised an exception so the averages below stay well defined.
        out_list = [item for item in out_list if item is not None]

        exact_match = sum([item['exact_match'] for item in out_list]) / len(out_list)
        step_level_acc = sum([item['step_level_acc'] for item in out_list]) / len(out_list)

        result = {
            'Exact Match': exact_match,
            'Step Level Acc': step_level_acc
        }

        score_file = get_intermediate_file_path(eval_file, '_score', 'json')
        result_file = get_intermediate_file_path(eval_file, '_result', 'json')
        dump(out_list, score_file)
        dump(result, result_file)
        return result
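For orientation, a hedged end-to-end sketch of how this dataset class appears to be exercised: `load_data` pulls the HuggingFace test split into a DataFrame, `build_prompt` wraps each question with the `<answer>` instruction, and `evaluate` scores a predictions file (which carries a `prediction` column, as read by `eval_model_output`). The scieval runner normally drives these calls; direct instantiation below assumes `TextBaseDataset` accepts the dataset name and populates `self.data` via `load_data`, and the file name is a placeholder.

```python
# Hedged sketch (not part of the commit): exercising the class directly.
dataset = SGI_Bench_Deep_Research("SGI-DeepResearch")

msgs = dataset.build_prompt(0)   # [{'type': 'text', 'value': '...<answer>...</answer> instructions'}]

# After inference, the eval file holds the original columns plus a `prediction` column,
# which evaluate() scores with the LLM judge defined above.
# result = dataset.evaluate("SGI-DeepResearch_predictions.xlsx")
# print(result)   # {'Exact Match': ..., 'Step Level Acc': ...}
```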
