Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through

## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2026.02.05\]** OpenCompass now supports Intern-S1-Pro related general and scientific evaluation benchmarks. Please check [Example for Evaluating Intern-S1](examples/eval_intern_s1_pro.py) and [Model Card](https://huggingface.co/internlm/Intern-S1-Pro) for more details! 🔥🔥🔥
- **\[2026.02.05\]** OpenCompass now supports Intern-S1-Pro related general and scientific evaluation benchmarks. Please check [Example for Evaluating Intern-S1-Pro](examples/eval_intern_s1_pro.py) and [Model Card](https://huggingface.co/internlm/Intern-S1-Pro) for more details! 🔥🔥🔥
- **\[2025.12.08\]** OpenCompass now supports evaluation for SciReasoner. Please check [Example for Evaluating SciReasoner](examples/eval_scireasoner.py) and [Project GitHub Repo](https://github.com/InternScience/SciReason) for more details! 🔥🔥🔥
- **\[2025.07.26\]** OpenCompass now supports Intern-S1 related general and scientific evaluation benchmarks. Please check [Tutorial for Evaluating Intern-S1](https://opencompass.readthedocs.io/en/latest/user_guides/interns1.html) for more details! 🔥🔥🔥
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
Expand Down
1 change: 1 addition & 0 deletions README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
## 🚀 最新进展 <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2026.02.05\]** OpenCompass 现已支持Intern-S1-Pro相关的通用及科学评测基准,请参阅[Intern-S1-Pro评测示例](examples/eval_intern_s1_pro.py)和[模型信息](https://huggingface.co/internlm/Intern-S1-Pro)了解详情!🔥🔥🔥
- **\[2025.12.08\]** OpenCompass 现已支持SciReasoner评测,请参阅[SciReasoner评测示例](examples/eval_scireasoner.py)和[原项目地址](https://github.com/InternScience/SciReason)了解详情!🔥🔥🔥
- **\[2025.07.26\]** OpenCompass 现已支持Intern-S1相关的通用及科学评测基准,请参阅[Intern-S1评测教程](https://opencompass.readthedocs.io/zh-cn/latest/user_guides/interns1.html)了解详情!🔥🔥🔥
- **\[2025.04.01\]** OpenCompass 现已支持 `CascadeEvaluator`,允许多个评估器按顺序工作,可以为更复杂的评估场景创建自定义评估流程,查看[文档](docs/zh_cn/advanced_guides/llm_judge.md)了解具体用法!🔥🔥🔥
- **\[2025.03.11\]** 现已支持 `SuperGPQA` 覆盖285 个研究生学科的知识能力评测,欢迎尝试!🔥🔥🔥
Expand Down
66 changes: 60 additions & 6 deletions examples/eval_scireasoner.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,66 @@
"""OpenCompass evaluation config for the SciReasoner benchmark suite.

Reconstructed from a diff paste that interleaved pre- and post-change lines
(duplicate dataset imports, two conflicting ``summarizer`` bodies) and lost
all indentation; this is the post-change script, properly formatted.
"""
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask


with read_base():
    # scireasoner
    # If you want to evaluate the full scireasoner dataset (more than one million samples)
    from opencompass.configs.datasets.SciReasoner.scireasoner_gen import scireasoner_full_datasets

    # If you only want to evaluate the miniset
    from opencompass.configs.datasets.SciReasoner.scireasoner_gen import scireasoner_mini_datasets

    from opencompass.configs.summarizers.scireasoner import SciReasonerSummarizer


# Collect every ``*_datasets`` list pulled into scope by read_base() above.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

summarizer = dict(
    type=SciReasonerSummarizer,
    mini_set=False,  # When evaluating miniset, please set True
    show_details=False  # Whether you want to see the detailed results for each subset
)

# System message prepended to every dataset's prompt rounds below.
system_prompt = [
    dict(
        role='SYSTEM',
        prompt='You are a professional science expert, able to reason across science fields. You answer scientific questions by integrating theory, empirical evidence, and quantitative reasoning. Provide responses that are accurate, well-justified, and as concise as possible, and clearly distinguish established facts from assumptions, approximations, and remaining uncertainties.',
    ),
]

judge_cfg = ()  # Config your judge model here.

for item in datasets:
    # Inject the system prompt ahead of each dataset's existing rounds.
    item['infer_cfg']['prompt_template']['template']['round'] = system_prompt + item['infer_cfg']['prompt_template']['template']['round']
    # Wire the judge model into whichever evaluator layout the dataset uses.
    # NOTE(review): the elif assumes 'llm_evaluator' exists whenever the
    # top-level evaluator has no 'judge_cfg' — confirm against the dataset configs.
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
    elif 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg


infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask)
    ),
)


work_dir = './outputs/eval_scireasoner'


4 changes: 3 additions & 1 deletion opencompass/configs/summarizers/scireasoner.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from opencompass.summarizers.default import DefaultSummarizer
import functools
import math


def calculate_opi(scores):
Expand Down Expand Up @@ -212,6 +213,8 @@ def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics,
self.logger.warning(
f'Non-numeric metric {i[1]} for dataset {i[0]} in model {model_abbr}, setting default value 0.')
parsed_results[model_abbr][i[0]][i[1]] = 0.0
if math.isinf(parsed_results[model_abbr][i[0]][i[1]]) or math.isnan(parsed_results[model_abbr][i[0]][i[1]]):
parsed_results[model_abbr][i[0]][i[1]] = 0.0
else:
if i in parsed_results[model_abbr]:
available_metrics.append(i)
Expand Down Expand Up @@ -317,5 +320,4 @@ def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics,
parsed_results[model_abbr].setdefault(sg['name'], {}).update(result)
dataset_metrics.setdefault(sg['name'], []).extend(group_metrics)
dataset_eval_mode[sg['name']] = eval_mode

return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
Loading