Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ Just like a compass guides us on our journey, OpenCompass will guide you through

## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2026.02.05\]** OpenCompass now supports Intern-S1-Pro related general and scientific evaluation benchmarks. Please check [Example for Evaluating Intern-S1](examples/eval_intern_s1_pro.py) and [Model Card](https://huggingface.co/internlm/Intern-S1-Pro) for more details! 🔥🔥🔥
- **\[2026.02.05\]** OpenCompass now supports Intern-S1-Pro related general and scientific evaluation benchmarks. Please check [Example for Evaluating Intern-S1-Pro](examples/eval_intern_s1_pro.py) and [Model Card](https://huggingface.co/internlm/Intern-S1-Pro) for more details! 🔥🔥🔥
- **\[2025.12.08\]** OpenCompass now supports evaluation for SciReasoner. Please check [Example for Evaluating SciReasoner](examples/eval_scireasoner.py) and [Project GitHub Repo](https://github.com/InternScience/SciReason) for more details! 🔥🔥🔥
- **\[2025.07.26\]** OpenCompass now supports Intern-S1 related general and scientific evaluation benchmarks. Please check [Tutorial for Evaluating Intern-S1](https://opencompass.readthedocs.io/en/latest/user_guides/interns1.html) for more details! 🔥🔥🔥
- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
Expand Down
1 change: 1 addition & 0 deletions README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
## 🚀 最新进展 <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2026.02.05\]** OpenCompass 现已支持Intern-S1-Pro相关的通用及科学评测基准,请参阅[Intern-S1-Pro评测示例](examples/eval_intern_s1_pro.py)和[模型信息](https://huggingface.co/internlm/Intern-S1-Pro)了解详情!🔥🔥🔥
- **\[2025.12.08\]** OpenCompass 现已支持SciReasoner评测,请参阅[SciReasoner评测示例](examples/eval_scireasoner.py)和[原项目地址](https://github.com/InternScience/SciReason)了解详情!🔥🔥🔥
- **\[2025.07.26\]** OpenCompass 现已支持Intern-S1相关的通用及科学评测基准,请参阅[Intern-S1评测教程](https://opencompass.readthedocs.io/zh-cn/latest/user_guides/interns1.html)了解详情!🔥🔥🔥
- **\[2025.04.01\]** OpenCompass 现已支持 `CascadeEvaluator`,允许多个评估器按顺序工作,可以为更复杂的评估场景创建自定义评估流程,查看[文档](docs/zh_cn/advanced_guides/llm_judge.md)了解具体用法!🔥🔥🔥
- **\[2025.03.11\]** 现已支持 `SuperGPQA` 覆盖285 个研究生学科的知识能力评测,欢迎尝试!🔥🔥🔥
Expand Down
66 changes: 60 additions & 6 deletions examples/eval_scireasoner.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,66 @@
"""OpenCompass evaluation config for the SciReasoner benchmark suite.

Reconstructed from a diff paste that interleaved pre- and post-change lines
(duplicate dataset imports, two conflicting ``summarizer`` bodies) and lost
all indentation; this is the post-change script, properly formatted.
"""
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask


with read_base():
    # scireasoner
    # If you want to evaluate the full scireasoner dataset (more than one million samples)
    from opencompass.configs.datasets.SciReasoner.scireasoner_gen import scireasoner_full_datasets

    # If you only want to evaluate the miniset
    from opencompass.configs.datasets.SciReasoner.scireasoner_gen import scireasoner_mini_datasets

    from opencompass.configs.summarizers.scireasoner import SciReasonerSummarizer


# Collect every ``*_datasets`` list pulled into scope by read_base() above.
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

summarizer = dict(
    type=SciReasonerSummarizer,
    mini_set=False,  # When evaluating miniset, please set True
    show_details=False  # Whether you want to see the detailed results for each subset
)

# System message prepended to every dataset's prompt rounds below.
system_prompt = [
    dict(
        role='SYSTEM',
        prompt='You are a professional science expert, able to reason across science fields. You answer scientific questions by integrating theory, empirical evidence, and quantitative reasoning. Provide responses that are accurate, well-justified, and as concise as possible, and clearly distinguish established facts from assumptions, approximations, and remaining uncertainties.',
    ),
]

judge_cfg = ()  # Config your judge model here.

for item in datasets:
    # Inject the system prompt ahead of each dataset's existing rounds.
    item['infer_cfg']['prompt_template']['template']['round'] = system_prompt + item['infer_cfg']['prompt_template']['template']['round']
    # Wire the judge model into whichever evaluator layout the dataset uses.
    # NOTE(review): the elif assumes 'llm_evaluator' exists whenever the
    # top-level evaluator has no 'judge_cfg' — confirm against the dataset configs.
    if 'judge_cfg' in item['eval_cfg']['evaluator']:
        item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
    elif 'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']:
        item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg


infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)

eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask)
    ),
)


work_dir = './outputs/eval_scireasoner'


4 changes: 3 additions & 1 deletion opencompass/configs/summarizers/scireasoner.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from opencompass.summarizers.default import DefaultSummarizer
import functools
import math


def calculate_opi(scores):
Expand Down Expand Up @@ -212,6 +213,8 @@ def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics,
self.logger.warning(
f'Non-numeric metric {i[1]} for dataset {i[0]} in model {model_abbr}, setting default value 0.')
parsed_results[model_abbr][i[0]][i[1]] = 0.0
if math.isinf(parsed_results[model_abbr][i[0]][i[1]]) or math.isnan(parsed_results[model_abbr][i[0]][i[1]]):
parsed_results[model_abbr][i[0]][i[1]] = 0.0
else:
if i in parsed_results[model_abbr]:
available_metrics.append(i)
Expand Down Expand Up @@ -317,5 +320,4 @@ def _calculate_group_metrics(self, raw_results, parsed_results, dataset_metrics,
parsed_results[model_abbr].setdefault(sg['name'], {}).update(result)
dataset_metrics.setdefault(sg['name'], []).extend(group_metrics)
dataset_eval_mode[sg['name']] = eval_mode

return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
Loading