Commit a75c289

Yunnglin authored and Jintao-Huang committed
fix evalscope config (#5899)
1 parent 0e9a394 commit a75c289

File tree

4 files changed: +9 lines, -7 lines


docs/source/Instruction/评测.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ SWIFT的eval能力使用了魔搭社区[评测框架EvalScope](https://github.co
 
 目前我们支持了**标准评测集**的评测流程,以及**用户自定义**评测集的评测流程。其中**标准评测集**由三个评测后端提供支持:
 
-下面展示所支持的数据集名称,若需了解数据集的详细信息,请参考[所有支持的数据集](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset.html)
+下面展示所支持的数据集名称,若需了解数据集的详细信息,请参考[所有支持的数据集](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/index.html)
 
 1. Native(默认):
 

docs/source_en/Instruction/Evaluation.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ SWIFT's eval capability utilizes the EvalScope evaluation framework from the Mag
 
 Currently, we support the evaluation process of **standard evaluation datasets** as well as the evaluation process of **user-defined** evaluation datasets. The **standard evaluation datasets** are supported by three evaluation backends:
 
-Below are the names of the supported datasets. For detailed information on the datasets, please refer to [all supported datasets](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
+Below are the names of the supported datasets. For detailed information on the datasets, please refer to [all supported datasets](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/index.html).
 
 1. Native (default):
 

swift/llm/eval/utils.py

Lines changed: 2 additions & 1 deletion
@@ -99,9 +99,10 @@ def collect_model_arg(name: str) -> Optional[Any]:
         # Extract required model parameters
         self.model = collect_model_arg('model')  # model path or identifier
         self.template = collect_model_arg('template')  # conversation template
+        self.max_batch_size = collect_model_arg('max_batch_size')  # maximum batch size
 
         # Initialize the inference engine with batch support
-        self.engine = PtEngine.from_model_template(self.model, self.template, max_batch_size=self.config.batch_size)
+        self.engine = PtEngine.from_model_template(self.model, self.template, max_batch_size=self.max_batch_size)
 
     def generate(
         self,
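
For context: the fix makes the custom eval model read max_batch_size from the same per-model argument mapping as model and template, instead of the unrelated self.config.batch_size. Below is a minimal sketch of that collection pattern; the build_engine_kwargs helper and the plain-dict argument layout are illustrative assumptions, not the actual SWIFT/EvalScope API.

from typing import Any, Dict, Optional

def build_engine_kwargs(model_args: Dict[str, Any]) -> Dict[str, Any]:
    # Mirror collect_model_arg: read each parameter from the caller-supplied
    # model arguments rather than from an unrelated config attribute.
    def collect_model_arg(name: str) -> Optional[Any]:
        return model_args.get(name)

    return dict(
        model=collect_model_arg('model'),                    # model path or identifier
        template=collect_model_arg('template'),              # conversation template
        max_batch_size=collect_model_arg('max_batch_size'),  # forwarded to the engine
    )

# With the fix, the batch size supplied by the caller (e.g. the trainer) is the
# value the inference engine actually receives (placeholder paths/names below).
print(build_engine_kwargs({'model': '/path/to/ckpt', 'template': 'qwen', 'max_batch_size': 8}))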

swift/trainers/mixin.py

Lines changed: 5 additions & 4 deletions
@@ -779,16 +779,17 @@ def _compute_acc(self, outputs, labels) -> None:
 
     @torch.no_grad()
     def _evalscope_eval(self):
-        from ..llm.eval.utils import EvalModel  # registry here
+        from ..llm.eval.utils import EvalModel
         from evalscope import TaskConfig, run_task
 
         self.model.eval()
-
+        # prepare task config
         task_config_kwargs = dict(
-            model=f'model-step{self.state.global_step}',
-            model_args=dict(
+            model=EvalModel(
+                model_name=f'model-step{self.state.global_step}',
                 model=self.model,
                 template=self.template,
+                max_batch_size=self.args.per_device_eval_batch_size,
             ),
             eval_type='swift_custom',
             datasets=self.args.eval_dataset,
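
Taken together, the trainer-side change passes a ready-made EvalModel instance (carrying the per-device eval batch size, which EvalModel forwards to PtEngine as max_batch_size) rather than a model name plus a separate model_args dict. A rough sketch of the resulting call path, using only names visible in this diff; the standalone wrapper function and its parameters are illustrative assumptions:

from evalscope import TaskConfig, run_task

from swift.llm.eval.utils import EvalModel

def evalscope_eval_sketch(model, template, eval_datasets, global_step, eval_batch_size):
    # Wrap the in-memory model so EvalScope's 'swift_custom' backend can drive it,
    # forwarding the per-device eval batch size as max_batch_size.
    eval_model = EvalModel(
        model_name=f'model-step{global_step}',
        model=model,
        template=template,
        max_batch_size=eval_batch_size,
    )
    task_config = TaskConfig(
        model=eval_model,
        eval_type='swift_custom',
        datasets=eval_datasets,
    )
    return run_task(task_config)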

0 commit comments
