Commit 3f770db

add the desc of batch eval

1 parent 9f16ab4 commit 3f770db

2 files changed: +75 -56 lines

docs/en/notes/guide/pipelines/EvalPipeline.md

Lines changed: 38 additions & 26 deletions
@@ -30,8 +30,6 @@ dataflow eval init
 dataflow eval api / dataflow eval local
 ```
 
-
-
 ## Step 1: Install Evaluation Environment
 
 Download evaluation environment
@@ -41,17 +39,13 @@ pip install -e .[eval]
 cd ..
 ```
 
-
-
 ## Step 2: Create and Enter DataFlow Working Directory
 
 ```bash
 mkdir workspace
 cd workspace
 ```
 
-
-
 ## Step 3: Prepare Evaluation Data and Initialize Configuration Files
 
 Initialize configuration files
@@ -66,8 +60,6 @@ Project Root/
 └── eval_local.py  # Configuration file for local model evaluator
 ```
 
-
-
 ## Step 4: Prepare Evaluation Data
 
 ### Method 1: JSON Format
@@ -100,10 +92,8 @@ EVALUATOR_RUN_CONFIG = {
 }
 ```
 
-
-
 ## Step 5: Configure Parameters
-
+### Model Parameter Configuration
 If you want to use a local model as the evaluator, please modify the parameters in the `eval_local.py` file.
 
 If you want to use an API model as the evaluator, please modify the parameters in the `eval_api.py` file.
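For the data prepared in Step 4, the benchmark files referenced below presumably hold question/answer pairs under the `input` and `output` keys, the same names the field mapping above and `BENCH_CONFIG` below use. A minimal sketch of producing such a file, assuming a plain JSON list (the exact schema is not shown in this diff):

```python
import json

# Illustrative QA records. The field names follow the mapping used by
# BENCH_CONFIG ("question_key": "input", "reference_answer_key": "output");
# the records themselves are made up for this sketch.
samples = [
    {"input": "What is 2 + 2?", "output": "4"},
    {"input": "Name the capital of France.", "output": "Paris"},
]

# Assumes the data file is a single JSON list; adjust if your pipeline
# expects JSON Lines instead.
with open("qa.json", "w", encoding="utf-8") as f:
    json.dump(samples, f, ensure_ascii=False, indent=2)
```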
@@ -122,24 +112,46 @@ TARGET_MODELS = [
 
     # 3. Custom configuration
     # Add more models...
-    # {
-    #     "name": "llama_8b",
-    #     "path": "meta-llama/Llama-3-8B-Instruct",
-    #     "tensor_parallel_size": 2,
-    #     "max_tokens": 2048,
-    #     "gpu_memory_utilization": 0.9,
-
-    # # You can customize prompts for each model. If not specified, defaults to the template in build_prompt function.
-    # # Default prompt for evaluated models
-    # # IMPORTANT: This is the prompt for models being evaluated, NOT for the judge model!!!
-    #     "answer_prompt": """please answer the questions:
-    # question:{question}
-    # answer:"""
-    # }
+
+    {
+        "name": "qwen_7b",                    # Model name
+        "path": "./Qwen2.5-7B-Instruct",      # Model path
+        # Different models can use different parameters
+        "vllm_tensor_parallel_size": 4,       # Number of GPUs
+        "vllm_temperature": 0.1,              # Randomness; higher values give more random output
+        "vllm_top_p": 0.9,                    # Top-p (nucleus) sampling threshold
+        "vllm_max_tokens": 2048,              # Maximum number of generated tokens
+        "vllm_repetition_penalty": 1.0,       # Repetition penalty; values above 1 suppress repetition
+        "vllm_seed": None,                    # Random seed; set it for reproducible results
+        "vllm_gpu_memory_utilization": 0.9,   # Maximum GPU memory utilization
+        # A custom prompt can be defined for each model
+        "answer_prompt": """please answer the following question:"""
+    }
+
+
 ]
 ```
 
-
+### Bench Parameter Configuration
+Batch configuration of benchmarks is supported:
+```python
+BENCH_CONFIG = [
+    {
+        "name": "bench_name",                      # Benchmark name
+        "input_file": "path_to_your_qa/qa.json",   # Data file
+        "question_key": "input",                   # Question field name
+        "reference_answer_key": "output",          # Reference answer field name
+        "output_dir": "path/bench_name",           # Output directory
+    },
+    {
+        "name": "other_bench_name",
+        "input_file": "path_to_your_qa/other_qa.json",
+        "question_key": "input",
+        "reference_answer_key": "output",
+        "output_dir": "path/other_bench_name",
+    },
+]
+```
 
 ## Step 6: Run Evaluation
 
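Before moving on to Step 6, it can be worth sanity-checking the benchmark entries against the data files. The sketch below is an optional convenience, not part of DataFlow; it assumes `BENCH_CONFIG` is importable from the generated `eval_local.py` (`eval_api.py` works the same way) and that each input file is a JSON list:

```python
import json
import os

# Assumption: BENCH_CONFIG can be imported from the generated eval_local.py.
from eval_local import BENCH_CONFIG

for bench in BENCH_CONFIG:
    assert os.path.isfile(bench["input_file"]), f"missing file: {bench['input_file']}"
    with open(bench["input_file"], encoding="utf-8") as f:
        records = json.load(f)  # assumes a JSON list of records
    for record in records:
        # Every record must expose the configured question and answer fields.
        assert bench["question_key"] in record, f"{bench['name']}: missing question field"
        assert bench["reference_answer_key"] in record, f"{bench['name']}: missing answer field"
    print(f"{bench['name']}: {len(records)} records look usable")
```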
docs/zh/notes/guide/pipelines/EvalPipeline.md

Lines changed: 37 additions & 30 deletions
@@ -40,17 +40,13 @@ pip install -e .[eval]
 cd ..
 ```
 
-
-
 ## Step 2: Create and Enter the DataFlow Working Directory
 
 ```bash
 mkdir workspace
 cd workspace
 ```
 
-
-
 ## Step 3: Prepare Evaluation Data and Initialize Configuration Files
 
 Initialize the configuration files
@@ -67,8 +63,6 @@ dataflow eval init
 └── eval_local.py  # Configuration file for the local-model evaluator
 ```
 
-
-
 ## Step 4: Prepare Evaluation Data
 
 ### Method 1:
@@ -90,8 +84,6 @@ dataflow eval init
 
 `output` is the reference answer
 
-
-
 ### Method 2:
 
 You can also leave the data unprocessed (it must still contain explicit question and reference-answer fields) and map the field names through the configuration in eval_api.py and eval_local.py
@@ -104,16 +96,14 @@ EVALUATOR_RUN_CONFIG = {
 }
 ```
 
-
-
 ## Step 5: Configure Parameters
+### Model Parameter Configuration
 
 If you want to use a local model as the evaluator, please modify the parameters in the `eval_local.py` file
 
 If you want to use an API model as the evaluator, please modify the parameters in the `eval_api.py` file
 
-```bash
-Target Models Configuration (same as API mode)
+```python
 
 TARGET_MODELS = [
     # All usage patterns are shown below
@@ -124,28 +114,45 @@ TARGET_MODELS = [
     # "Qwen/Qwen2.5-7B-Instruct"
     # 3. Individual configuration
     # Add more models...
-    # {
-    #     "name": "llama_8b",
-    #     "path": "meta-llama/Llama-3-8B-Instruct",
-    #     "tensor_parallel_size": 2
-    #     "max_tokens": 2048,
-    #     "gpu_memory_utilization": 0.9,
-    # A custom prompt can be defined for each model; if not set, it defaults to the template, i.e. the prompt in the build_prompt function
-    # Default prompt for the evaluated model
-    # Reminder: this prompt is for the model being evaluated; do not confuse it with the evaluator model's prompt!!!
-    # You can customize prompts for each model. If not specified, defaults to the template in build_prompt function.
-    # Default prompt for evaluated models
-    # IMPORTANT: This is the prompt for models being evaluated, NOT for the judge model!!!
-    #     "answer_prompt": """please answer the questions:
-    # question:{question}
-    # answer:"""
-    # ""
-    # }
-    #
+    {
+        "name": "qwen_7b",                    # Model name
+        "path": "./Qwen2.5-7B-Instruct",      # Model path
+        # Different models can use different parameters
+        "vllm_tensor_parallel_size": 4,       # Number of GPUs
+        "vllm_temperature": 0.1,              # Randomness; higher values give more random output
+        "vllm_top_p": 0.9,                    # Nucleus-sampling threshold bounding the cumulative probability of candidate tokens
+        "vllm_max_tokens": 2048,              # Maximum number of generated tokens
+        "vllm_repetition_penalty": 1.0,       # Repetition penalty; values above 1 suppress repetition
+        "vllm_seed": None,                    # Random seed; set it for reproducible results
+        "vllm_gpu_memory_utilization": 0.9,   # Maximum GPU memory utilization
+        # A custom prompt can be defined for each model
+        "answer_prompt": """please answer the following question:"""  # Answer prompt template
+    }
 
 ]
 ```
 
+### Bench Parameter Configuration
+Batch benchmark evaluation is supported:
+```python
+BENCH_CONFIG = [
+    {
+        "name": "bench_name",                      # Benchmark name
+        "input_file": "path_to_your_qa/qa.json",   # Data file
+        "question_key": "input",                   # Question field name
+        "reference_answer_key": "output",          # Answer field name
+        "output_dir": "path/bench_name",           # Output directory
+    },
+    {
+        "name": "other_bench_name",
+        "input_file": "path_to_your_qa/other_qa.json",
+        "question_key": "input",
+        "reference_answer_key": "output",
+        "output_dir": "path/other_bench_name",
+    },
+]
+
+```
 
 
 ## Step 6: Run Evaluation
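The batch evaluation this commit documents amounts to running every target model against every configured benchmark. A rough sketch of that pattern, assuming the `TARGET_MODELS` and `BENCH_CONFIG` lists from the generated `eval_local.py` and using a hypothetical `run_single_eval` placeholder (the actual entry point is `dataflow eval api` / `dataflow eval local`):

```python
import itertools
import os

# Assumption: both lists come from the generated config file.
from eval_local import BENCH_CONFIG, TARGET_MODELS

def run_single_eval(model_cfg, bench_cfg):
    # Hypothetical placeholder for evaluating one model on one benchmark;
    # in practice the DataFlow CLI drives this step.
    name = model_cfg["name"] if isinstance(model_cfg, dict) else model_cfg
    print(f"evaluating {name} on {bench_cfg['name']}")

for model_cfg, bench_cfg in itertools.product(TARGET_MODELS, BENCH_CONFIG):
    # Each benchmark writes its results under its own output directory.
    os.makedirs(bench_cfg["output_dir"], exist_ok=True)
    run_single_eval(model_cfg, bench_cfg)
```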
