diff --git a/cookbooks/training_judge_model/grpo/chat_rl_dataset.py b/cookbooks/training_judge_model/grpo/chat_rl_dataset.py
index f991887a..99595fbd 100644
--- a/cookbooks/training_judge_model/grpo/chat_rl_dataset.py
+++ b/cookbooks/training_judge_model/grpo/chat_rl_dataset.py
@@ -17,48 +17,47 @@
from typing import List, Union
import datasets
+import verl.utils.torch_functional as verl_F
from omegaconf import DictConfig, ListConfig
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer
-
-import verl.utils.torch_functional as verl_F
from verl.utils.model import compute_position_id_with_mask
-# 注意:已移除 pydantic 模板类以避免 Ray pickle 序列化问题
+# Note: Removed pydantic template classes to avoid Ray pickle serialization issues
class BaseChatRLDataset(Dataset):
- """聊天强化学习数据集基类"""
+ """Base class for chat reinforcement learning dataset."""
def __init__(
self,
data_files: Union[str, List[str]],
tokenizer: PreTrainedTokenizer,
config: DictConfig,
- processor=None, # 保持向后兼容性,但不使用
- max_samples: int = -1, # 添加 max_samples 参数
+ processor=None, # Keep for backward compatibility, but not used
+ max_samples: int = -1, # Add max_samples parameter
):
- # 初始化基本属性
+ # Initialize basic attributes
self.data_files = self._normalize_data_files(data_files)
self.original_data_files = copy.deepcopy(self.data_files)
self.tokenizer = tokenizer
self.config = config
self.max_samples = max_samples
-
- # 加载配置设置
+
+ # Load configuration settings
self._load_config()
-
- # 加载和处理数据
+
+ # Load and process data
self._load_dataset()
def _normalize_data_files(self, data_files):
- """将数据文件转换为列表格式"""
+ """Convert data files to list format."""
if not isinstance(data_files, (List, ListConfig)):
data_files = [data_files]
return copy.deepcopy(data_files)
def _load_config(self):
- """加载配置参数"""
+ """Load configuration parameters."""
self.cache_dir = os.path.expanduser(self.config.get("cache_dir", "~/.cache/verl/rlhf"))
self.prompt_key = self.config.get("prompt_key", "prompt")
self.max_prompt_length = self.config.get("max_prompt_length", 1024)
@@ -66,146 +65,141 @@ def _load_config(self):
self.truncation = self.config.get("truncation", "error")
self.filter_overlong_prompts = self.config.get("filter_overlong_prompts", True)
self.num_workers = min(
- self.config.get("filter_overlong_prompts_workers", max(1, os.cpu_count() // 4)),
- os.cpu_count()
+ self.config.get("filter_overlong_prompts_workers", max(1, os.cpu_count() // 4)), os.cpu_count()
)
self.serialize_dataset = False
def _download_files(self):
- """下载文件到本地缓存"""
+ """Download files to local cache."""
from verl.utils.fs import copy_to_local
-
+
for i, file in enumerate(self.data_files):
self.data_files[i] = copy_to_local(src=file, cache_dir=self.cache_dir)
def _load_dataset(self):
- """加载和处理数据集"""
+ """Load and process dataset."""
self._download_files()
-
- # 加载parquet文件
+
+ # Load parquet files
dataframes = []
for file in self.data_files:
df = datasets.load_dataset("parquet", data_files=file)["train"]
dataframes.append(df)
-
+
self.dataframe = datasets.concatenate_datasets(dataframes)
total = len(self.dataframe)
- print(f"数据集长度: {total}")
-
- # 处理 max_samples 参数
+ print(f"Dataset length: {total}")
+
+ # Handle max_samples parameter
if self.max_samples > 0 and self.max_samples < total:
import numpy as np
+
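+            # Note: this simply takes the first max_samples rows in order (no shuffling)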
indices = np.arange(self.max_samples)
self.dataframe = self.dataframe.select(indices.tolist())
- print(f"选择了 {self.max_samples} 个样本(共 {total} 个)")
-
- # 过滤过长的提示
+ print(f"Selected {self.max_samples} samples (total: {total})")
+
+ # Filter overlong prompts
if self.filter_overlong_prompts:
self._filter_long_prompts()
def _filter_long_prompts(self):
- """过滤掉过长的提示"""
- # 提取 tokenizer 和参数到局部变量,避免 pickle 序列化问题
+ """Filter out overlong prompts."""
+ # Extract tokenizer and params to local variables to avoid pickle serialization issues
tokenizer = self.tokenizer
max_length = self.max_prompt_length
prompt_key = self.prompt_key
-
+
def is_prompt_valid(doc):
try:
- # 内联提取 prompt 逻辑,避免调用 self 方法
+ # Inline prompt extraction logic to avoid calling self methods
prompt = ""
if "input" in doc and doc["input"]:
for msg in doc["input"]:
if isinstance(msg, dict) and msg.get("role") == "user" and msg.get("content"):
prompt = msg["content"]
break
-
+
if not prompt:
- # 回退到其他字段
+ # Fallback to other fields
prompt = doc.get(prompt_key, "")
if isinstance(prompt, list) and prompt:
prompt = prompt[0].get("content", "") if isinstance(prompt[0], dict) else str(prompt[0])
-
+
if not prompt:
- return True # 如果无法提取 prompt,保留该样本
-
+ return True # Keep sample if prompt cannot be extracted
+
return len(tokenizer.encode(prompt)) <= max_length
except Exception as e:
- print(f"过滤时出错: {e}")
- return True # 出错时保留该样本
-
+ print(f"Error during filtering: {e}")
+ return True # Keep sample on error
+
original_len = len(self.dataframe)
self.dataframe = self.dataframe.filter(
is_prompt_valid,
- num_proc=1, # 使用单进程避免序列化问题
- desc=f"过滤长度超过 {max_length} tokens的提示",
+ num_proc=1, # Use single process to avoid serialization issues
+ desc=f"Filtering prompts exceeding {max_length} tokens",
)
- print(f"过滤后数据集长度: {len(self.dataframe)} (原始: {original_len})")
+ print(f"Dataset length after filtering: {len(self.dataframe)} (original: {original_len})")
def _extract_prompt(self, example):
- """从样本中提取提示"""
- # 首先尝试新的数据结构
+ """Extract prompt from example."""
+ # First try new data structure
if "input" in example and example["input"]:
for msg in example["input"]:
if msg.get("role") == "user" and msg.get("content"):
return msg["content"]
-
- # 回退到旧的数据结构
+
+ # Fallback to old data structure
prompt = example.get(self.prompt_key)
if prompt is None:
prompt = example.get("x", [])
if prompt:
return prompt[-1].get("content", "")
-
+
if isinstance(prompt, str):
- return prompt[:self.max_prompt_length]
+ return prompt[: self.max_prompt_length]
elif isinstance(prompt, list) and prompt:
return prompt[0].get("content", "") if isinstance(prompt[0], dict) else str(prompt[0])
-
+
return ""
def _build_messages(self, example: dict) -> List[dict]:
- """从样本构建聊天消息 - 子类需要重写"""
+ """Build chat messages from example - subclasses must override."""
raise NotImplementedError("Subclasses must implement _build_messages")
def _format_template(self, messages: List[dict], example: dict) -> str:
- """格式化模板 - 子类需要重写"""
+ """Format template - subclasses must override."""
raise NotImplementedError("Subclasses must implement _format_template")
def _extract_ground_truth(self, row_dict):
- """提取真实标签 - 子类需要重写"""
+ """Extract ground truth label - subclasses must override."""
raise NotImplementedError("Subclasses must implement _extract_ground_truth")
def __getitem__(self, item):
- """获取数据集中的一个项目"""
+ """Get an item from the dataset."""
row_dict = dict(self.dataframe[item])
messages = self._build_messages(row_dict)
-
- # 格式化提示
+
+ # Format prompt
raw_prompt_messages = self._format_template(messages, row_dict)
- # 尝试使用 enable_thinking 参数,如果不支持则回退
+ # Try using enable_thinking parameter, fallback if not supported
try:
raw_prompt = self.tokenizer.apply_chat_template(
- raw_prompt_messages,
- add_generation_prompt=True,
- tokenize=False,
- enable_thinking=True
+ raw_prompt_messages, add_generation_prompt=True, tokenize=False, enable_thinking=True
)
except TypeError:
- # 如果 tokenizer 不支持 enable_thinking 参数,则不使用
+ # If tokenizer doesn't support enable_thinking parameter, skip it
raw_prompt = self.tokenizer.apply_chat_template(
- raw_prompt_messages,
- add_generation_prompt=True,
- tokenize=False
+ raw_prompt_messages, add_generation_prompt=True, tokenize=False
)
-
- # 分词
+
+ # Tokenize
model_inputs = self.tokenizer(raw_prompt, return_tensors="pt", add_special_tokens=False)
input_ids = model_inputs["input_ids"]
attention_mask = model_inputs["attention_mask"]
-
- # 后处理
+
+ # Post-process
input_ids, attention_mask = verl_F.postprocess_data(
input_ids=input_ids,
attention_mask=attention_mask,
@@ -214,21 +208,21 @@ def __getitem__(self, item):
left_pad=True,
truncation=self.truncation,
)
-
- # 计算位置ID
+
+ # Compute position IDs
position_ids = compute_position_id_with_mask(attention_mask)
-
- # 准备原始提示ID
+
+ # Prepare raw prompt IDs
raw_prompt_ids = self.tokenizer.encode(raw_prompt, add_special_tokens=False)
if len(raw_prompt_ids) > self.max_prompt_length:
if self.truncation == "left":
- raw_prompt_ids = raw_prompt_ids[-self.max_prompt_length:]
+ raw_prompt_ids = raw_prompt_ids[-self.max_prompt_length :]
elif self.truncation == "right":
- raw_prompt_ids = raw_prompt_ids[:self.max_prompt_length]
+ raw_prompt_ids = raw_prompt_ids[: self.max_prompt_length]
elif self.truncation == "error":
- raise RuntimeError(f"提示长度 {len(raw_prompt_ids)} 超过 {self.max_prompt_length}")
-
- # 构建结果
+ raise RuntimeError(f"Prompt length {len(raw_prompt_ids)} exceeds {self.max_prompt_length}")
+
+ # Build result
result = {
"input_ids": input_ids[0],
"attention_mask": attention_mask[0],
@@ -239,26 +233,26 @@ def __getitem__(self, item):
"reward_model": {"ground_truth": self._extract_ground_truth(row_dict)},
"data_source": row_dict.get("source", "helpsteer2"),
}
-
+
if self.return_raw_chat:
result["raw_prompt"] = messages
-
+
return result
def __len__(self):
return len(self.dataframe)
def resume_dataset_state(self):
- """恢复数据集状态用于检查点"""
+ """Resume dataset state for checkpointing."""
self.serialize_dataset = not hasattr(self, "original_data_files")
if not self.serialize_dataset:
self.data_files = copy.deepcopy(self.original_data_files)
self._load_dataset()
else:
- print("使用旧的数据加载器检查点文件,建议从头开始训练")
+ print("Using old dataloader checkpoint file, recommend training from scratch")
def __getstate__(self):
- """获取用于序列化的状态"""
+ """Get state for serialization."""
if not self.serialize_dataset:
state = self.__dict__.copy()
if "dataframe" in state:
@@ -268,25 +262,25 @@ def __getstate__(self):
class PairwiseChatRLDataset(BaseChatRLDataset):
- """Pairwise聊天强化学习数据集"""
-
+ """Pairwise chat reinforcement learning dataset."""
+
def __init__(self, data_files, tokenizer, config, processor=None, max_samples: int = -1):
super().__init__(data_files, tokenizer, config, processor, max_samples)
- # Pairwise相关配置
- self.pairwise_response_index = self.config.get("pairwise_response_index", 0) # 选择哪个response进行训练
- print(f"使用 Pairwise 模式,选择 response index: {self.pairwise_response_index}")
+ # Pairwise related configuration
+ self.pairwise_response_index = self.config.get("pairwise_response_index", 0) # Which response to train on
+ print(f"Using Pairwise mode, selected response index: {self.pairwise_response_index}")
def _build_messages(self, example: dict) -> List[dict]:
- """从样本构建聊天消息 - Pairwise模式"""
+ """Build chat messages from example - Pairwise mode."""
messages = []
-
- # 从input字段提取用户消息
+
+ # Extract user message from input field
if "input" in example and example["input"]:
for msg in example["input"]:
if msg.get("role") == "user" and msg.get("content"):
messages.append({"role": "user", "content": msg["content"]})
-
- # Pairwise模式:选择指定的response
+
+ # Pairwise mode: select the specified response
if "output" in example and example["output"]:
if self.pairwise_response_index < len(example["output"]):
output_item = example["output"][self.pairwise_response_index]
@@ -295,17 +289,17 @@ def _build_messages(self, example: dict) -> List[dict]:
content = answer.get("content", "")
if content:
messages.append({"role": "assistant", "content": content})
-
- # 回退到原始结构
+
+ # Fallback to original structure
if len(messages) <= 1:
prompt = self._extract_prompt(example)
if prompt:
messages = [{"role": "user", "content": prompt}]
-
+
return messages
def _format_template(self, messages: List[dict], example: dict) -> str:
- """格式化pairwise模板"""
+ """Format pairwise template."""
task_desc = """You are a professional expert in response comparison.
You will be provided with a query and two different responses (A and B) to that query.
Your task is to determine which response is better by comparing their quality across multiple dimensions.
@@ -316,23 +310,23 @@ def _format_template(self, messages: List[dict], example: dict) -> str:
"Accuracy: Factual correctness and reliability of information",
"Safety: Avoiding harmful or inappropriate content",
]
-
- # 提取问题
- query = next((msg['content'] for msg in messages if msg['role'] == 'user'), '')
-
- # 获取两个回答
+
+ # Extract query
+ query = next((msg["content"] for msg in messages if msg["role"] == "user"), "")
+
+ # Get two responses
response_a = ""
response_b = ""
-
+
if "output" in example and len(example["output"]) >= 2:
response_a = example["output"][0].get("answer", {}).get("content", "")
response_b = example["output"][1].get("answer", {}).get("content", "")
-
- # 直接使用字符串格式化,避免使用 PairwiseTrainTemplate 类(防止 pickle 序列化问题)
+
+ # Use string formatting directly to avoid PairwiseTrainTemplate class (prevent pickle serialization issues)
principles_str = ""
for i, principle in enumerate(principles):
principles_str += f"{i + 1}. {principle}\n"
-
+
prompt = f"""# Task Description
{task_desc}
# Principles
@@ -351,50 +345,50 @@ def _format_template(self, messages: List[dict], example: dict) -> str:
return [{"role": "user", "content": prompt}]
def _extract_ground_truth(self, row_dict):
- """提取pairwise真实标签"""
+ """Extract pairwise ground truth label."""
try:
output_data = row_dict.get("output", [])
if output_data and len(output_data) >= 2:
- # 获取选中response的标签
+ # Get label from selected response
selected_answer = output_data[self.pairwise_response_index].get("answer", {})
if isinstance(selected_answer, dict):
label_data = selected_answer.get("label", {})
if isinstance(label_data, dict):
- # 对于pairwise,返回偏好信息
+ # For pairwise, return preference information
preference = label_data.get("preference", "")
strength = label_data.get("preference_strength", 0)
response_id = label_data.get("response_id", "")
-
+
return {
"preference": preference,
"preference_strength": strength,
"response_id": response_id,
- "task_type": "pairwise"
+ "task_type": "pairwise",
}
-
+
return ""
except:
return ""
class PointwiseChatRLDataset(BaseChatRLDataset):
- """Pointwise聊天强化学习数据集 - 用于单个回答的质量评分"""
-
+ """Pointwise chat reinforcement learning dataset - for single response quality scoring."""
+
def __init__(self, data_files, tokenizer, config, processor=None, max_samples: int = -1):
super().__init__(data_files, tokenizer, config, processor, max_samples)
- print(f"使用 Pointwise 模式")
+ print("Using Pointwise mode")
def _build_messages(self, example: dict) -> List[dict]:
- """从样本构建聊天消息 - Pointwise模式"""
+ """Build chat messages from example - Pointwise mode."""
messages = []
-
- # 从input字段提取用户消息
+
+ # Extract user message from input field
if "input" in example and example["input"]:
for msg in example["input"]:
if isinstance(msg, dict) and msg.get("role") == "user" and msg.get("content"):
messages.append({"role": "user", "content": msg["content"]})
-
- # Pointwise模式:获取第一个response
+
+ # Pointwise mode: get first response
if "output" in example and example["output"]:
output_item = example["output"][0] if isinstance(example["output"], list) else example["output"]
answer = output_item.get("answer", {}) if isinstance(output_item, dict) else {}
@@ -402,17 +396,17 @@ def _build_messages(self, example: dict) -> List[dict]:
content = answer.get("content", "")
if content:
messages.append({"role": "assistant", "content": content})
-
- # 回退到原始结构
+
+ # Fallback to original structure
if len(messages) <= 1:
prompt = self._extract_prompt(example)
if prompt:
messages = [{"role": "user", "content": prompt}]
-
+
return messages
def _format_template(self, messages: List[dict], example: dict) -> str:
- """格式化pointwise模板"""
+ """Format pointwise template."""
task_desc = """You are a professional expert in response quality evaluation.
You will be provided with a query and a response to that query.
Your task is to evaluate the quality of the response and assign a helpfulness score from 0 to 4.
@@ -426,22 +420,22 @@ def _format_template(self, messages: List[dict], example: dict) -> str:
"Relevance: How directly related the response is to the question",
"Safety: Avoiding harmful or inappropriate content",
]
-
- # 提取问题
- query = next((msg['content'] for msg in messages if msg['role'] == 'user'), '')
-
- # 获取回答
+
+ # Extract query
+ query = next((msg["content"] for msg in messages if msg["role"] == "user"), "")
+
+ # Get response
response = ""
if "output" in example and example["output"]:
output_item = example["output"][0] if isinstance(example["output"], list) else example["output"]
if isinstance(output_item, dict):
response = output_item.get("answer", {}).get("content", "")
-
- # 直接使用字符串格式化
+
+ # Use string formatting directly
principles_str = ""
for i, principle in enumerate(principles):
principles_str += f"{i + 1}. {principle}\n"
-
+
prompt = f"""# Task Description
{task_desc}
# Principles
@@ -456,7 +450,7 @@ def _format_template(self, messages: List[dict], example: dict) -> str:
return [{"role": "user", "content": prompt}]
def _extract_ground_truth(self, row_dict):
- """提取pointwise真实标签"""
+ """Extract pointwise ground truth label."""
try:
output_data = row_dict.get("output", [])
if output_data:
@@ -466,16 +460,13 @@ def _extract_ground_truth(self, row_dict):
if isinstance(answer, dict):
label_data = answer.get("label", {})
if isinstance(label_data, dict):
- # 对于pointwise,返回评分信息
+ # For pointwise, return scoring information
helpfulness = label_data.get("helpfulness", 0)
- return {
- "helpfulness": helpfulness,
- "task_type": "pointwise"
- }
-
+ return {"helpfulness": helpfulness, "task_type": "pointwise"}
+
return {"helpfulness": 0, "task_type": "pointwise"}
except:
return {"helpfulness": 0, "task_type": "pointwise"}
-# 向后兼容的别名
+# Backward compatible aliases
diff --git a/cookbooks/training_judge_model/grpo/pairwise/reward_fn.py b/cookbooks/training_judge_model/grpo/pairwise/reward_fn.py
index a98be20e..1ac348bc 100644
--- a/cookbooks/training_judge_model/grpo/pairwise/reward_fn.py
+++ b/cookbooks/training_judge_model/grpo/pairwise/reward_fn.py
@@ -1,100 +1,93 @@
-import torch
-import json
import re
-from datetime import datetime
-import os
-from collections import defaultdict
def filter_thinking_parts(text):
"""
- 过滤文本中的思考部分(用于Qwen3等支持thinking模式的模型)
-
- 支持的思考标记格式:
+ Filter thinking parts from text (for models like Qwen3 that support thinking mode).
+
+ Supported thinking tag formats:
    - <think>...</think>
"""
if not isinstance(text, str):
return text
-
- # 定义思考部分的正则表达式模式
- thinking_patterns = [
-        r'<think>.*?</think>'
- ]
-
- # 依次应用所有模式进行过滤
+
+ # Define regex patterns for thinking parts
+    thinking_patterns = [r"<think>.*?</think>"]
+
+ # Apply all patterns sequentially for filtering
filtered_text = text
for pattern in thinking_patterns:
- filtered_text = re.sub(pattern, '', filtered_text, flags=re.DOTALL | re.IGNORECASE)
-
- # 清理多余的空白字符
- filtered_text = re.sub(r'\n\s*\n', '\n\n', filtered_text) # 合并多个换行
+ filtered_text = re.sub(pattern, "", filtered_text, flags=re.DOTALL | re.IGNORECASE)
+
+ # Clean up extra whitespace
+ filtered_text = re.sub(r"\n\s*\n", "\n\n", filtered_text) # Merge multiple newlines
filtered_text = filtered_text.strip()
-
+
return filtered_text
def extract_preference_response(response_text):
"""
- 从模型回复中提取preference偏好
- 从标签中提取偏好选择
+    Extract the preference from the model response.
+    The preference choice is parsed from the dedicated tag in the output.
"""
# Handle case where response_text might not be a string
if not isinstance(response_text, str):
response_text = str(response_text)
-
- # 从标签中提取偏好
- preference_pattern = r'(.*?)'
+
+ # Extract preference from tag
+ preference_pattern = r"(.*?)"
match = re.search(preference_pattern, response_text, re.DOTALL)
-
+
if match:
preference_content = match.group(1).strip().upper()
-
- # 首先检查是否直接是A或B
- if preference_content == 'A':
- return 'A'
- elif preference_content == 'B':
- return 'B'
- elif preference_content == 'TIE':
- return 'tie'
-
- # 然后检查是否包含特定词汇但不是两者都有
- if 'A' in preference_content and 'B' not in preference_content:
- return 'A'
- elif 'B' in preference_content and 'A' not in preference_content:
- return 'B'
- elif 'TIE' in preference_content or ('A' in preference_content and 'B' in preference_content):
- return 'tie'
-
- # 如果没有找到标签,尝试从文本最后部分提取
- lines = response_text.strip().split('\n')
- for line in reversed(lines[-5:]): # 检查最后5行
+
+ # First check if it's directly A or B
+ if preference_content == "A":
+ return "A"
+ elif preference_content == "B":
+ return "B"
+ elif preference_content == "TIE":
+ return "tie"
+
+ # Then check if it contains specific words but not both
+ if "A" in preference_content and "B" not in preference_content:
+ return "A"
+ elif "B" in preference_content and "A" not in preference_content:
+ return "B"
+ elif "TIE" in preference_content or ("A" in preference_content and "B" in preference_content):
+ return "tie"
+
+ # If no tag found, try to extract from the last part of text
+ lines = response_text.strip().split("\n")
+ for line in reversed(lines[-5:]): # Check last 5 lines
line = line.strip().upper()
- if line == 'A' or 'RESPONSE A' in line or 'ANSWER A' in line:
- return 'A'
- elif line == 'B' or 'RESPONSE B' in line or 'ANSWER B' in line:
- return 'B'
- elif 'TIE' in line or 'EQUAL' in line:
- return 'tie'
-
- return 'unknown' # 如果无法提取,返回unknown
+ if line == "A" or "RESPONSE A" in line or "ANSWER A" in line:
+ return "A"
+ elif line == "B" or "RESPONSE B" in line or "ANSWER B" in line:
+ return "B"
+ elif "TIE" in line or "EQUAL" in line:
+ return "tie"
+
+ return "unknown" # Return unknown if extraction fails
def calculate_pairwise_reward(predicted_preference, true_preference, response_id):
"""
- 基于preference预测与真实偏好的匹配程度计算奖励
-
+ Calculate reward based on how well the predicted preference matches the true preference.
+
Args:
- predicted_preference: 模型预测的偏好 ('A', 'B', 'tie', 'unknown')
- true_preference: 真实偏好 ('A', 'B', 'tie')
- response_id: 当前response的ID ('A' 或 'B')
-
+ predicted_preference: Model's predicted preference ('A', 'B', 'tie', 'unknown')
+ true_preference: Ground truth preference ('A', 'B', 'tie')
+ response_id: Current response ID ('A' or 'B')
+
Returns:
- float: 奖励分数 (1.0 如果预测正确,0.0 如果预测错误)
+ float: Reward score (1.0 if prediction is correct, 0.0 if incorrect)
"""
- if true_preference is None or predicted_preference == 'unknown':
+ if true_preference is None or predicted_preference == "unknown":
return 0.0
-
- # 简化奖励逻辑:预测正确给1分,预测错误给0分
+
+ # Simplified reward logic: 1 point for correct prediction, 0 for incorrect
if predicted_preference == true_preference:
return 1.0
else:
@@ -103,79 +96,75 @@ def calculate_pairwise_reward(predicted_preference, true_preference, response_id
def compute_score(data_source, solution_str, ground_truth, extra_info=None, **kwargs):
"""
- 与naive.py兼容的compute_score函数,处理pairwise比较任务
-
- 参数:
- - data_source: 数据源类型
- - solution_str: 模型生成的回复
- - ground_truth: 真实标签(包含偏好信息)
- - extra_info: 额外信息
+ compute_score function compatible with naive.py, handles pairwise comparison tasks.
+
+ Args:
+ data_source: Data source type
+ solution_str: Model generated response
+ ground_truth: Ground truth label (contains preference information)
+ extra_info: Additional information
"""
try:
- # 先过滤掉思考部分(支持Qwen3等模型的thinking模式)
+ # First filter out thinking parts (support thinking mode for models like Qwen3)
filtered_solution = filter_thinking_parts(solution_str)
-
- # 从过滤后的solution_str中提取preference
+
+ # Extract preference from filtered solution_str
predicted_preference = extract_preference_response(filtered_solution)
-
- # 处理ground_truth - 应该包含偏好信息
+
+ # Handle ground_truth - should contain preference information
if isinstance(ground_truth, dict):
- true_preference = ground_truth.get('preference', 'tie')
- response_id = ground_truth.get('response_id', 'A')
- preference_strength = ground_truth.get('preference_strength', 0)
- task_type = ground_truth.get('task_type', 'pairwise')
+ true_preference = ground_truth.get("preference", "tie")
+ response_id = ground_truth.get("response_id", "A")
+ preference_strength = ground_truth.get("preference_strength", 0)
+ task_type = ground_truth.get("task_type", "pairwise")
else:
- # 回退处理
+ # Fallback handling
if extra_info and isinstance(extra_info, dict):
- # 尝试从extra_info中获取偏好信息
- data_mode = extra_info.get('data_mode', 'pointwise')
- if data_mode == 'pairwise':
- # 分析原始数据
- output_data = extra_info.get('output', [])
+ # Try to get preference info from extra_info
+ data_mode = extra_info.get("data_mode", "pointwise")
+ if data_mode == "pairwise":
+ # Analyze original data
+ output_data = extra_info.get("output", [])
if output_data and len(output_data) >= 2:
- # 从原始标签中推断偏好
- label_a = output_data[0].get('answer', {}).get('label', {})
- label_b = output_data[1].get('answer', {}).get('label', {})
-
- pref_a = label_a.get('overall_preference', 0)
- pref_b = label_b.get('overall_preference', 0)
-
+ # Infer preference from original labels
+ label_a = output_data[0].get("answer", {}).get("label", {})
+ label_b = output_data[1].get("answer", {}).get("label", {})
+
+ pref_a = label_a.get("overall_preference", 0)
+ pref_b = label_b.get("overall_preference", 0)
+
if pref_a > pref_b:
- true_preference = 'A'
+ true_preference = "A"
elif pref_b > pref_a:
- true_preference = 'B'
+ true_preference = "B"
else:
- true_preference = 'tie'
-
- # 假设我们在评估第一个response (A)
- response_id = 'A'
+ true_preference = "tie"
+
+ # Assume we're evaluating the first response (A)
+ response_id = "A"
preference_strength = abs(pref_a - pref_b)
- task_type = 'pairwise'
+ task_type = "pairwise"
else:
- true_preference = 'tie'
- response_id = 'A'
+ true_preference = "tie"
+ response_id = "A"
preference_strength = 0
- task_type = 'pairwise'
+ task_type = "pairwise"
else:
- # 不是pairwise任务,返回默认值
- return {
- "score": 0.0,
- "error": "Not a pairwise task",
- "data_source": data_source
- }
+ # Not a pairwise task, return default values
+ return {"score": 0.0, "error": "Not a pairwise task", "data_source": data_source}
else:
- true_preference = 'tie'
- response_id = 'A'
+ true_preference = "tie"
+ response_id = "A"
preference_strength = 0
- task_type = 'pairwise'
-
- # 计算奖励
+ task_type = "pairwise"
+
+ # Calculate reward
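+        # Note: response_id is passed through but unused by the simplified exact-match reward below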
reward = calculate_pairwise_reward(predicted_preference, true_preference, response_id)
-
- # 计算准确率
- accuracy = 1.0 if (predicted_preference == true_preference and predicted_preference != 'unknown') else 0.0
- # 返回详细信息
+ # Calculate accuracy
+ accuracy = 1.0 if (predicted_preference == true_preference and predicted_preference != "unknown") else 0.0
+
+ # Return detailed information
return {
"score": reward,
"predicted_preference": predicted_preference,
@@ -184,23 +173,18 @@ def compute_score(data_source, solution_str, ground_truth, extra_info=None, **kw
"response_id": response_id,
"preference_strength": preference_strength,
"task_type": task_type,
- "data_source": data_source
+ "data_source": data_source,
}
-
+
except Exception as e:
print(f"Error in compute_score: {e}")
- # 返回默认值
- return {
- "score": 0.0,
- "accuracy": 0.0,
- "error": str(e),
- "data_source": data_source
- }
+ # Return default values
+ return {"score": 0.0, "accuracy": 0.0, "error": str(e), "data_source": data_source}
if __name__ == "__main__":
- # 测试用例 - 模拟模型的实际输出
- model_response = '''Let me analyze both responses based on the given principles:
+ # Test cases - simulate model's actual output
+ model_response = """Let me analyze both responses based on the given principles:
1. Helpfulness: Response A provides detailed step-by-step instructions including washing, peeling, cutting, soaking, and drying. Response B only mentions cutting and frying, missing crucial preparation steps.
@@ -214,39 +198,29 @@ def compute_score(data_source, solution_str, ground_truth, extra_info=None, **kw
Response A is significantly better as it provides complete, accurate, and helpful instructions for preparing potatoes for frying.
-A'''
-
- # 测试better标签提取
+A"""
+
+    # Test preference tag extraction
extracted_pref = extract_preference_response(model_response)
- print(f"提取的偏好: {extracted_pref}")
-
- # 模拟ground_truth数据
- ground_truth = {
- "preference": "A",
- "preference_strength": 2,
- "response_id": "A",
- "task_type": "pairwise"
- }
-
- # 测试reward计算
+ print(f"Extracted preference: {extracted_pref}")
+
+ # Simulate ground_truth data
+ ground_truth = {"preference": "A", "preference_strength": 2, "response_id": "A", "task_type": "pairwise"}
+
+ # Test reward calculation
result = compute_score("helpsteer3", model_response, ground_truth)
print(f"Reward result: {result}")
-
- # 测试不同的预测结果
+
+ # Test different prediction results
test_cases = [
- ("A", "A", "A"), # 正确预测A更好,当前是A
- ("A", "A", "B"), # 正确预测A更好,当前是B
- ("B", "A", "A"), # 错误预测B更好,当前是A
- ("tie", "A", "A"), # 预测tie,真实A更好,当前是A
+ ("A", "A", "A"), # Correct prediction A is better, current is A
+ ("A", "A", "B"), # Correct prediction A is better, current is B
+ ("B", "A", "A"), # Wrong prediction B is better, current is A
+ ("tie", "A", "A"), # Predict tie, true is A better, current is A
]
-
- print("\n=== 测试不同预测结果 ===")
+
+ print("\n=== Testing different prediction results ===")
for pred, true, resp_id in test_cases:
- test_gt = {
- "preference": true,
- "preference_strength": 1,
- "response_id": resp_id,
- "task_type": "pairwise"
- }
+ test_gt = {"preference": true, "preference_strength": 1, "response_id": resp_id, "task_type": "pairwise"}
reward = calculate_pairwise_reward(pred, true, resp_id)
- print(f"预测: {pred}, 真实: {true}, Response ID: {resp_id} -> 奖励: {reward:.1f}")
\ No newline at end of file
+ print(f"Predicted: {pred}, True: {true}, Response ID: {resp_id} -> Reward: {reward:.1f}")
diff --git a/cookbooks/training_judge_model/grpo/pointwise/reward_fn.py b/cookbooks/training_judge_model/grpo/pointwise/reward_fn.py
index 98571486..c0f76b81 100644
--- a/cookbooks/training_judge_model/grpo/pointwise/reward_fn.py
+++ b/cookbooks/training_judge_model/grpo/pointwise/reward_fn.py
@@ -1,168 +1,156 @@
-import torch
-import json
-from datetime import datetime
-import os
import re
-from collections import defaultdict
def filter_thinking_parts(text):
"""
- 过滤文本中的思考部分(用于Qwen3等支持thinking模式的模型)
-
- 支持的思考标记格式:
+ Filter thinking parts from text (for models like Qwen3 that support thinking mode).
+
+ Supported thinking tag formats:
    - <think>...</think>
"""
if not isinstance(text, str):
return text
-
- # 定义思考部分的正则表达式模式
- thinking_patterns = [
-        r'<think>.*?</think>'
- ]
-
- # 依次应用所有模式进行过滤
+
+ # Define regex patterns for thinking parts
+    thinking_patterns = [r"<think>.*?</think>"]
+
+ # Apply all patterns sequentially for filtering
filtered_text = text
for pattern in thinking_patterns:
- filtered_text = re.sub(pattern, '', filtered_text, flags=re.DOTALL | re.IGNORECASE)
-
- # 清理多余的空白字符
- filtered_text = re.sub(r'\n\s*\n', '\n\n', filtered_text) # 合并多个换行
+ filtered_text = re.sub(pattern, "", filtered_text, flags=re.DOTALL | re.IGNORECASE)
+
+ # Clean up extra whitespace
+ filtered_text = re.sub(r"\n\s*\n", "\n\n", filtered_text) # Merge multiple newlines
filtered_text = filtered_text.strip()
-
+
return filtered_text
def extract_helpfulness_score(response_text):
"""
- 从模型回复中提取helpfulness评分
- 从标签中提取分数
+    Extract the helpfulness score from the model response.
+    The score is parsed from the dedicated tag in the output.
"""
# Handle case where response_text might not be a string
if not isinstance(response_text, str):
response_text = str(response_text)
-
- # 从标签中提取分数
- score_pattern = r'(.*?)'
+
+ # Extract score from tag
+ score_pattern = r"(.*?)"
match = re.search(score_pattern, response_text, re.DOTALL)
-
+
if match:
score_content = match.group(1).strip()
- # 提取其中的数字
- numbers = re.findall(r'\d+', score_content)
+ # Extract numbers from content
+ numbers = re.findall(r"\d+", score_content)
if numbers:
try:
- score = int(numbers[0]) # 取第一个数字作为分数
- if 0 <= score <= 4: # 假设分数范围是0-4
+ score = int(numbers[0]) # Take the first number as score
+ if 0 <= score <= 4: # Assume score range is 0-4
return score
except:
pass
-
- return 0 # 如果无法提取,默认为0
+
+ return 0 # Default to 0 if extraction fails
+
def calculate_helpfulness_reward(predicted_score, true_score):
"""
- 基于helpfulness预测分数与真实分数的差异计算奖励
- 差异越小,奖励越高
-
- 对于二分类场景 (true_score为0或1):
- - 预测正确(完全匹配)→ 奖励1.0
- - 预测错误 → 奖励0.0
+ Calculate reward based on the difference between predicted and true helpfulness scores.
+ Smaller difference results in higher reward.
+
+ For binary classification scenarios (true_score is 0 or 1):
+ - Correct prediction (exact match) -> Reward 1.0
+ - Wrong prediction -> Reward 0.0
"""
if true_score is None:
return 0.0
-
- # 计算差异
+
+ # Calculate difference
diff = abs(predicted_score - true_score)
-
- # 对于二分类场景(0或1),采用精确匹配
+
+ # For binary classification (0 or 1), use exact match
if true_score in [0, 1]:
return 1.0 if diff == 0 else 0.0
-
- # 对于多分类场景(0-4),采用差异计算
- # 将差异转换为奖励分数 (差异越小奖励越高)
+
+ # For multi-class scenarios (0-4), use difference calculation
+ # Convert difference to reward score (smaller difference = higher reward)
max_possible_diff = 4
normalized_diff = min(diff / max_possible_diff, 1.0)
-
- # 奖励 = 1 - 标准化差异
+
+ # Reward = 1 - normalized difference
reward = 1.0 - normalized_diff
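+    # Example: predicted=1, true=4 -> diff=3, normalized_diff=0.75, reward=0.25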
-
+
return reward
+
def compute_score(data_source, solution_str, ground_truth, extra_info=None, **kwargs):
"""
- 与naive.py兼容的compute_score函数
- 参数:
- - data_source: 数据源类型
- - solution_str: 模型生成的回复
- - ground_truth: 真实标签(从reward_model字段获取)
- - extra_info: 额外信息
+ compute_score function compatible with naive.py.
+
+ Args:
+ data_source: Data source type
+ solution_str: Model generated response
+ ground_truth: Ground truth label (obtained from reward_model field)
+ extra_info: Additional information
"""
try:
- # 先过滤掉思考部分(支持Qwen3等模型的thinking模式)
+ # First filter out thinking parts (support thinking mode for models like Qwen3)
filtered_solution = filter_thinking_parts(solution_str)
-
- # 从过滤后的solution_str中提取helpfulness分数
+
+ # Extract helpfulness score from filtered solution_str
predicted_helpfulness = extract_helpfulness_score(filtered_solution)
-
- # 处理ground_truth - 可能是数字或者字典
+
+ # Handle ground_truth - could be a number or dict
if isinstance(ground_truth, dict):
- true_helpfulness = ground_truth.get('helpfulness', 0)
+ true_helpfulness = ground_truth.get("helpfulness", 0)
elif isinstance(ground_truth, (int, float)):
true_helpfulness = int(ground_truth)
elif isinstance(ground_truth, str) and ground_truth.isdigit():
true_helpfulness = int(ground_truth)
else:
- # 如果ground_truth不可用,尝试从extra_info中获取
+ # If ground_truth is unavailable, try to get from extra_info
if extra_info and isinstance(extra_info, dict):
- output_data = extra_info.get('output', [])
+ output_data = extra_info.get("output", [])
if output_data and len(output_data) > 0:
- label_data = output_data[0].get('label', {})
- true_helpfulness = label_data.get('helpfulness', 0)
+ label_data = output_data[0].get("label", {})
+ true_helpfulness = label_data.get("helpfulness", 0)
else:
true_helpfulness = 0
else:
true_helpfulness = 0
-
- # 计算奖励
+
+ # Calculate reward
reward = calculate_helpfulness_reward(predicted_helpfulness, true_helpfulness)
-
- # 返回详细信息
+
+ # Return detailed information
return {
"score": reward,
"predicted_helpfulness": predicted_helpfulness,
"true_helpfulness": true_helpfulness,
- "data_source": data_source
+ "data_source": data_source,
}
-
+
except Exception as e:
print(f"Error in compute_score: {e}")
- # 返回默认值
- return {
- "score": 0.0,
- "error": str(e),
- "data_source": data_source
- }
+ # Return default values
+ return {"score": 0.0, "error": str(e), "data_source": data_source}
+
if __name__ == "__main__":
- # 测试用例
- test_response = '''Let me analyze this answer step by step:
+ # Test cases
+ test_response = """Let me analyze this answer step by step:
1. First, I'll check if the answer is well-structured...
4. Finally, I'll look at the overall helpfulness...
-2'''
-
+2"""
+
ground_truth = {"helpfulness": 3, "task_type": "pointwise"}
-
- # 测试 compute_score 函数
- result = compute_score(
- data_source="test",
- solution_str=test_response,
- ground_truth=ground_truth
- )
-
- print(f"Test Result:")
+
+ # Test compute_score function
+ result = compute_score(data_source="test", solution_str=test_response, ground_truth=ground_truth)
+
+ print("Test Result:")
print(f" Predicted Score: {result.get('predicted_helpfulness')}")
print(f" True Score: {result.get('true_helpfulness')}")
print(f" Reward: {result.get('score')}")
-
diff --git a/cookbooks/zero_shot_evaluation/report_generator.py b/cookbooks/zero_shot_evaluation/report_generator.py
new file mode 100644
index 00000000..d6f0c059
--- /dev/null
+++ b/cookbooks/zero_shot_evaluation/report_generator.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+"""Report generator for zero-shot evaluation results."""
+
+import asyncio
+from typing import List
+
+from cookbooks.zero_shot_evaluation.schema import (
+ ComparisonDetail,
+ OpenAIEndpoint,
+ TaskConfig,
+)
+from cookbooks.zero_shot_evaluation.zero_shot_pipeline import EvaluationResult
+from openjudge.models.openai_chat_model import OpenAIChatModel
+
+# Constants for report generation
+_NUM_WINNING_EXAMPLES_FOR_RANKING = 2
+_NUM_LOSING_EXAMPLES_FOR_RANKING = 1
+_NUM_SAMPLE_REASONS_PER_MODEL = 3
+
+
+class ReportGenerator:
+ """Generate evaluation report with parallel LLM calls."""
+
+ def __init__(
+ self,
+ judge_endpoint: OpenAIEndpoint,
+ language: str = "zh",
+ include_examples: int = 3,
+ ):
+ self.language = language
+ self.include_examples = include_examples
+ extra_params = judge_endpoint.extra_params or {}
+ self.model = OpenAIChatModel(
+ model=judge_endpoint.model,
+ api_key=judge_endpoint.api_key,
+ base_url=judge_endpoint.base_url,
+ temperature=extra_params.get("temperature", 0.3),
+ )
+
+ async def generate(
+ self,
+ task_config: TaskConfig,
+ rubrics: List[str],
+ result: EvaluationResult,
+ details: List[ComparisonDetail],
+ ) -> str:
+ """Generate complete report with parallel section generation."""
+ # Prepare context
+ ctx = self._prepare_context(task_config, rubrics, result, details)
+
+ # Generate sections in parallel
+ sections = await asyncio.gather(
+ self._gen_summary(ctx),
+ self._gen_ranking_explanation(ctx),
+ self._gen_model_analysis(ctx),
+ self._gen_examples(ctx),
+ )
+
+ # Assemble report
+ lang_title = "评估报告" if self.language == "zh" else "Evaluation Report"
+ header = f"# {lang_title}\n\n"
+ return header + "\n\n---\n\n".join(s for s in sections if s)
+
+ def _prepare_context(
+ self,
+ task_config: TaskConfig,
+ rubrics: List[str],
+ result: EvaluationResult,
+ details: List[ComparisonDetail],
+ ) -> dict:
+ """Prepare shared context for all sections."""
+ # Filter to only original order (remove swapped duplicates)
+ original_details = [d for d in details if d.order == "original"]
+
+ # Format rankings
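+        # result.rankings is expected to be a list of (model_name, win_rate) tuples, best first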
+ rankings_text = "\n".join(f"{i+1}. {name}: {rate:.1%}" for i, (name, rate) in enumerate(result.rankings))
+
+ # Format rubrics
+ rubrics_text = "\n".join(f"- {r}" for r in rubrics)
+
+ # Group details by model pair for examples
+ model_examples = {}
+ for d in original_details:
+ key = tuple(sorted([d.model_a, d.model_b]))
+ if key not in model_examples:
+ model_examples[key] = []
+ model_examples[key].append(d)
+
+ # Select representative examples (prefer those with detailed reasons)
+ selected_examples = []
+ for pair_details in model_examples.values():
+ sorted_details = sorted(pair_details, key=lambda x: len(x.reason), reverse=True)
+ selected_examples.extend(sorted_details[: self.include_examples])
+
+ return {
+ "task_description": task_config.description,
+ "scenario": task_config.scenario or "",
+ "rubrics": rubrics_text,
+ "rankings": rankings_text,
+ "win_matrix": result.win_matrix,
+ "total_queries": result.total_queries,
+ "total_comparisons": result.total_comparisons,
+ "best_model": result.best_pipeline,
+ "model_names": [name for name, _ in result.rankings],
+ "examples": selected_examples[: self.include_examples * 3],
+ "all_details": original_details, # Use deduplicated details
+ }
+
+ async def _call_llm(self, prompt: str) -> str:
+ """Call LLM with given prompt."""
+ lang_instruction = "Output in Chinese (中文)." if self.language == "zh" else "Output in English."
+ messages = [
+ {"role": "system", "content": f"You are an expert AI evaluation analyst. {lang_instruction}"},
+ {"role": "user", "content": prompt},
+ ]
+ response = await self.model.achat(messages=messages)
+ return response.content or ""
+
+ async def _gen_summary(self, ctx: dict) -> str:
+ """Generate executive summary."""
+ prompt = f"""Generate a concise executive summary for an AI model evaluation.
+
+Task: {ctx['task_description']}
+Scenario: {ctx['scenario']}
+
+Evaluation Statistics:
+- Total test queries: {ctx['total_queries']}
+- Total pairwise comparisons: {ctx['total_comparisons']}
+
+Final Rankings:
+{ctx['rankings']}
+
+Best performing model: {ctx['best_model']}
+
+Requirements:
+- Write 150-200 words
+- Include: evaluation purpose, methodology summary, key findings, winner
+- Use professional tone"""
+
+ content = await self._call_llm(prompt)
+ title = "## 执行摘要" if self.language == "zh" else "## Executive Summary"
+ return f"{title}\n\n{content}"
+
+ async def _gen_ranking_explanation(self, ctx: dict) -> str:
+ """Generate ranking explanation with evidence."""
+ # Find key examples showing why top model won/lost
+ best = ctx["best_model"]
+
+ # Best model wins: either (model_a=best and winner=model_a) or (model_b=best and winner=model_b)
+ winning_examples = [
+ d
+ for d in ctx["all_details"]
+ if (d.model_a == best and d.winner == "model_a") or (d.model_b == best and d.winner == "model_b")
+ ][:_NUM_WINNING_EXAMPLES_FOR_RANKING]
+
+ # Best model loses: either (model_a=best and winner=model_b) or (model_b=best and winner=model_a)
+ losing_examples = [
+ d
+ for d in ctx["all_details"]
+ if (d.model_a == best and d.winner == "model_b") or (d.model_b == best and d.winner == "model_a")
+ ][:_NUM_LOSING_EXAMPLES_FOR_RANKING]
+
+ examples_text = ""
+ for i, ex in enumerate(winning_examples + losing_examples, 1):
+ actual_winner = ex.model_a if ex.winner == "model_a" else ex.model_b
+ examples_text += f"""
+Example {i}:
+- Query: {ex.query[:200]}...
+- Winner: {actual_winner}
+- Reason: {ex.reason}
+"""
+
+ prompt = f"""Explain why the models are ranked this way based on the evaluation.
+
+Rankings:
+{ctx['rankings']}
+
+Evaluation Criteria:
+{ctx['rubrics']}
+
+Win Matrix (row beats column with this rate):
+{self._format_win_matrix(ctx['win_matrix'])}
+
+Key Examples:
+{examples_text}
+
+Requirements:
+- Explain why {ctx['best_model']} ranks first
+- Highlight key differences between top models
+- Reference specific evidence from examples
+- Be objective and balanced"""
+
+ content = await self._call_llm(prompt)
+ title = "## 排名解释" if self.language == "zh" else "## Ranking Explanation"
+ return f"{title}\n\n{content}"
+
+ async def _gen_model_analysis(self, ctx: dict) -> str:
+ """Generate per-model analysis."""
+ # Collect stats for each model
+ model_stats = {name: {"wins": 0, "losses": 0, "reasons": []} for name in ctx["model_names"]}
+
+ for d in ctx["all_details"]:
+ winner = d.model_a if d.winner == "model_a" else d.model_b
+ loser = d.model_b if d.winner == "model_a" else d.model_a
+ model_stats[winner]["wins"] += 1
+ model_stats[loser]["losses"] += 1
+ if d.reason:
+ model_stats[winner]["reasons"].append(f"[Win] {d.reason[:150]}")
+ model_stats[loser]["reasons"].append(f"[Loss] {d.reason[:150]}")
+
+ stats_text = ""
+ for name in ctx["model_names"]:
+ stats = model_stats[name]
+ sample_reasons = stats["reasons"][:_NUM_SAMPLE_REASONS_PER_MODEL]
+ reasons_text = "\n".join(" * " + r for r in sample_reasons)
+ stats_text += f"""
+Model: {name}
+- Wins: {stats['wins']}, Losses: {stats['losses']}
+- Sample evaluation reasons:
+{reasons_text}
+"""
+
+ prompt = f"""Analyze each model's performance in this evaluation.
+
+Task: {ctx['task_description']}
+
+Evaluation Criteria:
+{ctx['rubrics']}
+
+Model Statistics:
+{stats_text}
+
+Requirements:
+For each model, provide:
+1. Overall assessment (2-3 sentences)
+2. Key strengths (with evidence)
+3. Key weaknesses (with evidence)
+4. Improvement suggestions"""
+
+ content = await self._call_llm(prompt)
+ title = "## 模型分析" if self.language == "zh" else "## Model Analysis"
+ return f"{title}\n\n{content}"
+
+ async def _gen_examples(self, ctx: dict) -> str:
+ """Generate showcase examples."""
+ examples = ctx["examples"][: self.include_examples]
+ if not examples:
+ return ""
+
+ examples_text = ""
+ for i, ex in enumerate(examples, 1):
+ examples_text += f"""
+### Case {i}
+
+**Query:** {ex.query}
+
+**{ex.model_a}:**
+{ex.response_a[:500]}{'...' if len(ex.response_a) > 500 else ''}
+
+**{ex.model_b}:**
+{ex.response_b[:500]}{'...' if len(ex.response_b) > 500 else ''}
+
+**Winner:** {ex.model_a if ex.winner == 'model_a' else ex.model_b}
+
+**Evaluation Reason:** {ex.reason}
+"""
+
+ title = "## 典型案例" if self.language == "zh" else "## Representative Cases"
+ return f"{title}\n{examples_text}"
+
+ def _format_win_matrix(self, win_matrix: dict) -> str:
+ """Format win matrix for display."""
+ lines = []
+ for model_a, opponents in win_matrix.items():
+ for model_b, rate in opponents.items():
+ lines.append(f" {model_a} vs {model_b}: {rate:.1%}")
+ return "\n".join(lines)