Commit d8e9868

(dcy) polish flake8, add multimodal_rewardmodel and test
1 parent 7314bff commit d8e9868

7 files changed: +359 -75 lines changed

ding/reward_model/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -13,6 +13,7 @@
 from .guided_cost_reward_model import GuidedCostRewardModel
 from .ngu_reward_model import RndNGURewardModel, EpisodicNGURewardModel
 from .icm_reward_model import ICMRewardModel
-# LLM/VLM reward model and verifier
+# LLM/VLM reward models and verifiers
 from .math_reward_model import MathRewardModel
 from .math_rule_reward_model import MathRuleRewardModel
+from .multi_modal_reward_model import MultiModalRewardModel
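With this change, MultiModalRewardModel is exposed at package level next to the existing LLM/VLM reward models, and the new file below registers it under the 'multi_modal' key. A minimal sanity check, assuming REWARD_MODEL_REGISTRY supports dict-style membership tests as elsewhere in ding.utils:

    # Quick check that the new export and the 'multi_modal' registry key are wired up.
    from ding.reward_model import MultiModalRewardModel
    from ding.utils import REWARD_MODEL_REGISTRY

    print(MultiModalRewardModel.config['model_name'])  # internlm/internlm-xcomposer2d5-7b-reward
    assert 'multi_modal' in REWARD_MODEL_REGISTRY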

ding/reward_model/math_reward_model.py

Lines changed: 13 additions & 18 deletions
@@ -49,21 +49,21 @@ def estimate(self, data: List[Dict]) -> List[Dict]:
         Estimate rewards for mathematical reasoning steps using Qwen2.5-Math-PRM-7B model.
         Arguments:
             - data (:obj:`List[Dict]`): List of dictionaries containing:
-                - system (:obj:`str`): System prompt for the model
-                - query (:obj:`str`): The mathematical query to be evaluated
-                - response (:obj:`List[str]`): List of reasoning steps
+                - system (:obj:`str`): System prompt for the model.
+                - query (:obj:`str`): The mathematical query to be evaluated.
+                - response (:obj:`List[str]`): List of reasoning steps.
         Returns:
             - reward (:obj:`List[Dict]`): List of dictionaries containing:
-                - reward (:obj:`float`): Final reward (last step reward)
+                - reward (:obj:`float`): Final reward (last step reward).
                 - metadata (:obj:`Dict`): Additional information including:
-                    - query (:obj:`str`): Original query
-                    - step_rewards (:obj:`List[float]`): Rewards for each reasoning step
-                    - num_steps (:obj:`int`): Number of reasoning steps
+                    - query (:obj:`str`): Original query.
+                    - step_rewards (:obj:`List[float]`): Rewards for each reasoning step.
+                    - num_steps (:obj:`int`): Number of reasoning steps.
         Shapes:
-            - input_ids (:obj:`torch.LongTensor`): :math:`(B, L)`, where B is batch size and L is sequence length
-            - outputs (:obj:`torch.FloatTensor`): :math:`(B, L, H)`, where H is hidden size
-            - token_masks (:obj:`torch.BoolTensor`): :math:`(B, L)`
-            - step_rewards (:obj:`List[List[float]]`): List of length B, each containing S rewards where S is num steps
+            - input_ids (:obj:`torch.LongTensor`): :math:`(B, L)`, where B is batch size and L is sequence length.
+            - outputs (:obj:`torch.FloatTensor`): :math:`(B, L, H)`, where H is hidden size.
+            - token_masks (:obj:`torch.BoolTensor`): :math:`(B, L)`.
+            - step_rewards (:obj:`List[List[float]]`): List of length B, each containing S rewards where S is num steps.
         Examples:
             >>> data = [{
             >>>     "system": "Please reason step by step...",
@@ -74,7 +74,6 @@ def estimate(self, data: List[Dict]) -> List[Dict]:
             >>> print(results[0]["reward"]) # 1.0
             >>> print(results[0]["metadata"]["step_rewards"]) # [0.8, 0.9, 1.0]
         """
-        # Process all samples in batch
        all_messages = []
        for item in data:
            messages = [
@@ -93,7 +92,6 @@ def estimate(self, data: List[Dict]) -> List[Dict]:
             ]
             all_messages.append(messages)

-        # Batch-convert to the model input format
         conversation_strs = [
             self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
             for messages in all_messages
@@ -104,24 +102,21 @@ def estimate(self, data: List[Dict]) -> List[Dict]:
             conversation_strs, return_tensors="pt", padding=True, truncation=True
         )["input_ids"].to(self.model.device)

-        # Get model outputs in batch
         with torch.no_grad():
             outputs = self.model(input_ids=input_ids)

-        # Compute step rewards for each sample
         step_sep_id = self.tokenizer.encode("<extra_0>")[0]
         token_masks = (input_ids == step_sep_id)
         batch_rewards = self.make_step_rewards(outputs[0], token_masks)

-        # Build the detailed result dictionaries
         results = []
         for item, step_rewards in zip(data, batch_rewards):
             results.append(
                 {
-                    "reward": step_rewards[-1] if step_rewards else 0.0,  # the last step's reward is used as the overall reward
+                    "reward": step_rewards[-1] if step_rewards else 0.0,
                     "metadata": {
                         "query": item['query'],
-                        "step_rewards": step_rewards,  # the reward for each step
+                        "step_rewards": step_rewards,
                         "num_steps": len(item['response']),
                     }
                 }
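For context, the documented call pattern can be exercised with a short script like the one below. It is only a sketch: the cfg fields are assumptions (the default config of MathRewardModel is not part of this hunk; the test file further down builds the model the same way), and running it requires the Qwen2.5-Math-PRM-7B weights.

    # Illustrative sketch only; cfg fields and paths are hypothetical.
    import logging
    import torch
    from easydict import EasyDict
    from torch.utils.tensorboard import SummaryWriter
    from ding.reward_model import MathRewardModel

    cfg = EasyDict(type='math', model_name='Qwen/Qwen2.5-Math-PRM-7B')  # hypothetical fields
    logger = logging.getLogger(__name__)
    tb_logger = SummaryWriter('./log/math_rm')
    model = MathRewardModel(cfg, 'cuda' if torch.cuda.is_available() else 'cpu', logger, tb_logger)

    data = [{
        'system': 'Please reason step by step, and put your final answer within \\boxed{}.',
        'query': 'What is 2 + 3 * 4?',
        'response': ['First compute 3 * 4 = 12.', 'Then 2 + 12 = 14.', 'The answer is \\boxed{14}.'],
    }]
    results = model.estimate(data)
    print(results[0]['reward'])                    # reward of the final step
    print(results[0]['metadata']['step_rewards'])  # one reward per reasoning step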

ding/reward_model/math_rule_reward_model.py

Lines changed: 5 additions & 5 deletions
@@ -135,11 +135,11 @@ def _extract_final_answer(self, text: str) -> Optional[str]:
         """
         Extract the final answer from text.
         Supports various formats:
-        1. "The answer is X"
-        2. "Therefore, X is the answer"
-        3. "X" (if only one number)
-        4. "\\boxed{X}"
-        5. "= X" (expression after equals sign)
+        1. "The answer is X".
+        2. "Therefore, X is the answer".
+        3. "X" (if only one number).
+        4. "\\boxed{X}".
+        5. "= X" (expression after equals sign).
         6. Last LaTeX expression like \\frac{a}{b}, \\sqrt{x}, etc.
         """
         # Try to extract boxed content
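Only the boxed-content branch of the extraction is visible in this hunk. As a rough illustration of how formats 1 and 4 from the list above could be matched, here is a minimal standalone sketch; the helper name, patterns, and fallback order are assumptions rather than the module's actual implementation.

    # Hypothetical helper illustrating formats 1 and 4 above; not the real code.
    import re
    from typing import Optional

    def extract_answer_sketch(text: str) -> Optional[str]:
        # Format 4: \boxed{X}
        boxed = re.search(r'\\boxed\{([^{}]+)\}', text)
        if boxed:
            return boxed.group(1).strip()
        # Format 1: "The answer is X"
        stated = re.search(r'[Tt]he answer is\s+([^\s.,]+)', text)
        if stated:
            return stated.group(1).strip()
        return None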

ding/reward_model/multi_modal_reward_model.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
+from typing import List, Dict
+from easydict import EasyDict
+from torch.utils.tensorboard import SummaryWriter
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+from ding.utils import REWARD_MODEL_REGISTRY
+from .base_reward_model import BaseRewardModel
+
+
+@REWARD_MODEL_REGISTRY.register('multi_modal')
+class MultiModalRewardModel(BaseRewardModel):
+    config = dict(
+        type='multi_modal',
+        model_name='internlm/internlm-xcomposer2d5-7b-reward',
+        hd_num=9,  # Number of high-definition patches for image processing
+    )
+
+    def __init__(self, config: EasyDict, device: str, logger, tb_logger: 'SummaryWriter') -> None:
+        self.cfg = config
+        self.device = device
+        self.logger = logger
+        self.tb_logger = tb_logger
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.cfg.model_name, trust_remote_code=True, local_files_only=True
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.cfg.model_name, torch_dtype=torch.float16, trust_remote_code=True
+        )
+
+        self.model.tokenizer = self.tokenizer
+        self.model.cuda().eval()
+
+    def estimate(self, data: List[Dict], image: List[str], output_mode: str = 'score') -> List[Dict]:
+        """
+        Estimate rewards for multi-modal inputs using internlm-xcomposer model.
+
+        Arguments:
+            data (List[Dict]): List of chat dictionaries, each containing:
+                - chat (List[Dict]): List of messages, each message is a dict with:
+                    - role (str): Either "user" or "assistant"
+                    - content (str): The message content
+            image (List[str]): List of image paths. If fewer images than chats, last image will be reused
+            output_mode (str, optional): Evaluation mode. Defaults to 'score'.
+                - 'score': Return reward scores for each chat
+                - 'rank': Return ranking indices (0 is best) for all chats
+                - 'compare': Compare first two chats (returns 1.0 for better, 0.0 for worse)
+
+        Returns:
+            List[Dict]: Results depending on output_mode:
+                - For 'score' mode:
+                    [{
+                        'reward': float,  # Reward score
+                        'metadata': {
+                            'mode': 'score',
+                            'chat_idx': int,  # Index of the chat
+                            'image_path': str  # Path of the image used
+                        }
+                    }, ...]
+                - For 'rank' mode:
+                    [{
+                        'rank': int,  # Ranking position (0 is best)
+                        'metadata': {
+                            'mode': 'rank',
+                            'chat_idx': int,
+                            'image_path': str
+                        }
+                    }, ...]
+                - For 'compare' mode:
+                    [{
+                        'reward': float,  # 1.0 for better, 0.0 for worse
+                        'metadata': {
+                            'mode': 'compare',
+                            'chat_idx': int,
+                            'image_path': str,
+                            'compared_with': int  # Index of the compared chat
+                        }
+                    }, ...]
+        """
+        # Get chat data
+        chats = [item['chat'] for item in data]
+
+        with torch.autocast(device_type='cuda', dtype=torch.float16):
+            if output_mode == 'score':
+                # Ensure each chat has a corresponding image, use the last image if not enough
+                if len(image) < len(chats):
+                    image = image + [image[-1]] * (len(chats) - len(image))
+
+                # Get scores for each chat
+                scores = []
+                for chat, img in zip(chats, image):
+                    score = self.model.get_score(chat, [img], hd_num=self.cfg.hd_num)
+                    scores.append(score)
+
+                return [
+                    {
+                        'reward': float(score),
+                        'metadata': {
+                            'mode': 'score',
+                            'chat_idx': idx,
+                            'image_path': img
+                        }
+                    } for idx, (score, img) in enumerate(zip(scores, image))
+                ]
+
+            elif output_mode == 'rank':
+                # Use the first image for ranking
+                img = image[0]
+                ranks = self.model.rank(chats, [[img]] * len(chats), hd_num=self.cfg.hd_num)
+
+                return [
+                    {
+                        'rank': int(rank),
+                        'metadata': {
+                            'mode': 'rank',
+                            'chat_idx': idx,
+                            'image_path': img
+                        }
+                    } for idx, rank in enumerate(ranks)
+                ]
+
+            elif output_mode == 'compare':
+                if len(data) < 2:
+                    raise ValueError("Compare mode requires at least 2 samples")
+
+                # Use the first image for comparison
+                img = image[0]
+                is_better = self.model.compare(chats[0], [img], chats[1], [img], hd_num=self.cfg.hd_num)
+
+                return [
+                    {
+                        'reward': 1.0 if is_better else 0.0,
+                        'metadata': {
+                            'mode': 'compare',
+                            'chat_idx': 0,
+                            'image_path': img,
+                            'compared_with': 1
+                        }
+                    }, {
+                        'reward': 0.0 if is_better else 1.0,
+                        'metadata': {
+                            'mode': 'compare',
+                            'chat_idx': 1,
+                            'image_path': img,
+                            'compared_with': 0
+                        }
+                    }
+                ]
+            else:
+                raise ValueError(f"Invalid output mode: {output_mode}")
+
+    def train(self):
+        """Training is not implemented for this reward model"""
+        self.logger.warning("Training is not implemented for this reward model")
+        pass
+
+    def collect_data(self, data: list) -> None:
+        """Data collection is not needed for this reward model"""
+        pass
+
+    def clear_data(self) -> None:
+        """Data clearing is not needed for this reward model"""
+        pass
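The sketch below shows how the new class could be exercised end to end, based only on the signatures and docstring above. It is illustrative, not part of the commit: the chat contents, image path, and logger setup are hypothetical, and actually running it needs a CUDA device plus a local copy of the internlm/internlm-xcomposer2d5-7b-reward weights (the constructor loads the tokenizer with local_files_only=True).

    # Hypothetical usage example; paths and chat content are made up for illustration.
    import logging
    from easydict import EasyDict
    from torch.utils.tensorboard import SummaryWriter
    from ding.reward_model import MultiModalRewardModel

    cfg = EasyDict(MultiModalRewardModel.config)
    logger = logging.getLogger(__name__)
    tb_logger = SummaryWriter('./log/multi_modal_rm')
    rm = MultiModalRewardModel(cfg, 'cuda', logger, tb_logger)

    data = [
        {'chat': [
            {'role': 'user', 'content': 'What is shown in the image?'},
            {'role': 'assistant', 'content': 'A cat sitting on a windowsill.'},
        ]},
        {'chat': [
            {'role': 'user', 'content': 'What is shown in the image?'},
            {'role': 'assistant', 'content': 'A picture of the ocean at sunset.'},
        ]},
    ]
    images = ['./examples/cat.jpg']  # fewer images than chats: the last one is reused

    scores = rm.estimate(data, images, output_mode='score')        # one reward per chat
    ranking = rm.estimate(data, images, output_mode='rank')        # 0 is the best chat
    comparison = rm.estimate(data, images, output_mode='compare')  # first two chats only
    print(scores[0]['reward'], ranking[0]['rank'], comparison[0]['reward'])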

ding/reward_model/tests/test_math_reward_model.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ def test_math_reward_model():
     # Initialize reward model
     model = MathRewardModel(cfg, "cuda" if torch.cuda.is_available() else "cpu", logger, tb_logger)

-    # Test case 1: Simple math problem
+    # Simple math problem
     data_simple = [
         {
             "system": "Please reason step by step...",
@@ -30,7 +30,7 @@ def test_math_reward_model():
         }
     ]

-    # Test case 2: Complex word problem
+    # Complex word problem
     data_complex = [
         {
             "system": "Please reason step by step, and put your final answer within \\boxed{}.",
