
Commit ea83f44

root committed
(dcy) add multimodal_rewardmodel
1 parent 7314bff commit ea83f44

7 files changed: +340, -57 lines


ding/reward_model/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -16,3 +16,4 @@
 # LLM/VLM reward model and verifier
 from .math_reward_model import MathRewardModel
 from .math_rule_reward_model import MathRuleRewardModel
+from .multi_modal_reward_model import MultiModalRewardModel

ding/reward_model/math_reward_model.py

Lines changed: 2 additions & 6 deletions
@@ -11,9 +11,9 @@
 @REWARD_MODEL_REGISTRY.register('math')
 class MathRewardModel(BaseRewardModel):
     config = dict(
-        # (str) The type of the reward model.
+        # The type of the reward model.
         type='math',
-        # (str) The name of the tokenizer and model
+        # The name of the tokenizer and model
         model_name='Qwen/Qwen2.5-Math-PRM-7B',
     )

@@ -23,7 +23,6 @@ def __init__(self, config: EasyDict, device: str, logger, tb_logger: 'SummaryWri
         self.logger = logger
         self.tb_logger = tb_logger

-        # Initialize the tokenizer and model
         self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.model_name, trust_remote_code=True)
         self.model = AutoModel.from_pretrained(
             self.cfg.model_name, device_map=self.device, torch_dtype=torch.bfloat16, trust_remote_code=True
@@ -99,16 +98,13 @@ def estimate(self, data: List[Dict]) -> List[Dict]:
             for messages in all_messages
         ]

-        # Batch-encode the inputs
         input_ids = self.tokenizer(
             conversation_strs, return_tensors="pt", padding=True, truncation=True
         )["input_ids"].to(self.model.device)

-        # Get the model outputs in a single batch
         with torch.no_grad():
             outputs = self.model(input_ids=input_ids)

-        # Compute step rewards for each sample
         step_sep_id = self.tokenizer.encode("<extra_0>")[0]
         token_masks = (input_ids == step_sep_id)
         batch_rewards = self.make_step_rewards(outputs[0], token_masks)
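For context, `make_step_rewards` (defined elsewhere in math_reward_model.py and untouched by this commit) turns the model outputs at the `<extra_0>` separator positions into per-step scores. A minimal sketch of how such a helper typically works for Qwen2.5-Math-PRM-style models, assuming a two-class (incorrect/correct) head; the repo's actual implementation may differ:

import torch
import torch.nn.functional as F

def make_step_rewards(logits: torch.Tensor, token_masks: torch.Tensor) -> list:
    # logits: (batch, seq_len, 2) classification scores; token_masks is True at <extra_0> positions
    probabilities = F.softmax(logits, dim=-1)
    probabilities = probabilities * token_masks.unsqueeze(-1)  # zero out non-separator tokens
    all_scores = []
    for sample in probabilities:  # (seq_len, 2)
        # keep only the separator positions and take the probability of "step is correct"
        positive_probs = sample[sample != 0].view(-1, 2)[:, 1]
        all_scores.append(positive_probs.cpu().tolist())
    return all_scores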

ding/reward_model/math_rule_reward_model.py

Lines changed: 0 additions & 1 deletion
@@ -112,7 +112,6 @@ def _process_response_answer(self, response: str) -> Tuple[Optional[float], Opti
                 if self.logger:
                     self.logger.debug(f"Error processing expression '{expr}': {e}")

-        # If all attempts fail, return None
         return None, None

     def _check_answer_match(self, pred: Optional[float], target: Optional[float]) -> bool:
ding/reward_model/multi_modal_reward_model.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
+from typing import List, Dict
+from easydict import EasyDict
+from torch.utils.tensorboard import SummaryWriter
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+from ding.utils import REWARD_MODEL_REGISTRY
+from .base_reward_model import BaseRewardModel
+
+
+@REWARD_MODEL_REGISTRY.register('multi_modal')
+class MultiModalRewardModel(BaseRewardModel):
+    config = dict(
+        type='multi_modal',
+        model_name='internlm/internlm-xcomposer2d5-7b-reward',
+        hd_num=9,  # Number of high-definition patches for image processing
+    )
+
+    def __init__(self, config: EasyDict, device: str, logger, tb_logger: 'SummaryWriter') -> None:
+        self.cfg = config
+        self.device = device
+        self.logger = logger
+        self.tb_logger = tb_logger
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.cfg.model_name, trust_remote_code=True, local_files_only=True
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.cfg.model_name, torch_dtype=torch.float16, trust_remote_code=True
+        )
+
+        self.model.tokenizer = self.tokenizer
+        self.model.cuda().eval()
+
+    def estimate(self, data: List[Dict], image: List[str], output_mode: str = 'score') -> List[Dict]:
+        """
+        Estimate rewards for multi-modal inputs using the internlm-xcomposer reward model.
+
+        Arguments:
+            data (List[Dict]): List of chat dictionaries, each containing:
+                - chat (List[Dict]): List of messages, each message is a dict with:
+                    - role (str): Either "user" or "assistant"
+                    - content (str): The message content
+            image (List[str]): List of image paths. If fewer images than chats are given, the last image is reused.
+            output_mode (str, optional): Evaluation mode. Defaults to 'score'.
+                - 'score': Return reward scores for each chat
+                - 'rank': Return ranking indices (0 is best) for all chats
+                - 'compare': Compare the first two chats (returns 1.0 for the better one, 0.0 for the worse one)
+
+        Returns:
+            List[Dict]: Results depending on output_mode:
+                - For 'score' mode:
+                    [{
+                        'reward': float,  # Reward score
+                        'metadata': {
+                            'mode': 'score',
+                            'chat_idx': int,  # Index of the chat
+                            'image_path': str  # Path of the image used
+                        }
+                    }, ...]
+                - For 'rank' mode:
+                    [{
+                        'rank': int,  # Ranking position (0 is best)
+                        'metadata': {
+                            'mode': 'rank',
+                            'chat_idx': int,
+                            'image_path': str
+                        }
+                    }, ...]
+                - For 'compare' mode:
+                    [{
+                        'reward': float,  # 1.0 for better, 0.0 for worse
+                        'metadata': {
+                            'mode': 'compare',
+                            'chat_idx': int,
+                            'image_path': str,
+                            'compared_with': int  # Index of the compared chat
+                        }
+                    }, ...]
+        """
+        # Get chat data
+        chats = [item['chat'] for item in data]
+
+        with torch.autocast(device_type='cuda', dtype=torch.float16):
+            if output_mode == 'score':
+                # Ensure each chat has a corresponding image; reuse the last image if there are not enough
+                if len(image) < len(chats):
+                    image = image + [image[-1]] * (len(chats) - len(image))
+
+                # Get scores for each chat
+                scores = []
+                for chat, img in zip(chats, image):
+                    score = self.model.get_score(chat, [img], hd_num=self.cfg.hd_num)
+                    scores.append(score)
+
+                return [
+                    {
+                        'reward': float(score),
+                        'metadata': {
+                            'mode': 'score',
+                            'chat_idx': idx,
+                            'image_path': img
+                        }
+                    } for idx, (score, img) in enumerate(zip(scores, image))
+                ]
+
+            elif output_mode == 'rank':
+                # Use the first image for ranking
+                img = image[0]
+                ranks = self.model.rank(chats, [[img]] * len(chats), hd_num=self.cfg.hd_num)
+
+                return [
+                    {
+                        'rank': int(rank),
+                        'metadata': {
+                            'mode': 'rank',
+                            'chat_idx': idx,
+                            'image_path': img
+                        }
+                    } for idx, rank in enumerate(ranks)
+                ]
+
+            elif output_mode == 'compare':
+                if len(data) < 2:
+                    raise ValueError("Compare mode requires at least 2 samples")
+
+                # Use the first image for comparison
+                img = image[0]
+                is_better = self.model.compare(chats[0], [img], chats[1], [img], hd_num=self.cfg.hd_num)
+
+                return [
+                    {
+                        'reward': 1.0 if is_better else 0.0,
+                        'metadata': {
+                            'mode': 'compare',
+                            'chat_idx': 0,
+                            'image_path': img,
+                            'compared_with': 1
+                        }
+                    }, {
+                        'reward': 0.0 if is_better else 1.0,
+                        'metadata': {
+                            'mode': 'compare',
+                            'chat_idx': 1,
+                            'image_path': img,
+                            'compared_with': 0
+                        }
+                    }
+                ]
+            else:
+                raise ValueError(f"Invalid output mode: {output_mode}")
+
+    def train(self):
+        """Training is not implemented for this reward model."""
+        self.logger.warning("Training is not implemented for this reward model")
+        pass
+
+    def collect_data(self, data: list) -> None:
+        """Data collection is not needed for this reward model."""
+        pass
+
+    def clear_data(self) -> None:
+        """Data clearing is not needed for this reward model."""
+        pass
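For reference, a minimal usage sketch of the class added above, based on its docstring; the image path, chat contents, and logger name are hypothetical placeholders, and loading the model requires a GPU since the class calls `.cuda()`:

import logging
from easydict import EasyDict
from ding.reward_model import MultiModalRewardModel

cfg = EasyDict(MultiModalRewardModel.config)
model = MultiModalRewardModel(cfg, device="cuda", logger=logging.getLogger("mm_rm"), tb_logger=None)

data = [
    {"chat": [{"role": "user", "content": "Describe the image."},
              {"role": "assistant", "content": "A cat is sitting on a red sofa."}]},
    {"chat": [{"role": "user", "content": "Describe the image."},
              {"role": "assistant", "content": "The picture shows an empty street."}]},
]
# 'score' returns one reward per chat; 'rank' and 'compare' follow the metadata formats documented above.
results = model.estimate(data, image=["./example.jpg"], output_mode="score")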

ding/reward_model/tests/test_math_reward_model.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@ def test_math_reward_model():
     # Initialize reward model
     model = MathRewardModel(cfg, "cuda" if torch.cuda.is_available() else "cpu", logger, tb_logger)

-    # Test case 1: Simple math problem
+    # Simple math problem
     data_simple = [
         {
             "system": "Please reason step by step...",
@@ -30,7 +30,7 @@ def test_math_reward_model():
         }
     ]

-    # Test case 2: Complex word problem
+    # Complex word problem
     data_complex = [
         {
             "system": "Please reason step by step, and put your final answer within \\boxed{}.",

ding/reward_model/tests/test_math_rule_reward_model.py

Lines changed: 53 additions & 48 deletions
@@ -4,6 +4,7 @@
 from easydict import EasyDict
 from ding.reward_model.math_rule_reward_model import MathRuleRewardModel

+
 @pytest.fixture
 def reward_model():
     return MathRuleRewardModel(
@@ -19,24 +20,26 @@ def reward_model():

 @pytest.mark.envtest
 def test_math_rule_reward_model_correct_answer(reward_model):
-    data_correct = [{
-        "system": "Please answer this math problem...",
-        "query": (
-            "The school now introduces a new color, silver, for the flag design. "
-            "Crestview's school colors are now purple, gold, and silver. "
-            "The students are designing a flag using three solid-colored horizontal stripes. "
-            "Using one, two, or all three of the school colors, how many different flags "
-            "are possible if adjacent stripes may be the same color?"
-        ),
-        "response": (
-            "Crestview's school colors—purple, gold, and silver—can be used to design "
-            "a flag with three horizontal stripes, where each stripe can be any of the "
-            "three colors and adjacent stripes may be the same. Since each of the three "
-            "stripes has three independent color choices, the total number of possible "
-            "flag designs is 27"
-        ),
-        "answer": r"27"
-    }]
+    data_correct = [
+        {
+            "system": "Please answer this math problem...",
+            "query": (
+                "The school now introduces a new color, silver, for the flag design. "
+                "Crestview's school colors are now purple, gold, and silver. "
+                "The students are designing a flag using three solid-colored horizontal stripes. "
+                "Using one, two, or all three of the school colors, how many different flags "
+                "are possible if adjacent stripes may be the same color?"
+            ),
+            "response": (
+                "Crestview's school colors—purple, gold, and silver—can be used to design "
+                "a flag with three horizontal stripes, where each stripe can be any of the "
+                "three colors and adjacent stripes may be the same. Since each of the three "
+                "stripes has three independent color choices, the total number of possible "
+                "flag designs is 27"
+            ),
+            "answer": r"27"
+        }
+    ]

     # Test the case with correct answer
     rewards = reward_model.estimate(data_correct)
@@ -48,26 +51,28 @@ def test_math_rule_reward_model_correct_answer(reward_model):

 @pytest.mark.envtest
 def test_math_rule_reward_model_wrong_answer(reward_model):
-    data_wrong = [{
-        "system": "Please answer this math problem...",
-        "query": (
-            "The school now introduces a new color, silver, for the flag design. "
-            "Crestview's school colors are now purple, gold, and silver. "
-            "The students are designing a flag using three solid-colored horizontal stripes. "
-            "Using one, two, or all three of the school colors, how many different flags "
-            "are possible if adjacent stripes may be the same color?"
-        ),
-        "response": (
-            r"The given point \(\left(\frac{\sqrt{3}}{2}, -\frac{1}{2}\right)\) lies on "
-            r"the unit circle, meaning its coordinates correspond to \((\cos \alpha, "
-            r"\sin \alpha)\). Since \(\cos \alpha = \frac{\sqrt{3}}{2}\) and "
-            r"\(\sin \alpha = -\frac{1}{2}\), the angle \(\alpha\) is in the "
-            r"**fourth quadrant**, where the reference angle is \(\frac{\pi}{6}\). "
-            r"Therefore, the smallest positive value of \(\alpha\) is "
-            r"\(2\pi - \frac{\pi}{6} = \frac{17\pi}{6}\)."
-        ),
-        "answer": r"\frac{11\pi}{6}"
-    }]
+    data_wrong = [
+        {
+            "system": "Please answer this math problem...",
+            "query": (
+                "The school now introduces a new color, silver, for the flag design. "
+                "Crestview's school colors are now purple, gold, and silver. "
+                "The students are designing a flag using three solid-colored horizontal stripes. "
+                "Using one, two, or all three of the school colors, how many different flags "
+                "are possible if adjacent stripes may be the same color?"
+            ),
+            "response": (
+                r"The given point \(\left(\frac{\sqrt{3}}{2}, -\frac{1}{2}\right)\) lies on "
+                r"the unit circle, meaning its coordinates correspond to \((\cos \alpha, "
+                r"\sin \alpha)\). Since \(\cos \alpha = \frac{\sqrt{3}}{2}\) and "
+                r"\(\sin \alpha = -\frac{1}{2}\), the angle \(\alpha\) is in the "
+                r"**fourth quadrant**, where the reference angle is \(\frac{\pi}{6}\). "
+                r"Therefore, the smallest positive value of \(\alpha\) is "
+                r"\(2\pi - \frac{\pi}{6} = \frac{17\pi}{6}\)."
+            ),
+            "answer": r"\frac{11\pi}{6}"
+        }
+    ]

     # Test the case with wrong answer
     rewards = reward_model.estimate(data_wrong)
@@ -79,12 +84,14 @@ def test_math_rule_reward_model_wrong_answer(reward_model):

 @pytest.mark.envtest
 def test_math_rule_reward_model_format_error(reward_model):
-    data_format_error = [{
-        "system": "Please answer this math problem...",
-        "query": "What is 2+2?",
-        "response": "The answer is four.",
-        "answer": r"4"
-    }]
+    data_format_error = [
+        {
+            "system": "Please answer this math problem...",
+            "query": "What is 2+2?",
+            "response": "The answer is four.",
+            "answer": r"4"
+        }
+    ]
     rewards_format = reward_model.estimate(data_format_error)
     assert len(rewards_format) == len(data_format_error)
     # This should be a format error because "four" cannot be processed as a numerical value
@@ -99,13 +106,11 @@ def test_math_rule_reward_model_special_expressions(reward_model):
            "query": "What is 1/2?",
            "response": r"The answer is \frac{1}{2}.",
            "answer": r"0.5"
-        },
-        {
+        }, {
            "query": "What is 50%?",
            "response": "The answer is 50%.",
            "answer": r"0.5"
-        },
-        {
+        }, {
            "query": "What is sqrt(4)?",
            "response": r"The answer is \sqrt{4} = 2.",
            "answer": r"2"
