
Commit 92c4f57

[RM] support margin & update doc (#4817)
* reward modeling document
* support margin
* add margin to standard keys
* pop margin
* margin wip
* rm_encode
* data collator
* convert margin to float
* fix template inputs from dict
* convert to tensor in data_collator
* revert rm mode
* fix judge
* doc
* padding_free & liger check
1 parent f9afe2d commit 92c4f57

File tree: 9 files changed (+59, -6 lines)


docs/source/Customization/自定义数据集.md

Lines changed: 2 additions & 0 deletions
@@ -69,6 +69,8 @@ alpaca format:
 {"messages": [{"role": "system", "content": "You are a useful and harmless math calculator"}, {"role": "user", "content": "What is 1 + 1?"}, {"role": "assistant", "content": "It equals 2"}, {"role": "user", "content": "What about adding 1?"}, {"role": "assistant", "content": "It equals 3"}], "rejected_response": "I don't know"}
 ```
 
+> Note: RM additionally supports a margin column; see the [RM documentation](../Instruction/人类对齐.md#rm).
+
 #### KTO
 
 ```jsonl

docs/source/Instruction/人类对齐.md

Lines changed: 12 additions & 0 deletions
@@ -39,6 +39,18 @@ The Reward Modeling stage of RLHF
 
 The weights of the added value head will be saved in the `value_head.safetensors` or `value_head.bin` file.
 
+The RM loss function is as follows:
+
+$
+\text{loss} = -\log \sigma \left( r^{(c)} - r^{(r)} - m \right) + \lambda \left( r^{(c)} + r^{(r)} \right)^2
+$
+
+- $r^{(c)}$: the model's score for the chosen response
+- $r^{(r)}$: the model's score for the rejected response
+- $\lambda$: coefficient of the L2 regularization term, which encourages the model outputs to stay close to 0; set with the `center_rewards_coefficient` parameter, from [the paper](https://arxiv.org/pdf/2307.09288); defaults to 0
+- $m$: margin term, which encourages the model to separate samples of different difficulty; requires a `margin` column in the dataset; defaults to 0; from [the paper](https://arxiv.org/pdf/2307.09288)
+
 See the training script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/rm.sh).
 
 ## PPO

docs/source_en/Customization/Custom-dataset.md

Lines changed: 2 additions & 0 deletions
@@ -70,6 +70,8 @@ The following outlines the standard dataset format for ms-swift, where the "system"
 {"messages": [{"role": "system", "content": "You are a useful and harmless math calculator"}, {"role": "user", "content": "What is 1 + 1?"}, {"role": "assistant", "content": "It equals 2"}, {"role": "user", "content": "What about adding 1?"}, {"role": "assistant", "content": "It equals 3"}], "rejected_response": "I don't know"}
 ```
 
+> Note: RM additionally supports the margin column. For details, refer to the [RM documentation](../Instruction/RLHF.md#rm).
+
 #### KTO
 
 ```jsonl
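
For context on the new column: a minimal sketch of what an RM sample carrying the optional `margin` value could look like, written as Python that appends one JSONL line. The field names follow the standard format above; the concrete text, file name, and margin value are made up for illustration.

```python
import json

# Hypothetical RM sample: the chosen answer is the last assistant message,
# the rejected answer goes in "rejected_response", and "margin" is an
# optional float consumed by the RM loss (omit the key if unavailable).
row = {
    "messages": [
        {"role": "user", "content": "What is 1 + 1?"},
        {"role": "assistant", "content": "It equals 2"},
    ],
    "rejected_response": "I don't know",
    "margin": 0.5,  # made-up value
}

with open("rm_train.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(row, ensure_ascii=False) + "\n")
```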

docs/source_en/Instruction/RLHF.md

Lines changed: 12 additions & 0 deletions
@@ -38,6 +38,18 @@ Use the base model or instruct model trained with SFT as the foundation model. A
 
 The weights of the added value head will be saved in `value_head.safetensors` or `value_head.bin`.
 
+The loss function for reward modeling is as follows:
+
+$
+\text{loss} = -\log \sigma \left( r^{(c)} - r^{(r)} - m \right) + \lambda \left( r^{(c)} + r^{(r)} \right)^2
+$
+
+- $r^{(c)}$: The score assigned by the model to the chosen response.
+- $r^{(r)}$: The score assigned by the model to the rejected response.
+- $\lambda$: L2 regularization coefficient that encourages the model outputs to be close to zero. It is set by the parameter `center_rewards_coefficient`, as described in [the paper](https://arxiv.org/pdf/2307.09288), and defaults to 0.
+- $m$: Margin term that encourages the model to distinguish between samples of different difficulty levels. The dataset needs to provide a `margin` column for this; by default, it is 0. This term is also introduced in [the paper](https://arxiv.org/pdf/2307.09288).
+
 Reference the training script [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/rlhf/rm.sh).
 
 ## PPO
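
To make the formula above concrete, here is a minimal PyTorch sketch of the pairwise RM loss with an optional margin and the centering term. The function name, shapes, and example values are illustrative assumptions, not the trainer's exact implementation.

```python
from typing import Optional

import torch
import torch.nn.functional as F


def rm_loss(rewards_chosen: torch.Tensor,
            rewards_rejected: torch.Tensor,
            margin: Optional[torch.Tensor] = None,
            center_rewards_coefficient: Optional[float] = None) -> torch.Tensor:
    # -log sigmoid(r_c - r_r - m), averaged over the batch
    diff = rewards_chosen - rewards_rejected
    if margin is not None:
        diff = diff - margin
    loss = -F.logsigmoid(diff).mean()
    # optional centering term: lambda * (r_c + r_r)^2
    if center_rewards_coefficient is not None:
        loss = loss + center_rewards_coefficient * (rewards_chosen + rewards_rejected).pow(2).mean()
    return loss


# Made-up reward scores for a batch of two preference pairs:
r_c = torch.tensor([1.2, 0.3])
r_r = torch.tensor([0.4, 0.1])
print(rm_loss(r_c, r_r, margin=torch.tensor([0.5, 0.0]), center_rewards_coefficient=0.01))
```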

swift/llm/argument/rlhf_args.py

Lines changed: 11 additions & 1 deletion
@@ -126,6 +126,7 @@ def __post_init__(self):
         self._set_default()
         self._init_external_vllm()
         super().__post_init__()
+        self._check_padding_free()
         self._check_grpo()
         self._external_vllm_warning()
 

@@ -261,10 +262,12 @@ def _check_grpo(self):
                 raise ValueError('Liger loss does not support two-sided GRPO loss yet.')
             if self.sequence_parallel_size > 1:
                 raise ValueError('Liger loss does not support sequence parallel yet.')
+            if self.padding_free:
+                raise ValueError('Liger loss does not support padding free yet.')
+
             from trl.import_utils import is_liger_kernel_available
             assert is_liger_kernel_available(), (
                 'Please install/update liger-kernel by running: pip install -U liger-kernel')
-
         if self.vllm_mode == 'server':
             assert not self.use_vllm or self.vllm_server_host is not None
 

@@ -333,3 +336,10 @@ def _deprecated_warning(self):
         if self.gc_collect_after_offload:
             logger.warning(
                 "The parameter 'gc_collect_after_offload' has been deprecated and will be removed in version 3.7. ")
+
+    def _check_padding_free(self):
+        if self.padding_free:
+            supported_types = ['grpo', 'dpo', 'gkd']
+            if self.rlhf_type not in supported_types:
+                raise NotImplementedError(f"The current rlhf_type '{self.rlhf_type}' does not support padding_free. "
+                                          'Please set --padding_free to false.')
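
In practice, the new check means `--padding_free` is only accepted for the grpo, dpo, and gkd RLHF types; for reward modeling it has to stay false. A standalone sketch of that rule follows (a stand-in function mirroring the diff, not the real `RLHFArguments` method).

```python
SUPPORTED_PADDING_FREE = ['grpo', 'dpo', 'gkd']  # from the diff above


def check_padding_free(rlhf_type: str, padding_free: bool) -> None:
    # Reject padding_free for rlhf types that do not support it yet.
    if padding_free and rlhf_type not in SUPPORTED_PADDING_FREE:
        raise NotImplementedError(f"The current rlhf_type '{rlhf_type}' does not support padding_free. "
                                  'Please set --padding_free to false.')


check_padding_free('dpo', padding_free=True)   # passes
# check_padding_free('rm', padding_free=True)  # would raise NotImplementedError
```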

swift/llm/dataset/preprocessor/core.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 
 class RowPreprocessor:
     standard_keys = [
-        'messages', 'rejected_response', 'label', 'images', 'videos', 'audios', 'tools', 'objects', 'channel'
+        'messages', 'rejected_response', 'label', 'images', 'videos', 'audios', 'tools', 'objects', 'channel', 'margin'
     ]
 
     def __init__(self,

swift/llm/template/base.py

Lines changed: 13 additions & 1 deletion
@@ -320,6 +320,7 @@ def get_base_model(model):
         return model
 
     def _rlhf_encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
+        margin = inputs.margin
        chosen_inputs, rejected_inputs = inputs, deepcopy(inputs)
        assert chosen_inputs.rejected_response is not None, f'inputs: {inputs}'
        rejected_inputs.messages[-1]['content'] = chosen_inputs.rejected_response

@@ -331,6 +332,8 @@ def _rlhf_encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
             data = locals()[f'{prefix}_encoded']
             for k, v in data.items():
                 encoded[f'{prefix}_{k}'] = v
+        if margin:
+            encoded['margin'] = float(margin)
         return encoded
 
     def _kto_encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:

@@ -1391,7 +1394,14 @@ def _rlhf_data_collator(self,
         new_batch = []
         for prefix in [chosen_prefix, rejected_prefix]:
             new_batch += self._fetch_inputs_startswith(batch, prefix)
-        return self._data_collator(new_batch, padding_to=padding_to)
+        res = self._data_collator(new_batch, padding_to=padding_to)
+
+        # reward modeling
+        margin = [b['margin'] for b in batch if b.get('margin') is not None]
+        if margin:
+            res['margin'] = torch.tensor(margin, dtype=torch.float)
+
+        return res
 
     def _kto_data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
         new_batch = self._fetch_inputs_startswith(batch, 'chosen_')

@@ -1532,12 +1542,14 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
         inputs_embeds = [b['inputs_embeds'] for b in batch if b.get('inputs_embeds') is not None]
         input_ids = [b['input_ids'] for b in batch if b.get('input_ids') is not None]
         channel = [b['channel'] for b in batch if b.get('channel') is not None]
+
         if inputs_embeds:
             res['inputs_embeds'] = inputs_embeds
         if input_ids:
             res['input_ids'] = input_ids
         if channel:
             res['channel'] = channel
+
         for key in ['labels', 'loss_scale', 'position_ids', 'token_type_ids']:
             val = [b[key] for b in batch if b.get(key) is not None]
             if val:
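
The collator change above only attaches a `margin` tensor when at least one sample in the batch carries a margin. Below is a standalone sketch of that gathering step with made-up batch dicts; it is not the template's full collator.

```python
from typing import Any, Dict, List, Optional

import torch


def collect_margin(batch: List[Dict[str, Any]]) -> Optional[torch.Tensor]:
    # Keep only samples that actually carry a margin and stack them as floats.
    margin = [b['margin'] for b in batch if b.get('margin') is not None]
    return torch.tensor(margin, dtype=torch.float) if margin else None


print(collect_margin([{'margin': 0.5}, {'margin': 1.0}]))  # tensor([0.5000, 1.0000])
print(collect_margin([{}, {}]))                            # None (no margin column in the data)
```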

swift/llm/template/template_inputs.py

Lines changed: 3 additions & 1 deletion
@@ -110,6 +110,8 @@ class StdTemplateInputs:
     videos: List[str] = field(default_factory=list)
     objects: Dict[str, List[Any]] = field(default_factory=dict)
 
+    margin: Optional[float] = None  # for reward modeling
+
     def __post_init__(self):
         self.image_idx = 0
         self.audio_idx = 0

@@ -135,7 +137,7 @@ def is_multimodal(self):
     @classmethod
     def from_dict(cls, inputs: Dict[str, Any]) -> Tuple['StdTemplateInputs', Dict[str, Any]]:
         kwargs = {}
-        for key in ['rejected_response', 'label', 'channel']:
+        for key in ['rejected_response', 'label', 'channel', 'margin']:
             if key in inputs:
                 kwargs[key] = inputs[key]
         messages = inputs['messages']

swift/trainers/rlhf_trainer/reward_trainer.py

Lines changed: 3 additions & 2 deletions
@@ -24,12 +24,13 @@ def compute_loss(self,
                      return_outputs=False,
                      num_items_in_batch=None) -> Union[torch.Tensor, Tuple[torch.Tensor, Dict[str, torch.Tensor]]]:
         inputs.pop('labels', None)  # not use
+        margin = inputs.pop('margin', None)
         attention_mask = inputs['attention_mask']
         batch_size = attention_mask.shape[0] // 2
         rewards = model(**inputs).logits
         rewards_chosen, rewards_rejected = torch.split(rewards, batch_size, dim=0)
-        if 'margin' in inputs:
-            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - inputs['margin']).mean()
+        if margin is not None:
+            loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected - margin).mean()
         else:
             loss = -nn.functional.logsigmoid(rewards_chosen - rewards_rejected).mean()
         if self.args.center_rewards_coefficient is not None:
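
The trainer relies on `_rlhf_data_collator` stacking all chosen inputs first and all rejected inputs second, so a single forward pass produces `2 * batch_size` reward logits that are split down the middle. A toy illustration of that split with made-up logits (no model involved):

```python
import torch

# Four logits for a batch of two preference pairs: chosen samples first, then rejected.
rewards = torch.tensor([[1.2], [0.3], [0.4], [0.1]])
batch_size = rewards.shape[0] // 2
rewards_chosen, rewards_rejected = torch.split(rewards, batch_size, dim=0)
print(rewards_chosen.flatten())    # tensor([1.2000, 0.3000])
print(rewards_rejected.flatten())  # tensor([0.4000, 0.1000])
```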
