
Commit 1347cdd

[train] fix channel_loss (qwen2.5-vl & packing) (#4941)
1 parent b66f661 commit 1347cdd

3 files changed, +16 -4 lines

examples/train/plugins/channel_loss.sh

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,8 @@
 # {"role": "user", "content": "What color do you like?"},
 # {"role": "assistant", "content": "I like blue."}
 # ]}
+
+# channel_loss is compatible with padding-free and packing.
 CUDA_VISIBLE_DEVICES=0 \
 swift sft \
     --model Qwen/Qwen2.5-0.5B-Instruct \
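
For reference, a minimal sketch of what the full command might look like with packing turned on. The flag spellings are assumed to mirror the TrainArguments fields exercised in tests/train/test_channel.py (packing, attn_impl, channels, loss_type); the dataset path and max_length value are placeholders, not part of this commit:

# Sketch only: flags assumed from the TrainArguments fields used in
# tests/train/test_channel.py; dataset and max_length are placeholders.
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model Qwen/Qwen2.5-0.5B-Instruct \
    --dataset channel.jsonl \
    --packing true \
    --attn_impl flash_attn \
    --max_length 2048 \
    --channels aaa abc \
    --loss_type channel_loss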

swift/trainers/trainers.py

Lines changed: 9 additions & 3 deletions
@@ -237,6 +237,7 @@ def prediction_step(
         **gen_kwargs,
     ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
         if not self.args.predict_with_generate or prediction_loss_only:
+            inputs['_position_ids'] = inputs.get('position_ids')
             with self.template.forward_context(self.model, inputs):
                 return super().prediction_step(
                     model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys)

@@ -277,15 +278,19 @@ def _prepare_inputs(self, inputs):
             compute_loss_func = get_loss_func('loss_scale')

         sample_channels = inputs.pop('channel', None)
-        if sample_channels is not None and self.args.channels is not None:
+        position_ids = inputs.pop('_position_ids', None)
+        if self.args.channels is not None:
+            assert sample_channels is not None, f'sample_channels: {sample_channels}'
             state = self.state
             setattr(state, 'local_step', getattr(state, 'local_step', 0))
             setattr(state, 'ch_loss_steps', getattr(state, 'ch_loss_steps', {}))

             loss_kwargs['sample_channels'] = sample_channels
             loss_kwargs['trainer'] = self
-            if inputs.get('position_ids') is not None:
-                loss_kwargs['position_ids'] = inputs['position_ids']
+            if position_ids is None:
+                position_ids = inputs.get('position_ids')
+            if position_ids is not None:
+                loss_kwargs['position_ids'] = position_ids

         use_logits_to_keep = self.get_use_logits_to_keep('labels' in inputs and self.label_smoother is None
                                                          and compute_loss_func is None)

@@ -352,5 +357,6 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
         return (loss, outputs) if return_outputs else loss

     def training_step(self, model, inputs, *args, **kwargs):
+        inputs['_position_ids'] = inputs.get('position_ids')
         with self.template.forward_context(self.model, inputs):
             return super().training_step(model, inputs, *args, **kwargs)
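
The change stashes the incoming position_ids into inputs['_position_ids'] before template.forward_context runs, and _prepare_inputs then pops the stashed copy and prefers it when filling loss_kwargs['position_ids']. The likely motivation, an assumption consistent with the commit title, is that the forward context for packed/padding-free batches (and for qwen2.5-vl in particular) may rewrite position_ids, while the channel loss still needs the original packed positions to attribute each token to the sample it came from. Below is a minimal sketch of how packed position_ids identify sample boundaries; split_packed_samples is a hypothetical helper, not part of this repository:

import torch

# Hypothetical helper: in a packed batch each sample restarts its
# position_ids at 0, so sample boundaries are the indices where
# position_ids == 0. Returns [start, end) spans per sample.
def split_packed_samples(position_ids: torch.Tensor):
    flat = position_ids.flatten()
    starts = torch.nonzero(flat == 0, as_tuple=True)[0].tolist()
    ends = starts[1:] + [flat.numel()]
    return list(zip(starts, ends))

# Example: three samples of lengths 4, 3 and 5 packed into one row.
packed = torch.tensor([[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4]])
print(split_packed_samples(packed))  # [(0, 4), (4, 7), (7, 12)]

With those spans, a channel-wise loss can sum per-token losses inside each span and bucket the result under that sample's channel.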

tests/train/test_channel.py

Lines changed: 5 additions & 1 deletion
@@ -10,8 +10,12 @@ def test_channel():
         model='Qwen/Qwen2.5-VL-7B-Instruct',
         dataset=['channel.jsonl#1000'],
         split_dataset_ratio=0.01,
+        packing=True,
+        max_length=128,
         channels=['aaa', 'abc'],
-        loss_type='channel_loss'))
+        attn_impl='flash_attn',
+        loss_type='channel_loss',
+        eval_steps=10))


 if __name__ == '__main__':
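
For context, a rough sketch of what one line of channel.jsonl could look like. The per-sample 'channel' key is implied by inputs.pop('channel', None) in the trainer and by channels=['aaa', 'abc'] above, and the messages reuse the example from examples/train/plugins/channel_loss.sh; the exact field layout is an assumption, not taken from this commit:

import json

# Sketch only: 'channel' must be one of the configured channel names;
# the messages mirror the comment in channel_loss.sh.
sample = {
    'channel': 'aaa',
    'messages': [
        {'role': 'user', 'content': 'What color do you like?'},
        {'role': 'assistant', 'content': 'I like blue.'},
    ],
}
with open('channel.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(sample, ensure_ascii=False) + '\n')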
