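Fix multimodal padding_free prediction_step (#4839)

Rename the template hook training_step_context to forward_context and enter it in prediction_step as well as training_step (base trainer, DPO trainer, GKD trainer). The example streaming clients now skip chunks that are None.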

Jintao-Huang · web-flow · commit 43dab1a25361 · 2025-07-06T22:22:54.000+08:00
diff --git a/examples/deploy/agent/client.py b/examples/deploy/agent/client.py
@@ -55,6 +55,8 @@ def infer_stream(client, model: str, messages, tools):
     response = ''
     print(f'query: {query}\nresponse: ', end='')
     for chunk in gen:
+        if chunk is None:
+            continue
         delta = chunk.choices[0].delta.content
         response += delta
         print(delta, end='', flush=True)
@@ -68,6 +70,8 @@ def infer_stream(client, model: str, messages, tools):
         model=model, messages=messages, tools=tools, max_tokens=512, temperature=0, stream=True)
     print(f'query: {query}\nresponse2: ', end='')
     for chunk in gen:
+        if chunk is None:
+            continue
         print(chunk.choices[0].delta.content, end='', flush=True)
     print()
 
diff --git a/examples/deploy/client/llm/chat/openai_client.py b/examples/deploy/client/llm/chat/openai_client.py
@@ -20,6 +20,8 @@ def infer_stream(client, model: str, messages):
     gen = client.chat.completions.create(model=model, messages=messages, stream=True, temperature=0)
     print(f'messages: {messages}\nresponse: ', end='')
     for chunk in gen:
+        if chunk is None:
+            continue
         print(chunk.choices[0].delta.content, end='', flush=True)
     print()
 
diff --git a/examples/deploy/client/mllm/openai_client.py b/examples/deploy/client/mllm/openai_client.py
@@ -21,6 +21,8 @@ def infer_stream(client, model: str, messages):
     gen = client.chat.completions.create(model=model, messages=messages, stream=True, temperature=0)
     print(f'messages: {messages}\nresponse: ', end='')
     for chunk in gen:
+        if chunk is None:
+            continue
         print(chunk.choices[0].delta.content, end='', flush=True)
     print()
 
diff --git a/swift/llm/template/base.py b/swift/llm/template/base.py
@@ -309,7 +309,7 @@ def _extend_tokens(input_ids: List[int], labels: Optional[List[int]], replace_id
             added_tokens_len += token_len - 1
         return input_ids, labels
 
-    def training_step_context(self, model, inputs):
+    def forward_context(self, model, inputs):
         return nullcontext()
 
     @staticmethod
diff --git a/swift/llm/template/template/internvl.py b/swift/llm/template/template/internvl.py
@@ -56,14 +56,14 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         encoded['pixel_values'] = pixel_values
         return encoded
 
-    def training_step_context(self, model, inputs):
+    def forward_context(self, model, inputs):
         model_name = model.language_model.__class__.__name__.lower()
         if self._packing and 'internlm2' in model_name:
             position_ids = inputs['position_ids']
             modeling_module = model.language_model.model.layers[0].attention.__class__
             return self._patch_flash_attention_forward(modeling_module, position_ids, use_new_func=True)
         else:
-            return super().training_step_context(model, inputs)
+            return super().forward_context(model, inputs)
 
     def _post_encode(self, model: nn.Module, inputs: Dict[str, Any]) -> Dict[str, Any]:
         embedding = model.get_input_embeddings()
diff --git a/swift/llm/template/template/qwen.py b/swift/llm/template/template/qwen.py
@@ -297,9 +297,9 @@ def _get_new_tokens(i):
         encoded['labels'] = labels
         return encoded
 
-    def training_step_context(self, model, inputs):
+    def forward_context(self, model, inputs):
         if 'real_position_ids' not in inputs:
-            return super().training_step_context(model, inputs)
+            return super().forward_context(model, inputs)
         if self.version == 'v2':
             from transformers.models.qwen2_vl import modeling_qwen2_vl as modeling_module
         elif self.version == 'v2_5':
diff --git a/swift/trainers/rlhf_trainer/dpo_trainer.py b/swift/trainers/rlhf_trainer/dpo_trainer.py
@@ -125,5 +125,10 @@ def get_per_token_logps(
 
     def training_step(self, model, inputs, *args, **kwargs):
         inputs['_position_ids'] = inputs.get('position_ids')
-        with self.template.training_step_context(self.model, inputs):
+        with self.template.forward_context(self.model, inputs):
             return super().training_step(model, inputs, *args, **kwargs)
+
+    def prediction_step(self, model, inputs, *args, **kwargs):
+        inputs['_position_ids'] = inputs.get('position_ids')
+        with self.template.forward_context(self.model, inputs):
+            return super().prediction_step(model, inputs, *args, **kwargs)
diff --git a/swift/trainers/rlhf_trainer/gkd_trainer.py b/swift/trainers/rlhf_trainer/gkd_trainer.py
@@ -148,6 +148,10 @@ def training_step(self,
             inputs['attention_mask'] = new_attention_mask
             inputs['labels'] = new_labels
 
-        with self.template.training_step_context(self.model, inputs):
+        with self.template.forward_context(self.model, inputs):
             loss = HFSFTTrainer.training_step(self, model, inputs, num_items_in_batch)
         return loss
+
+    def prediction_step(self, model, inputs, *args, **kwargs):
+        with self.template.forward_context(self.model, inputs):
+            return super().prediction_step(model, inputs, *args, **kwargs)
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py
@@ -237,8 +237,9 @@ def prediction_step(
         **gen_kwargs,
     ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
         if not self.args.predict_with_generate or prediction_loss_only:
-            return super().prediction_step(
-                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys)
+            with self.template.forward_context(self.model, inputs):
+                return super().prediction_step(
+                    model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys)
         from swift.llm import RequestConfig, InferRequest
         data_list = inputs['_data']
         labels_list = [InferRequest.remove_response(data['messages']) for data in data_list]
@@ -340,5 +341,5 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
         return (loss, outputs) if return_outputs else loss
 
     def training_step(self, model, inputs, *args, **kwargs):
-        with self.template.training_step_context(self.model, inputs):
+        with self.template.forward_context(self.model, inputs):
             return super().training_step(model, inputs, *args, **kwargs)