Commit 23df7f3

[template] fix vlm padding_free/logits_to_keep (#4444)

1 parent e9c3722

4 files changed: +12 -9 lines changed

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -81,7 +81,7 @@
 - 🔥padding_free: Flattens the data in a batch to avoid padding, thereby reducing memory usage and accelerating training. Defaults to False. Currently supports `swift pt/sft`.
 - Note: When using padding_free, combine it with `--attn_impl flash_attn` and "transformers>=4.44". For details, see [this PR](https://github.com/huggingface/transformers/pull/31629). (Same as packing.)
 - The supported multimodal models are the same as those supported for multimodal packing. Compared with packing, padding_free consumes no additional time or space.
-- Megatron-SWIFT uses padding_free by default, i.e. `qkv_format='thd'`.
+- Megatron-SWIFT uses padding_free by default, i.e. `qkv_format='thd'`; no additional configuration is required.
 - padding_side: The padding side when training with `batch_size>=2`. Options are 'left' and 'right'; default is 'right'. (For inference with batch_size>=2, only left padding is applied.)
 - loss_scale: Loss-weight setting for training tokens. Defaults to `'default'`, meaning all responses (including history) are weighted 1 in the cross-entropy loss, while the loss of the agent_template's `tool_response` is ignored. Options are 'default', 'last_round', 'all', 'ignore_empty_think', plus the agent-specific values 'react', 'hermes', 'qwen', 'agentflan', 'alpha_umi'. For the agent part, see [Pluginization](../Customization/插件化.md) and the [Agent documentation](./Agent支持.md).
 - 'last_round': Only compute the loss of the last-round response.
```
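To make the padding_free behavior described above concrete, here is a toy sketch (plain Python with invented token values, not ms-swift code) of what flattening a batch means: sequences are concatenated into a single row, and per-sample position_ids mark the boundaries that flash_attn's variable-length kernels (and Megatron's `qkv_format='thd'`) rely on.

```python
# Toy illustration of padding_free: instead of padding three samples to the
# longest length, their tokens are concatenated into a single row, and
# position_ids restart at 0 for each sample. Those restarts encode the
# sample boundaries, so attention never crosses from one sample into the next.
samples = [[11, 12, 13], [21, 22], [31, 32, 33, 34]]

input_ids = [tok for sample in samples for tok in sample]
position_ids = [pos for sample in samples for pos in range(len(sample))]

print(input_ids)     # [11, 12, 13, 21, 22, 31, 32, 33, 34]
print(position_ids)  # [0, 1, 2, 0, 1, 0, 1, 2, 3] -> no pad tokens needed
```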

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -82,7 +82,7 @@ Hints:
 - 🔥padding_free: Flattens the data in a batch to avoid padding, thereby reducing memory usage and accelerating training. Default is False. Currently supports `swift pt/sft`.
 - Note: When using `padding_free`, it should be combined with `--attn_impl flash_attn` and "transformers>=4.44". For details, see [this PR](https://github.com/huggingface/transformers/pull/31629). (Same as packing)
 - The supported multimodal models are the same as those supported for multimodal packing. Compared to packing, padding_free does not consume additional time or space.
-- Megatron-SWIFT uses `padding_free` by default, i.e., `qkv_format='thd'`.
+- Megatron-SWIFT uses `padding_free` by default, i.e., `qkv_format='thd'`, and no additional configuration is required.
 - padding_side: Padding side when `batch_size>=2` during training. Options are 'left' and 'right', with 'right' as the default. (For inference with batch_size>=2, only left padding is applied.)
 - loss_scale: Weight setting for the loss of training tokens. Default is `'default'`, which means that all responses (including history) are used with a weight of 1 in cross-entropy loss, and the loss from the corresponding `tool_response` in the agent_template is ignored. Possible values include: 'default', 'last_round', 'all', 'ignore_empty_think', and agent-specific options: 'react', 'hermes', 'qwen', 'agentflan', 'alpha_umi'. For more details about the agent part, please refer to [Pluginization](../Customization/Pluginization.md) and [Agent Training](./Agent-support.md).
 - 'last_round': Only calculate the loss for the last round of response.
```
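The loss_scale options documented above reduce to per-token weights on the cross-entropy loss. A toy sketch (invented tokens, not the ms-swift implementation) of how 'default' and 'last_round' differ on a two-turn dialogue:

```python
# Illustrative only (invented tokens): per-token loss weights for a
# two-turn dialogue. 'default' weights every response token 1 (history
# included); 'last_round' keeps weight only on the final response.
tokens       = ['user1', 'resp1', 'user2', 'resp2']
is_response  = [False,   True,    False,   True]
is_last_resp = [False,   False,   False,   True]

default_w    = [1.0 if flag else 0.0 for flag in is_response]
last_round_w = [1.0 if flag else 0.0 for flag in is_last_resp]

print(default_w)     # [0.0, 1.0, 0.0, 1.0]
print(last_round_w)  # [0.0, 0.0, 0.0, 1.0]
```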

swift/llm/argument/train_args.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -132,14 +132,15 @@ def _init_lazy_tokenize(self):
         logger.info(f'Setting args.lazy_tokenize: {self.lazy_tokenize}')
 
     def __post_init__(self) -> None:
-        if (self.padding_free or self.packing) and self.attn_impl != 'flash_attn':
+        if self.padding_free or self.packing:
             if self.packing:
                 feature = 'packing'
                 self.padding_free = False
             else:
                 feature = 'padding_free'
-            raise ValueError(f'The "{feature}" feature needs to be used in conjunction with "flash_attn". '
-                             'Please specify `--attn_impl flash_attn`.')
+            if self.attn_impl != 'flash_attn':
+                raise ValueError(f'The "{feature}" feature needs to be used in conjunction with "flash_attn". '
+                                 'Please specify `--attn_impl flash_attn`.')
         if self.resume_from_checkpoint:
             self.resume_from_checkpoint = to_abspath(self.resume_from_checkpoint, True)
         if self.resume_only_model:
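```

The refactor above does more than relocate the raise: `self.padding_free = False` now runs even when `attn_impl` is already 'flash_attn', so packing takes precedence over padding_free in every case, not only on the error path. A standalone sketch of the resulting rule (hypothetical helper mirroring the diff's logic):

```python
# Hypothetical standalone version of the validated precedence: when both
# flags are set, packing wins and padding_free is cleared; either feature
# still requires flash_attn.
def resolve_padding_free(padding_free: bool, packing: bool, attn_impl: str) -> bool:
    if padding_free or packing:
        if packing:
            feature = 'packing'
            padding_free = False  # packing subsumes padding_free
        else:
            feature = 'padding_free'
        if attn_impl != 'flash_attn':
            raise ValueError(f'The "{feature}" feature needs "flash_attn". '
                             'Please specify `--attn_impl flash_attn`.')
    return padding_free

# Before the fix, padding_free stayed True here; now packing takes precedence.
assert resolve_padding_free(True, True, 'flash_attn') is False
```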

swift/llm/template/base.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -117,7 +117,7 @@ def __init__(
         self.mode: Literal['pt', 'vllm', 'lmdeploy',  # infer
                            'train', 'rlhf', 'kto',  # train
                            'seq_cls', 'embedding', 'prm'] = 'pt'
-        self._packing = False
+        self._packing = self.padding_free
         self.use_megatron = False
         self._handles = []
         self._deepspeed_initialize = None
@@ -1172,7 +1172,7 @@ def pre_forward_hook(self, model: nn.Module, args, kwargs):
         old_kwargs = to_device(kwargs, model.device)
         kwargs = to_device(self._post_encode(model, old_kwargs), model.device)
         for k, v in old_kwargs.items():
-            if k in {'input_ids', 'attention_mask', 'labels', 'position_ids', 'output_hidden_states'
+            if k in {'input_ids', 'attention_mask', 'labels', 'position_ids', 'output_hidden_states', 'logits_to_keep'
                      } and k not in kwargs:
                 kwargs[k] = v
         if 'inputs_embeds' in kwargs:
```
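Context for the one-line change above: for multimodal models, `_post_encode` rebuilds the model inputs (typically producing `inputs_embeds`), and this loop restores whitelisted keys that the rebuild dropped. `logits_to_keep` is the transformers argument that restricts logit computation to the positions that actually need it, so silently dropping it forced full-vocabulary logits for every position on VLM forwards. A minimal sketch of the merge step (hypothetical helper name, same idea as the hook):

```python
# Minimal sketch of the pass-through: keys dropped by the multimodal
# _post_encode are restored from the original kwargs, now including
# logits_to_keep so the lm_head can still skip positions without a loss.
PRESERVED_KEYS = {'input_ids', 'attention_mask', 'labels', 'position_ids',
                  'output_hidden_states', 'logits_to_keep'}

def restore_dropped_kwargs(old_kwargs: dict, new_kwargs: dict) -> dict:
    for key, value in old_kwargs.items():
        if key in PRESERVED_KEYS and key not in new_kwargs:
            new_kwargs[key] = value
    return new_kwargs

merged = restore_dropped_kwargs({'logits_to_keep': 64, 'labels': [0, 1]},
                                {'inputs_embeds': 'tensor...'})
assert merged['logits_to_keep'] == 64  # no longer silently dropped
```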
```diff
@@ -1359,9 +1359,11 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
         assert self.tokenizer.pad_token_id is not None
         padding_side = self.padding_side if self.is_training else 'left'
         padding_right = padding_side == 'right'
-        packing_mode = self.use_megatron or self.padding_free or self._packing and 'position_ids' in batch[0]
+        packing_mode = self.use_megatron or self._packing
         if self.padding_free:
-            batch = self._data_flatten(batch)
+            batch[:] = self._data_flatten(batch)
+        if self._packing:
+            assert 'position_ids' in batch[0], f'batch[0]: {batch[0]}'
         res = {}
         if packing_mode:
             # only support llm
```
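Two subtleties in this hunk: `packing_mode` can be simplified because `self._packing` is now initialized from `self.padding_free` in the first hunk, and `batch[:] = self._data_flatten(batch)` overwrites the list's contents in place instead of rebinding the local name, so any code still holding a reference to the same `batch` list (such as a multimodal collator that post-processes it) also sees the flattened data. A pure-Python illustration of the rebinding difference (toy lists, not template code):

```python
# Slice assignment mutates the caller's list object; plain assignment
# only rebinds a name local to the function.
def flatten_rebind(batch):
    batch = [sum(batch, [])]  # local rebinding: invisible to the caller

def flatten_in_place(batch):
    batch[:] = [sum(batch, [])]  # mutates the shared list object

a = [[1, 2], [3]]
flatten_rebind(a)
print(a)  # [[1, 2], [3]] -> caller still sees the unflattened batch

b = [[1, 2], [3]]
flatten_in_place(b)
print(b)  # [[1, 2, 3]] -> caller sees the flattened batch
```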
