[sft] support DFT (#5355)

hjh0119 · web-flow · commit ce426e1f85e1 · 2025-08-13T10:06:10.000+08:00
* lint

* update readme & optimize target_probs compute

* fix scripts

* position argument

* compatible with sp
diff --git a/README.md b/README.md
@@ -75,6 +75,7 @@ You can contact us and communicate with us by adding our group:
 
 
 ## 🎉 News
+- 🎁 2025.08.12: Support [Dynamic Fine-Tuning](https://arxiv.org/abs/2508.05629) (DFT) in SFT training; enable it with the parameter `--enable_dft_loss true`. Training scripts can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/dft.sh).
 - 🎁 2025.07.12: Deployment(pt/vLLM/SGLang) of Embedding models is supported, check [here](examples/deploy/embedding/client.py).
 - 🎁 2025.07.09: Megatron-SWIFT supports LoRA training. Compared to ms-swift, it achieves significant speedup on MoE models. Training scripts can be found [here](https://github.com/modelscope/ms-swift/blob/main/examples/train/megatron/lora).
 - 🎁 2025.06.23: Fine-tuning of reranker models is supported. Training scripts can be found here: [Reranker](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh).
diff --git a/README_CN.md b/README_CN.md
@@ -71,6 +71,7 @@
 - **模型量化**：支持AWQ、GPTQ、FP8和BNB的量化导出，导出的模型支持使用vLLM/SGLang/LmDeploy推理加速，并支持继续训练。
 
 ## 🎉 新闻
+- 🎁 2025.08.12: 支持在SFT训练中使用[Dynamic Fine-Tuning](https://arxiv.org/abs/2508.05629)(DFT)，使用参数 `--enable_dft_loss true`。训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/dft.sh)。
 - 🎁 2025.07.12: 支持部署Embedding模型的部署(pt/vLLM/SGLang), 查看[这里](examples/deploy/embedding/client.py).
 - 🎁 2025.07.09: Megatron-SWIFT支持LoRA训练。相比ms-swift，在MoE模型提速显著。训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/megatron/lora)。
 - 🎁 2025.06.23: 支持Reranker模型训练，训练脚本参考[这里](https://github.com/modelscope/ms-swift/blob/main/examples/train/reranker/train_reranker.sh)。
diff --git a/docs/source/Instruction/命令行参数.md b/docs/source/Instruction/命令行参数.md
@@ -165,6 +165,7 @@
 - logging_steps: 日志打印间隔，默认为5。
 - router_aux_loss_coef: 用于moe模型训练时，设置 aux_loss 的权重，默认为`0.`。
   - 注意：在"ms-swift==3.7.0"，其默认为None，从config.json中读取，该行为在"ms-swift>=3.7.1"被修改。
+- enable_dft_loss: 是否在SFT训练中使用[DFT](https://arxiv.org/abs/2508.05629) (Dynamic Fine-Tuning) loss，默认为False。
 - logging_dir: tensorboard日志路径。默认为None，即设置为`f'{self.output_dir}/runs'`。
 - predict_with_generate: 验证时使用生成式的方式，默认为False。
 - metric_for_best_model: 默认为None，即当`predict_with_generate`设置为False时，设置为'loss'，否则设置为'rouge-l'（在PPO训练时，不进行默认值设置；GRPO训练设置为'reward'）。
diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md
@@ -168,6 +168,7 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with
 - logging_steps: Interval for logging, defaults to 5.
 - router_aux_loss_coef: Sets the weight of the aux_loss when training MoE models; default is `0.`
   - Note: In ms-swift == 3.7.0, the default is None and the value is read from config.json; this behavior was changed starting with ms-swift >= 3.7.1.
+- enable_dft_loss: Whether to use [DFT](https://arxiv.org/abs/2508.05629) (Dynamic Fine-Tuning) loss in SFT training, default is False.
 - logging_dir: The path for TensorBoard logs. Defaults to None, which means it is set to `f'{self.output_dir}/runs'`.
 - predict_with_generate: Whether to use generative method during validation, default is False.
 - metric_for_best_model: Default is None, which means that when predict_with_generate is set to False, it is set to 'loss'; otherwise, it is set to 'rouge-l' (during PPO training, the default value is not set; in GRPO training, it is set to 'reward').
diff --git a/examples/train/full/dft.sh b/examples/train/full/dft.sh
@@ -0,0 +1,22 @@
+# 4*80G
+# exp: https://github.com/modelscope/ms-swift/pull/5355
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+NPROC_PER_NODE=4 \
+swift sft \
+    --model Qwen/Qwen2.5-Math-1.5B \
+    --train_type full \
+    --dataset AI-MO/NuminaMath-CoT#100000 \
+    --torch_dtype bfloat16 \
+    --enable_dft_loss true \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 8 \
+    --learning_rate 5e-5 \
+    --gradient_accumulation_steps 32 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 2048 \
+    --output_dir output \
+    --system 'You are a helpful assistant.' \
+    --warmup_ratio 0.1 \
+    --deepspeed zero2 \
+    --dataloader_num_workers 4
diff --git a/swift/plugin/loss.py b/swift/plugin/loss.py
@@ -12,7 +12,7 @@
 from transformers.utils import strtobool
 
 
-def per_token_loss_func(outputs, labels, **kwargs):
+def per_token_loss_func(outputs, labels, enable_dft_loss, **kwargs):
     logits = outputs.logits
     # Upcast to float if we need to compute the loss to avoid potential precision issues
     logits = logits.float()
@@ -23,6 +23,10 @@ def per_token_loss_func(outputs, labels, **kwargs):
     # Enable model parallelism
     labels = labels.to(logits.device)
     loss = F.cross_entropy(logits, labels, ignore_index=-100, reduction='none')
+    if enable_dft_loss:
+        with torch.no_grad():
+            target_probs = torch.exp(-loss)
+        loss *= target_probs
     return loss
 
 
diff --git a/swift/trainers/arguments.py b/swift/trainers/arguments.py
@@ -35,6 +35,7 @@ class TrainArgumentsMixin:
     logging_first_step: bool = True
     logging_steps: int = 5
     router_aux_loss_coef: float = 0.
+    enable_dft_loss: bool = False  # https://arxiv.org/abs/2508.05629
 
     weight_decay: float = 0.1
     adam_beta2: float = 0.95
diff --git a/swift/trainers/sequence_parallel/ring_attention.py b/swift/trainers/sequence_parallel/ring_attention.py
@@ -216,9 +216,10 @@ def prepare_trainer(self, trainer):
         trainer.ring_attention = self
 
         if trainer.__class__.__name__ == 'Seq2SeqTrainer':
+            enable_dft_loss = trainer.args.enable_dft_loss
             trainer._origin_prepare_inputs = trainer._prepare_inputs
             trainer._prepare_inputs = MethodType(partial(_prepare_inputs, sp_instance=self), trainer)
-            trainer.compute_loss_func = partial(loss_scale_sp_func, sp_instance=self)
+            trainer.compute_loss_func = partial(loss_scale_sp_func, sp_instance=self, enable_dft_loss=enable_dft_loss)
 
         elif trainer.__class__.__name__ == 'DPOTrainer':
             trainer._origin_prepare_inputs = trainer._prepare_inputs
diff --git a/swift/trainers/sequence_parallel/ulysses.py b/swift/trainers/sequence_parallel/ulysses.py
@@ -326,9 +326,10 @@ def prepare_trainer(self, trainer):
 
         trainer.ulysses = self
         if trainer.__class__.__name__ == 'Seq2SeqTrainer':
+            enable_dft_loss = trainer.args.enable_dft_loss
             trainer._origin_prepare_inputs = trainer._prepare_inputs
             trainer._prepare_inputs = MethodType(partial(_prepare_inputs, sp_instance=self), trainer)
-            trainer.compute_loss_func = partial(loss_scale_sp_func, sp_instance=self)
+            trainer.compute_loss_func = partial(loss_scale_sp_func, sp_instance=self, enable_dft_loss=enable_dft_loss)
 
         elif trainer.__class__.__name__ == 'DPOTrainer':
             trainer._origin_prepare_inputs = trainer._prepare_inputs
diff --git a/swift/trainers/sequence_parallel/utils.py b/swift/trainers/sequence_parallel/utils.py
@@ -125,6 +125,7 @@ def loss_scale_sp_func(outputs,
                        loss_scale=None,
                        num_items_in_batch=None,
                        sp_instance=None,
+                       enable_dft_loss=False,
                        **kwargs) -> torch.Tensor:
     """Common loss function for sequence parallel training"""
     if hasattr(outputs, 'logits'):
@@ -146,6 +147,10 @@ def loss_scale_sp_func(outputs,
     else:
         loss_fct = CrossEntropyLoss(reduction='none')
         loss = loss_fct(logits, labels)
+    if enable_dft_loss:
+        with torch.no_grad():
+            target_probs = torch.exp(-loss)
+        loss *= target_probs
     if loss_scale is not None:
         loss_scale = loss_scale.flatten().to(device)
         loss = (loss_scale * loss)
diff --git a/swift/trainers/trainers.py b/swift/trainers/trainers.py
@@ -333,8 +333,8 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
         loss_scale = inputs.pop('loss_scale', None)
         loss_kwargs = inputs.pop('loss_kwargs', {})
 
-        if (self.label_smoother is not None or compute_loss_func is not None
-                or loss_scale is not None) and 'labels' in inputs:
+        if (self.label_smoother is not None or compute_loss_func is not None or loss_scale is not None
+                or self.args.enable_dft_loss) and 'labels' in inputs:
             labels = inputs.pop('labels')
         outputs = model(**inputs)
         if getattr(outputs, 'aux_loss', None) is not None:
@@ -360,10 +360,14 @@ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=N
             loss = outputs['loss'] if isinstance(outputs, dict) else outputs[0]
         else:
             outputs.loss = None
-            if loss_scale is not None:
-                loss_scale = torch.roll(loss_scale, shifts=-1, dims=-1).view(-1)
-                outputs.loss = get_loss_func('per_token_cross_entropy')(outputs, labels)
-                outputs.loss = outputs.loss * loss_scale
+            if self.args.enable_dft_loss or loss_scale is not None:
+                outputs.loss = get_loss_func('per_token_cross_entropy')(
+                    outputs, labels, enable_dft_loss=self.args.enable_dft_loss)
+
+                if loss_scale is not None:
+                    loss_scale = torch.roll(loss_scale, shifts=-1, dims=-1).view(-1)
+                    outputs.loss = outputs.loss * loss_scale
+
             unwrapped_model = self.accelerator.unwrap_model(model)
             if is_peft_available() and isinstance(unwrapped_model, PeftModel):
                 model_name = unwrapped_model.model._get_name()