Commit 98a93ab

[bugfix] fix megatron loss_scale (#5406)
1 parent 8a527fe commit 98a93ab

4 files changed: +14 additions, -31 deletions

4 files changed

+14
-31
lines changed

docs/source/Instruction/命令行参数.md

Lines changed: 1 addition & 1 deletion
@@ -754,5 +754,5 @@ Besides the model-specific parameters of qwen2_5_vl and qwen2_audio, qwen2_5_omni also
 - NNODES: Pass-through of the `--nnodes` argument in torchrun.
 - NODE_RANK: Pass-through of the `--node_rank` argument in torchrun.
 - LOG_LEVEL: The log level, 'INFO' by default; it can be set to 'WARNING', 'ERROR', etc.
-- SWIFT_DEBUG: During `engine.infer(...)`, if set to '1', the contents of input_ids and generate_ids will be printed.
+- SWIFT_DEBUG: During `engine.infer(...)`, if set to '1', PtEngine will print the contents of input_ids and generate_ids.
 - VLLM_USE_V1: Used to switch between the V0 and V1 versions of vLLM.

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 1 addition & 1 deletion
@@ -771,5 +771,5 @@ For the meaning of the arguments, please refer to [here](https://modelscope.cn/m
 - NNODES: Pass-through for the `--nnodes` parameter in torchrun.
 - NODE_RANK: Pass-through for the `--node_rank` parameter in torchrun.
 - LOG_LEVEL: The log level, default is 'INFO'. You can set it to 'WARNING', 'ERROR', etc.
-- SWIFT_DEBUG: During `engine.infer(...)`, if set to '1', the content of input_ids and generate_ids will be printed.
+- SWIFT_DEBUG: When set to '1', the PtEngine will print the contents of input_ids and generate_ids during `engine.infer(...)`.
 - VLLM_USE_V1: Used to switch between V0 and V1 versions of vLLM.
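For context, a minimal sketch of how the SWIFT_DEBUG flag described in the two doc changes above would typically be enabled. Only the environment-variable line is taken from the docs; the commented-out PtEngine construction and `infer()` call are assumptions about the ms-swift API, not part of this commit.

```python
import os

# Enable debug printing of input_ids / generate_ids inside PtEngine (see the doc change above).
# The variable must be set before inference runs.
os.environ['SWIFT_DEBUG'] = '1'

# Hypothetical usage; the exact PtEngine constructor and infer() signature are assumptions.
# from swift.llm import PtEngine, InferRequest, RequestConfig
# engine = PtEngine('Qwen/Qwen2.5-7B-Instruct')
# responses = engine.infer([InferRequest(messages=[{'role': 'user', 'content': 'Hello'}])],
#                          RequestConfig(max_tokens=64))
```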

swift/megatron/init.py

Lines changed: 1 addition & 7 deletions
@@ -439,7 +439,6 @@ def forward(
         output, bias = self.linear_proj(core_attn_out)

         return output, bias
-        pass

     MultiLatentAttention.forward = forward
@@ -555,12 +554,7 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
             sequence_start = inference_context.sequence_len_offset
             sequence_end = sequence_start + q_len
             rotary_pos_emb = rotary_pos_emb[sequence_start:sequence_end]
-        else:
-            # Shorten rotary_pos_emb to the sequence length when inference_params
-            # is not provided. This makes sure we can run forward directly with
-            # any sequence length. During training, the sequence length is always
-            # the full rotary_pos_emb length.
-            rotary_pos_emb = rotary_pos_emb[0:q_len]
+        # Remove the else branch to fix cp.

         # [num_tokens, qk_pos_emb_head_dim] -> [num_tokens, 1, qk_pos_emb_head_dim]
         k_pos_emb = torch.unsqueeze(k_pos_emb, -2)
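The dropped else branch always took the first q_len rotary positions, which is presumably what broke context parallelism (cp): each cp rank only holds a slice of the sequence, so it needs the rotary embedding for its own (generally non-contiguous) positions rather than positions 0..q_len. A minimal illustration of that mismatch; the seq_len/cp_size values and the load-balanced chunk layout are assumptions for the example, not code from this commit.

```python
import torch

# Illustrative values only.
seq_len, cp_size = 8, 2
rotary_pos_emb = torch.arange(seq_len)   # stand-in for one embedding per position
q_len = seq_len // cp_size               # local sequence length on one cp rank

# Megatron-style load-balanced cp assigns each rank two non-contiguous chunks,
# e.g. with cp_size=2 rank 1 holds positions [2, 3] and [4, 5] out of 0..7.
rank1_positions = torch.tensor([2, 3, 4, 5])

print(rotary_pos_emb[0:q_len])           # tensor([0, 1, 2, 3])  <- what the removed branch would keep
print(rotary_pos_emb[rank1_positions])   # tensor([2, 3, 4, 5])  <- positions rank 1 actually needs
```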

swift/megatron/trainers/trainer.py

Lines changed: 11 additions & 22 deletions
@@ -1,5 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 from functools import partial
+from typing import Optional

 import megatron.core
 import torch
@@ -21,25 +22,18 @@
 class MegatronTrainer(BaseMegatronTrainer):

     # Code borrowed from NVIDIA/Megatron-LM
-    def loss_func(self, output_tensor: torch.Tensor, *, loss_mask: torch.Tensor):
-        """Loss function.
-
-        Args:
-            output_tensor (torch.Tensor): The tensor with the losses
-            loss_mask (torch.Tensor): Used to mask out some portions of the loss
-
-        Returns:
-            the loss scalar for this micro-batch
-            the number of non-padded tokens in this microbatch
-            a dict containing reporting metrics on the loss and number of tokens across
-            the data parallel ranks
-        """
+    def loss_func(self,
+                  output_tensor: torch.Tensor,
+                  *,
+                  labels: torch.Tensor,
+                  loss_scale: Optional[torch.Tensor] = None):
         args = get_args()

         losses = output_tensor.float()
-        loss_mask = loss_mask.view(-1).float()
-        total_tokens = loss_mask.sum()
-        loss = torch.cat([torch.sum(losses.view(-1) * loss_mask).view(1), total_tokens.view(1)])
+        if loss_scale is not None:
+            losses = losses * loss_scale
+        loss_mask = labels != -100
+        loss = torch.cat([torch.sum(losses * loss_mask).view(1), loss_mask.sum().view(1)])

         megatron_core_013 = version.parse(megatron.core.__version__) >= version.parse('0.13.0rc0')
         if args.context_parallel_size > 1 and not megatron_core_013:
@@ -109,9 +103,4 @@ def forward_step(self, data_iterator, model):
         with self.stimer:
             output_tensor = model(**data)
         labels = data.get('labels')
-        if loss_scale is None:
-            loss_mask = None if labels is None else (labels != -100).float()
-        else:
-            loss_scale[labels == -100] = 0
-            loss_mask = loss_scale
-        return output_tensor, partial(self.loss_func, loss_mask=loss_mask)
+        return output_tensor, partial(self.loss_func, labels=labels, loss_scale=loss_scale)
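The trainer change moves the masking into loss_func itself: the mask is rebuilt from labels != -100 and an optional per-token loss_scale multiplies the losses before reduction, instead of being folded into a precomputed float loss_mask. A standalone sketch of just that arithmetic, with the get_args()/context-parallel reduction parts omitted and made-up tensor values:

```python
from typing import Optional

import torch


def loss_func_sketch(output_tensor: torch.Tensor,
                     labels: torch.Tensor,
                     loss_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
    # Mirrors the core of the patched MegatronTrainer.loss_func above,
    # without the Megatron args / context-parallel reduction logic.
    losses = output_tensor.float()
    if loss_scale is not None:
        losses = losses * loss_scale      # optional per-token scaling
    loss_mask = labels != -100            # mask derived from labels, not passed in separately
    # [sum of masked (scaled) per-token losses, number of unmasked tokens]
    return torch.cat([torch.sum(losses * loss_mask).view(1), loss_mask.sum().view(1)])


# Toy example: one ignored token (label -100) and per-token scales of 1.0 / 0.5 / 2.0.
per_token_loss = torch.tensor([[1.0, 2.0, 3.0]])
labels = torch.tensor([[-100, 5, 7]])
scale = torch.tensor([[1.0, 0.5, 2.0]])
print(loss_func_sketch(per_token_loss, labels, scale))  # masked sum 7.0, token count 2
```

In the patched forward_step, the two inputs are then bound with `partial(self.loss_func, labels=labels, loss_scale=loss_scale)`, so loss_scale no longer has to be mutated into a loss_mask before the loss function runs.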
