
Commit fc2bc3d

update router_aux_loss_coef (#5318)
1 parent 8cf7603 commit fc2bc3d


8 files changed, 22 additions and 24 deletions


docs/source/Instruction/Megatron-SWIFT训练.md

Lines changed: 2 additions & 1 deletion
@@ -404,7 +404,8 @@ swift export \
 - moe_enable_deepep: Experimental feature. Enables DeepSeek/DeepEP for efficient token dispatch and combination in MoE models. Only takes effect when the flexible token dispatcher is selected via `--moe_token_dispatcher_type flex`.
 - 🔥moe_grouped_gemm: When each rank contains multiple experts, improves utilization and performance by launching multiple local GEMM kernels across multiple streams, using GroupedLinear from TransformerEngine. Defaults to False.
 - 🔥moe_permute_fusion: Fuses token permutation operations during token dispatch. Defaults to False.
-- 🔥moe_aux_loss_coeff: Scaling coefficient for the auxiliary loss; a recommended initial value is 1e-2. Defaults to None and is read automatically from config.json.
+- 🔥moe_aux_loss_coeff: Defaults to 0, which disables the aux_loss.
+  - Note: In ms-swift < 3.7.1, the default is None and the value is read automatically from config.json.
 - moe_z_loss_coeff: Scaling coefficient for the z-loss. Defaults to None.
 - moe_expert_capacity_factor: Capacity factor for each expert; None means no tokens are dropped. Defaults to None and is read automatically from config.json.
 - 🔥moe_shared_expert_overlap: Enables overlap between shared expert computation and dispatcher communication. If not enabled, shared experts execute after the routed experts. Only effective when `moe_shared_expert_intermediate_size` is set. Defaults to False.

docs/source/Instruction/命令行参数.md

Lines changed: 2 additions & 1 deletion
@@ -163,7 +163,8 @@
 - 🔥report_to: Defaults to `tensorboard`. You can also specify `--report_to tensorboard wandb swanlab` or `--report_to all`.
 - logging_first_step: Whether to log the first step. Defaults to True.
 - logging_steps: Logging interval. Defaults to 5.
-- router_aux_loss_coef: Sets the weight of the aux_loss when training MoE models. Defaults to None, which uses the value from the config. If set to 0, the aux_loss is not computed.
+- router_aux_loss_coef: Sets the weight of the aux_loss when training MoE models. Defaults to `0.`.
+  - Note: In ms-swift == 3.7.0, the default is None and the value is read from config.json; this behavior was changed in ms-swift >= 3.7.1.
 - logging_dir: TensorBoard log path. Defaults to None, i.e. set to `f'{self.output_dir}/runs'`.
 - predict_with_generate: Whether to use generation during validation. Defaults to False.
 - metric_for_best_model: Defaults to None, i.e. set to 'loss' when `predict_with_generate` is False, otherwise 'rouge-l' (no default is set for PPO training; for GRPO training it is set to 'reward').

docs/source_en/Instruction/Command-line-parameters.md

Lines changed: 2 additions & 1 deletion
@@ -166,7 +166,8 @@ This parameter list inherits from transformers `Seq2SeqTrainingArguments`, with
 - 🔥report_to: Default value is `tensorboard`. You can also specify `--report_to tensorboard wandb swanlab` or `--report_to all`.
 - logging_first_step: Whether to log the first step, defaults to True.
 - logging_steps: Interval for logging, defaults to 5.
-- router_aux_loss_coef: Weight for aux_loss when training MoE models. Defaults to None, meaning the value from the config is used. If set to 0, aux_loss is not computed.
+- router_aux_loss_coef: Sets the weight of the aux_loss when training MoE models; default is `0.`.
+  - Note: In ms-swift == 3.7.0, the default is None and the value is read from config.json; this behavior was changed starting with ms-swift >= 3.7.1.
 - logging_dir: The path for TensorBoard logs. Defaults to None, which means it is set to `f'{self.output_dir}/runs'`.
 - predict_with_generate: Whether to use generative method during validation, default is False.
 - metric_for_best_model: Default is None, which means that when predict_with_generate is set to False, it is set to 'loss'; otherwise, it is set to 'rouge-l' (during PPO training, the default value is not set; in GRPO training, it is set to 'reward').
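
A minimal sketch of what this coefficient does downstream, assuming an HF-style MoE causal-LM head (e.g. Mixtral-like models) that adds the router load-balancing loss to the language-modeling loss when `output_router_logits=True`; the function and variable names are illustrative, not ms-swift APIs:

```python
import torch

def combined_moe_loss(lm_loss: torch.Tensor,
                      aux_loss: torch.Tensor,
                      router_aux_loss_coef: float = 0.) -> torch.Tensor:
    # With the new default of 0., the balancing term contributes nothing and
    # training optimizes the language-modeling loss alone; a positive value
    # (e.g. 0.001) re-introduces pressure to spread tokens across experts.
    return lm_loss + router_aux_loss_coef * aux_loss
```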

docs/source_en/Instruction/Megatron-SWIFT-Training.md

Lines changed: 2 additions & 1 deletion
@@ -421,7 +421,8 @@ seq_length: Defaults to None, meaning it is set to `max_length`. To restrict the
 - moe_enable_deepep: Experimental feature; enables DeepSeek/DeepEP for efficient token dispatching and combination in MoE models. Only works when using the flexible token dispatcher by setting `--moe_token_dispatcher_type flex`.
 - 🔥moe_grouped_gemm: When each rank contains multiple experts, multiple local GEMM kernels can be launched in parallel streams to improve utilization and performance by using GroupedLinear from TransformerEngine. Default is False.
 - 🔥moe_permute_fusion: Fuses token permutation operations during token dispatch. Default is False.
-- 🔥moe_aux_loss_coeff: Scaling coefficient for the auxiliary loss; a recommended initial value is 1e-2. Default is None and is automatically read from config.json.
+- 🔥moe_aux_loss_coeff: Default is 0, which disables the aux_loss.
+  - Note: In ms-swift versions earlier than 3.7.1, the default is None and the value is automatically loaded from config.json.
 - moe_z_loss_coeff: Scaling coefficient for z-loss. Default is None.
 - moe_expert_capacity_factor: Capacity factor for each expert. None means no token will be dropped. Default is None and will be automatically read from config.json.
 - 🔥moe_shared_expert_overlap: Enables overlap between shared expert computation and dispatcher communication. If not enabled, shared expert computation will be performed after the routed experts. Only effective when `moe_shared_expert_intermediate_size` is set. Default is False.
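
For reference on what `moe_aux_loss_coeff` scales: MoE routers are commonly regularized with a Switch-Transformer-style load-balancing term, roughly coeff * N * sum_i(f_i * P_i). The sketch below is a generic illustration under that assumption and not Megatron-Core's exact implementation (which also offers sequence-level aux loss and other balancing types):

```python
import torch

def load_balancing_aux_loss(router_probs: torch.Tensor,
                            expert_mask: torch.Tensor,
                            moe_aux_loss_coeff: float = 1e-2) -> torch.Tensor:
    """Switch-Transformer-style balancing loss: coeff * N * sum_i f_i * P_i."""
    num_experts = router_probs.shape[-1]
    # f_i: fraction of tokens dispatched to expert i (from the one-hot routing mask)
    fraction_routed = expert_mask.float().mean(dim=0)
    # P_i: mean router probability assigned to expert i
    mean_router_prob = router_probs.mean(dim=0)
    return moe_aux_loss_coeff * num_experts * torch.sum(fraction_routed * mean_router_prob)
```

With the new default of 0, this term is simply dropped from the total loss.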

swift/megatron/argument/megatron_args.py

Lines changed: 1 addition & 3 deletions
@@ -217,7 +217,7 @@ class MegatronArguments(ExtraMegatronArguments):
     moe_enable_deepep: bool = False
     moe_grouped_gemm: bool = False
     moe_permute_fusion: bool = False
-    moe_aux_loss_coeff: Optional[float] = None
+    moe_aux_loss_coeff: float = 0.
     moe_z_loss_coeff: Optional[float] = None
     moe_expert_capacity_factor: Optional[float] = None
     moe_shared_expert_overlap: bool = False
@@ -315,8 +315,6 @@ def _set_default(self):
             self.moe_router_topk = 2
         if self.moe_router_pre_softmax is None:
             self.moe_router_pre_softmax = False
-        if self.moe_aux_loss_coeff is None:
-            self.moe_aux_loss_coeff = 0.
         if self.moe_router_load_balancing_type is None:
             self.moe_router_load_balancing_type = 'aux_loss'
         if self.moe_router_enable_expert_bias is None:

swift/megatron/model/config.py

Lines changed: 0 additions & 1 deletion
@@ -27,7 +27,6 @@
     'moe_router_topk': ['num_experts_per_tok', 'n_group', 'moe_topk', 'moe_k'],
     'num_experts': ['num_experts', 'n_routed_experts', 'moe_num_experts'],
     'moe_router_pre_softmax': ['norm_topk_prob'],
-    'moe_aux_loss_coeff': ['router_aux_loss_coef'],
     # deepseek
     'q_lora_rank': ['q_lora_rank'],
     'kv_lora_rank': ['kv_lora_rank'],

swift/trainers/arguments.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ class TrainArgumentsMixin:
     gradient_checkpointing_kwargs: Optional[Union[dict, str]] = None
     logging_first_step: bool = True
     logging_steps: int = 5
-    router_aux_loss_coef: Optional[float] = None
+    router_aux_loss_coef: float = 0.

     weight_decay: float = 0.1
     adam_beta2: float = 0.95

swift/trainers/trainers.py

Lines changed: 12 additions & 15 deletions
@@ -303,6 +303,8 @@ def prediction_step(
         return None, response_list, labels_list

     def _prepare_inputs(self, inputs):
+        from swift.llm import HfConfigFactory
+        args = self.args
         inputs = super()._prepare_inputs(inputs)
         from swift.plugin.loss import get_loss_func
         loss_kwargs = {}
@@ -315,7 +315,8 @@ def _prepare_inputs(self, inputs):

         sample_channels = inputs.pop('channel', None)
         position_ids = inputs.pop('_position_ids', None)
-        if self.args.channels is not None:
+        if args.channels is not None:
             assert sample_channels is not None, f'sample_channels: {sample_channels}'
             state = self.state
             setattr(state, 'local_step', getattr(state, 'local_step', 0))
@@ -334,22 +336,17 @@ def _prepare_inputs(self, inputs):
         inputs['labels'], logits_to_keep = self.get_logits_to_keep(inputs['labels'])
         if logits_to_keep is not None:
             inputs['logits_to_keep'] = logits_to_keep
-            if self.args.tuner_backend == 'unsloth' and isinstance(logits_to_keep, torch.Tensor):
+            if args.tuner_backend == 'unsloth' and isinstance(logits_to_keep, torch.Tensor):
                 inputs['logits_to_keep'] = int(logits_to_keep.sum())

-        if self.model.model_info.is_moe_model:
-            base_model = self.template.get_base_model(self.model)
-            router_aux_loss_coef = self.args.router_aux_loss_coef
-            if router_aux_loss_coef is None:
-                router_aux_loss_coef = getattr(base_model.config, 'router_aux_loss_coef', None)
-            if router_aux_loss_coef is not None:
-                from swift.llm import HfConfigFactory
-                HfConfigFactory.set_config_attr(base_model.config, 'router_aux_loss_coef', router_aux_loss_coef)
-                base_model.router_aux_loss_coef = router_aux_loss_coef
-                logger.info_once(f'router_aux_loss_coef: {router_aux_loss_coef}')
-                if router_aux_loss_coef > 0 and 'output_router_logits' in inspect.signature(
-                        base_model.forward).parameters:
-                    inputs['output_router_logits'] = True
+        base_model = self.template.get_base_model(self.model)
+        if self.model.model_info.is_moe_model and 'output_router_logits' in inspect.signature(
+                base_model.forward).parameters:
+            HfConfigFactory.set_config_attr(base_model.config, 'router_aux_loss_coef', args.router_aux_loss_coef)
+            base_model.router_aux_loss_coef = args.router_aux_loss_coef
+            logger.info_once(f'router_aux_loss_coef: {args.router_aux_loss_coef}')
+            if args.router_aux_loss_coef > 0:
+                inputs['output_router_logits'] = True
         inputs['compute_loss_func'] = compute_loss_func
         inputs['loss_kwargs'] = loss_kwargs
         return inputs
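
The key behavioral change above is that the trainer no longer falls back to `router_aux_loss_coef` from config.json; it always writes `args.router_aux_loss_coef` into the base model's config and only requests router logits when the coefficient is positive and the model's forward accepts the flag. A standalone, hedged sketch of that decision (the helper name is illustrative, not part of the repo):

```python
import inspect

def should_output_router_logits(base_model, router_aux_loss_coef: float) -> bool:
    # Only MoE architectures expose `output_router_logits` in forward();
    # checking the signature avoids passing an unexpected kwarg to dense models.
    accepts_flag = 'output_router_logits' in inspect.signature(base_model.forward).parameters
    # A coefficient of 0. (the new default) would make the aux_loss a no-op,
    # so the extra router-logit bookkeeping is skipped entirely.
    return accepts_flag and router_aux_loss_coef > 0
```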
