diff --git a/paddlenlp/trainer/trainer.py b/paddlenlp/trainer/trainer.py
index 663932720a51..28ca0c80eb87 100644
--- a/paddlenlp/trainer/trainer.py
+++ b/paddlenlp/trainer/trainer.py
@@ -1220,6 +1220,17 @@ def fused_allreduce_gradients_no_sync(paramlist, hcg):
                             elif p.grad is not None:
                                 p.grad.scale_(1.0 / self.args.gradient_accumulation_steps)
 
+                for p in model._layers.parameters():
+                    if hasattr(p, "is_expert_weight") and p.is_expert_weight:
+                        print(f"param {p.name} is expert weight")
+                        with paddle.no_grad():
+                            if hasattr(p, "main_grad") and p.main_grad is not None:
+                                print("main grad scale 1/ep")
+                                p.main_grad.scale_(1.0 / self.args.expert_parallel_degree)
+                            elif p.grad is not None:
+                                print("grad scale 1/ep")
+                                p.grad.scale_(1.0 / self.args.expert_parallel_degree)
+
                 # Optimizer step
                 self.callback_handler.on_optimizer_begin(
                     args, self.state, self.control, scaler=self.scaler if self.do_grad_scaling else None
diff --git a/paddlenlp/transformers/deepseek_v2/modeling.py b/paddlenlp/transformers/deepseek_v2/modeling.py
index ba32fad3238c..1bdacdc92980 100644
--- a/paddlenlp/transformers/deepseek_v2/modeling.py
+++ b/paddlenlp/transformers/deepseek_v2/modeling.py
@@ -1045,6 +1045,11 @@ def __init__(self, config: DeepseekV2Config, norm_weight=None, norm_eps=None):
                 using_post_norm_recompute=self.using_post_norm_recompute,
             )
 
+        for expert in self.experts:
+            if expert is not None:
+                setattr(expert.w1, "is_expert_weight", True)
+                setattr(expert.w2, "is_expert_weight", True)
+
         if config.offline_quant_expert_weight and config.clear_origin_weight_when_offline_quant:
             moe_grad_group = fleet.get_hybrid_communicate_group().expert_grad_comm_group
             expert_w1_list = [expert.w1 for expert in self.experts if expert is not None]
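
For readers outside the PaddleNLP codebase, the sketch below condenses what the two hunks do together: the modeling change tags each expert's `w1`/`w2` parameters with an `is_expert_weight` attribute, and the trainer change later walks the parameters and rescales only the tagged gradients by `1 / expert_parallel_degree`, preferring `main_grad` when it exists and falling back to `.grad`. Everything else here is an assumption made for the sake of a runnable, standalone example: `ToyMoEBlock`, `rescale_expert_grads`, the hidden size, and the placeholder degree of 2 are hypothetical, whereas the real patch operates on `model._layers` inside the Trainer and reads `self.args.expert_parallel_degree`.

```python
# Minimal sketch of the idea in this patch, not the PaddleNLP Trainer itself.
import paddle
import paddle.nn as nn


class ToyMoEBlock(nn.Layer):
    """Toy stand-in for an MoE layer: a shared projection plus a few expert MLPs."""

    def __init__(self, hidden_size=8, num_experts=2):
        super().__init__()
        self.shared = nn.Linear(hidden_size, hidden_size)
        self.experts = nn.LayerList([nn.Linear(hidden_size, hidden_size) for _ in range(num_experts)])
        # Mirrors the modeling.py hunk: mark every expert parameter so the
        # training loop can recognize it later.
        for expert in self.experts:
            for p in expert.parameters():
                setattr(p, "is_expert_weight", True)

    def forward(self, x):
        out = self.shared(x)
        for expert in self.experts:
            out = out + expert(x)
        return out


def rescale_expert_grads(model, expert_parallel_degree):
    """Mirrors the trainer.py hunk: divide expert-weight gradients by the EP degree.

    `main_grad` is only present when master gradients are enabled (e.g. amp with
    main_grad), so fall back to `.grad` otherwise, just as the patch does.
    """
    with paddle.no_grad():
        for p in model.parameters():
            if not getattr(p, "is_expert_weight", False):
                continue
            if hasattr(p, "main_grad") and p.main_grad is not None:
                p.main_grad.scale_(1.0 / expert_parallel_degree)
            elif p.grad is not None:
                p.grad.scale_(1.0 / expert_parallel_degree)


if __name__ == "__main__":
    model = ToyMoEBlock()
    loss = model(paddle.randn([4, 8])).mean()
    loss.backward()
    # In the Trainer this value comes from self.args.expert_parallel_degree;
    # 2 is only a placeholder for the sketch.
    rescale_expert_grads(model, expert_parallel_degree=2)
```

Dividing by the expert-parallel degree sits right next to the existing division by `gradient_accumulation_steps`, and presumably offsets the extra summation expert gradients receive across the expert-parallel communication group; the patch itself does not state the rationale, so treat that reading as an interpretation rather than a confirmed design note.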