
Commit c9ec40a

Add post norm recompute. (#10824)
* Add post norm recompute.
* fix assert
* fix bug
1 parent ae4f27b commit c9ec40a

File tree

5 files changed: +366 -125 lines changed


paddlenlp/transformers/deepseek_v2/configuration.py

Lines changed: 2 additions & 0 deletions
@@ -181,6 +181,7 @@ def __init__(
         using_flex_token=False,
         use_dualpipev=False,
         send_mtp_embed=False,
+        using_post_norm_recompute=False,
         recompute_fwd_gate_up=False,
         is_split_group_gemm=False,
         **kwargs,
@@ -233,6 +234,7 @@ def __init__(
         self.using_flex_token = using_flex_token
         self.use_dualpipev = use_dualpipev
         self.send_mtp_embed = send_mtp_embed
+        self.using_post_norm_recompute = using_post_norm_recompute
         self.recompute_fwd_gate_up = recompute_fwd_gate_up
         self.is_split_group_gemm = is_split_group_gemm
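As a usage note, here is a minimal sketch of enabling the flag added above; the import path follows the file shown here, everything else is left at its defaults, and the flag itself defaults to False:

    from paddlenlp.transformers.deepseek_v2.configuration import DeepseekV2Config

    # Illustrative only: turn on post-norm recompute when building the model config.
    config = DeepseekV2Config(using_post_norm_recompute=True)
    print(config.using_post_norm_recompute)  # True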

paddlenlp/transformers/deepseek_v2/modeling.py

Lines changed: 139 additions & 29 deletions
@@ -74,7 +74,7 @@
 from ..model_utils import PretrainedModel, register_base_model
 from ..moe_gate import PretrainedMoEGate
 from ..moe_layer import MoELayer
-from ..utils import device_guard
+from ..utils import cast_if_needed, device_guard
 from . import fp8_linear as linear_utils
 from .configuration import DeepseekV2Config
 
@@ -737,6 +737,41 @@ def forward(self, x):
         return out
 
 
+class FusedNormGateFunc(paddle.autograd.PyLayer):
+    """recompute of postnorm and gate"""
+
+    @staticmethod
+    def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps):
+        ctx.dtype = paddle.float32
+        norm_output, invar = fused_ln.fused_rms_norm(x, rms_norm_weight, eps)
+        with paddle.amp.auto_cast(False):
+            gate_logits = F.linear(cast_if_needed(norm_output, ctx.dtype), cast_if_needed(moe_gate_weight, ctx.dtype))
+
+        ctx.save_for_backward(x, rms_norm_weight, moe_gate_weight, eps)
+        return gate_logits, norm_output
+
+    @staticmethod
+    def backward(ctx, d_gate_logits, d_norm_output):
+        x, rms_norm_weight, moe_gate_weight, eps = ctx.saved_tensor()
+        # recompute rmsnorm
+        norm_output, invar = fused_ln.fused_rms_norm(x, rms_norm_weight, eps)
+        d_norm_output_linear, d_moe_gate_weight = paddle._C_ops.matmul_grad(
+            cast_if_needed(norm_output, ctx.dtype),
+            cast_if_needed(moe_gate_weight, ctx.dtype),
+            d_gate_logits,
+            False,
+            False,
+        )
+        d_norm_output_linear, d_moe_gate_weight = cast_if_needed(
+            d_norm_output_linear, norm_output.dtype
+        ), cast_if_needed(d_moe_gate_weight, moe_gate_weight.dtype)
+
+        d_norm_output = d_norm_output + d_norm_output_linear
+        dx, d_rms_norm_weight = fused_ln.fused_rms_norm_grad_func(x, rms_norm_weight, invar, d_norm_output, eps)
+
+        return dx, d_rms_norm_weight, d_moe_gate_weight
+
+
 class FakeGate(paddle.autograd.PyLayer):
     @staticmethod
     def forward(ctx, hidden_states, weight):
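The pattern above is recompute-in-backward: forward saves only the raw inputs (x, the norm and gate weights, eps), and backward reruns fused_rms_norm instead of keeping norm_output and invar alive between the two passes, trading a small amount of extra compute for activation memory. Below is a self-contained sketch of the same idea with a simpler op so it can run outside this file (the class name and the SiLU choice are illustrative, not part of this commit):

    import paddle
    import paddle.nn.functional as F

    class RecomputeSilu(paddle.autograd.PyLayer):
        """Save only the input; redo the cheap forward math during backward."""

        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)  # keep x, drop silu(x)
            return F.silu(x)

        @staticmethod
        def backward(ctx, dy):
            (x,) = ctx.saved_tensor()
            s = F.sigmoid(x)  # recomputed here instead of cached in forward
            return dy * (s + x * s * (1 - s))  # d/dx [x * sigmoid(x)]

    # RecomputeSilu.apply(x) matches F.silu(x) but holds less activation memory between passes.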
@@ -756,7 +791,16 @@ def backward(ctx, grad_output):
 
 
 class MoEGate(PretrainedMoEGate):
-    def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
+    def __init__(
+        self,
+        config,
+        num_experts,
+        expert_hidden_size,
+        using_post_norm_recompute=False,
+        norm_weight=None,
+        norm_eps=None,
+        **kwargs
+    ):
         super().__init__(config, num_experts, expert_hidden_size, **kwargs)
         # [hidden_size, n_expert]
 
@@ -771,6 +815,8 @@ def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
         )
 
         self.config = config
+        self.using_post_norm_recompute = using_post_norm_recompute
+
         if config.topk_method == "noaux_tc":
             self.e_score_correction_bias = paddle.create_parameter(
                 shape=[num_experts],
@@ -779,6 +825,10 @@ def __init__(self, config, num_experts, expert_hidden_size, **kwargs):
             )
             self.e_score_correction_bias.is_distributed = True
 
+        if self.using_post_norm_recompute:
+            assert norm_weight is not None and norm_eps is not None
+            self.norm_weight = norm_weight
+            self.norm_eps = norm_eps
         self.using_flex_token = False
 
     def forward(self, hidden_states):
@@ -789,23 +839,33 @@ def forward(self, hidden_states):
         _, _, h_dim = hidden_states.shape
 
         # compute gating score
-        with paddle.amp.auto_cast(False):
-            hidden_states = hidden_states.cast(self.weight.dtype)
-
-            if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate:
-                logits = FakeGate.apply(hidden_states, self.weight)
-            else:
-                logits = F.linear(hidden_states, self.weight, None)
+        if self.using_post_norm_recompute:
+            logits, norm_out = FusedNormGateFunc.apply(hidden_states, self.norm_weight, self.weight, self.norm_eps)
+        else:
+            with paddle.amp.auto_cast(False):
+                hidden_states = hidden_states.cast(self.weight.dtype)
+                if hasattr(self.config, "using_fake_gate") and self.config.using_fake_gate:
+                    logits = FakeGate.apply(hidden_states, self.weight)
+                else:
+                    logits = F.linear(hidden_states, self.weight, None)
 
-            scores = self.gate_score_func(logits=logits)
-            scores = scores.cast(paddle.float32)
+        scores = self.gate_score_func(logits=logits)
+        scores = scores.cast(paddle.float32)
 
+        # Compute all possible return values
         if self.using_flex_token:
-            scores, routing_map, exp_counts, l_aux, l_zloss = self.topkgating_nodrop(scores)
-            return scores, routing_map, l_aux, l_zloss
+            scores, routing_map, exp_counts, l_aux, l_zloss = self.topkgating_nodrop(
+                scores
+            )  # (scores, routing_map, exp_counts, l_aux, l_zloss)
+            ret = (scores, routing_map, l_aux, l_zloss)
+        else:
+            ret = self.topkgating(scores)  # (capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss)
 
-        capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss = self.topkgating(scores)
-        return capacity, combine_weights, dispatch_mask, exp_counts, l_aux, l_zloss
+        # Append norm_out if needed
+        if self.using_post_norm_recompute:
+            ret = (*ret, norm_out)
+
+        return ret
 
 
 class AddAuxiliaryLoss(paddle.autograd.PyLayer):
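Net effect on the gate's contract: with using_post_norm_recompute enabled, norm_out (the RMSNorm output that can later be recomputed) is appended to whatever tuple forward would otherwise return, so callers unpack one extra element. A hedged caller-side sketch for the flex-token path, mirroring how DeepseekV2MoE.forward consumes it below:

    # using_post_norm_recompute=True and using_flex_token=True (variable names illustrative)
    probs, routing_map, l_aux, l_zloss, norm_out = gate(hidden_states)
    # with the flag off, the same call returns only (probs, routing_map, l_aux, l_zloss)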
@@ -833,9 +893,13 @@ class DeepseekV2MoE(MoELayer):
     A mixed expert module containing shared experts.
     """
 
-    def __init__(self, config: DeepseekV2Config):
+    def __init__(self, config: DeepseekV2Config, norm_weight=None, norm_eps=None):
         assert config.tensor_parallel_degree <= 1, "tensor_parallel_degree should be 1"
 
+        self.using_post_norm_recompute = config.using_post_norm_recompute
+        if self.using_post_norm_recompute:
+            assert norm_weight is not None and norm_eps is not None
+
         gate = MoEGate(
             config=config,
             num_experts=config.n_routed_experts,
@@ -847,6 +911,9 @@ def __init__(self, config: DeepseekV2Config):
             norm_topk_prob=config.norm_topk_prob,
             routed_scaling_factor=config.routed_scaling_factor,
             drop_tokens=False,
+            using_post_norm_recompute=self.using_post_norm_recompute,
+            norm_weight=norm_weight,
+            norm_eps=norm_eps,
         )
         DeepseekV2MLPClass = FP8Mlp if DSV3_USE_FP8_GEMM else DeepseekV2MLP
 
@@ -862,6 +929,7 @@ def __init__(self, config: DeepseekV2Config):
             gate=gate,
             capacity=2.0,
             moe_group="expert",
+            using_post_norm_recompute=self.using_post_norm_recompute,
         )
 
         moe_grad_group = fleet.get_hybrid_communicate_group().expert_grad_comm_group
@@ -871,11 +939,44 @@ def __init__(self, config: DeepseekV2Config):
         self.alpha = config.aux_loss_alpha
         if config.n_shared_experts is not None:
             intermediate_size = config.moe_intermediate_size * config.n_shared_experts
-            self.shared_experts = DeepseekV2MLPClass(config=config, intermediate_size=intermediate_size, is_moe=False)
+            if self.using_post_norm_recompute:
+                assert DeepseekV2MLPClass is FP8Mlp
+                self.shared_experts = DeepseekV2MLPClass(
+                    config=config,
+                    intermediate_size=intermediate_size,
+                    is_moe=False,
+                    using_post_norm_recompute=self.using_post_norm_recompute,
+                    norm_weight=norm_weight,
+                    norm_eps=norm_eps,
+                )
+            else:
+                self.shared_experts = DeepseekV2MLPClass(
+                    config=config, intermediate_size=intermediate_size, is_moe=False
+                )
 
     def forward(self, hidden_states):
-        final_hidden_states, l_aux, l_zloss = super().forward(hidden_states)
-        final_hidden_states = self.post_process(hidden_states, final_hidden_states, l_aux)
+        if self.using_post_norm_recompute:
+            super().update_flex_token()
+            if self.using_flex_token:
+                probs, routing_map, l_aux, l_zloss, norm_out = self.router(hidden_states)
+                final_hidden_states, l_aux, l_zloss = super().forward(
+                    norm_out, probs=probs, routing_map=routing_map, l_aux=l_aux, l_zloss=l_zloss
+                )
+            else:
+                capacity, topk_weight, topk_ids, token_priority, l_aux, l_zloss, norm_out = self.gate(hidden_states)
+                final_hidden_states, l_aux, l_zloss = super().forward(
+                    norm_out,
+                    capacity=capacity,
+                    topk_weight=topk_weight,
+                    topk_ids=topk_ids,
+                    token_priority=token_priority,
+                    l_aux=l_aux,
+                    l_zloss=l_zloss,
+                )
+            final_hidden_states = self.post_process(hidden_states, final_hidden_states, l_aux)
+        else:
+            final_hidden_states, l_aux, l_zloss = super().forward(hidden_states)
+            final_hidden_states = self.post_process(hidden_states, final_hidden_states, l_aux)
         return final_hidden_states
 
     def post_process(self, hidden_states, final_hidden_states, l_aux):
@@ -1774,25 +1875,32 @@ def __init__(self, config: DeepseekV2Config, layer_idx: int, layerwise_recompute
         self.enable_recompute = False
         self.layerwise_recompute = layerwise_recompute
         self.recompute_granularity = config.recompute_granularity
+        self.using_post_norm_recompute = config.using_post_norm_recompute
 
         self.hidden_size = config.hidden_size
 
         self.self_attn = DeepseekV2Attention(config=config, layerwise_recompute=layerwise_recompute)
 
         DeepseekV2MLPClass = FP8Mlp if DSV3_USE_FP8_GEMM else DeepseekV2MLP
 
-        self.mlp = (
-            DeepseekV2MoE(config)
-            if (
-                config.n_routed_experts is not None
-                and layer_idx >= config.first_k_dense_replace
-                and layer_idx % config.moe_layer_freq == 0
-            )
-            else DeepseekV2MLPClass(config)
-        )
         self.input_layernorm = DeepseekV2RMSNorm(config)
         self.post_attention_layernorm = DeepseekV2RMSNorm(config)
 
+        if (
+            config.n_routed_experts is not None
+            and layer_idx >= config.first_k_dense_replace
+            and layer_idx % config.moe_layer_freq == 0
+        ):
+            self.mlp = (
+                DeepseekV2MoE(
+                    config, self.post_attention_layernorm.weight, self.post_attention_layernorm.variance_epsilon
+                )
+                if config.using_post_norm_recompute
+                else DeepseekV2MoE(config)
+            )
+        else:
+            self.mlp = DeepseekV2MLPClass(config)
+
     def forward(
         self,
         hidden_states: paddle.Tensor,
@@ -1871,7 +1979,9 @@ def forward(
         # Fully Connected
         residual = hidden_states
 
-        hidden_states = self.post_attention_layernorm(hidden_states)
+        if not (self.using_post_norm_recompute and isinstance(self.mlp, DeepseekV2MoE)):
+            hidden_states = self.post_attention_layernorm(hidden_states)
+
         hidden_states = self.mlp(hidden_states)
         hidden_states = residual + hidden_states
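Reading the last two hunks together: the post-attention RMSNorm module is still constructed, but for MoE layers its weight and epsilon are handed to DeepseekV2MoE, and the decoder layer skips its own norm call because FusedNormGateFunc applies (and later recomputes) that norm inside the gate. A condensed, hedged view of the resulting FFN sub-block, with comments added for orientation:

    # Sketch of the decoder layer's FFN path once the flag is on (names as in the diff above)
    residual = hidden_states
    if not (self.using_post_norm_recompute and isinstance(self.mlp, DeepseekV2MoE)):
        hidden_states = self.post_attention_layernorm(hidden_states)  # dense-MLP layers: norm as before
    hidden_states = self.mlp(hidden_states)  # MoE layers: the norm runs inside the gate recompute
    hidden_states = residual + hidden_states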

0 commit comments
