@@ -747,6 +747,18 @@ def forward(self, x):
 
 class FusedNormGateFunc(paddle.autograd.PyLayer):
     """recompute of postnorm and gate"""
+    _current_norm_output = None
+    _current_invar = None
+
+    @classmethod
+    def set_temporary_vars(cls, norm_output, invar):
+        cls._current_norm_output = norm_output
+        cls._current_invar = invar
+
+    @classmethod
+    def clear_temporary_vars(cls):
+        cls._current_norm_output = None
+        cls._current_invar = None
 
     @staticmethod
     def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps):
@@ -762,7 +774,12 @@ def forward(ctx, x, rms_norm_weight, moe_gate_weight, eps):
     def backward(ctx, d_gate_logits, d_norm_output):
         x, rms_norm_weight, moe_gate_weight, eps = ctx.saved_tensor()
         # recompute rmsnorm
-        norm_output, invar = fused_ln.fused_rms_norm(x, rms_norm_weight, eps)
+        norm_output = FusedNormGateFunc._current_norm_output
+        invar = FusedNormGateFunc._current_invar
+        if norm_output is None or invar is None:
+            raise RuntimeError("norm_output and invar must be set before backward!")
+
+        # norm_output, invar = fused_ln.fused_rms_norm(x, rms_norm_weight, eps)
         d_norm_output_linear, d_moe_gate_weight = paddle._C_ops.matmul_grad(
             cast_if_needed(norm_output, ctx.dtype),
             cast_if_needed(moe_gate_weight, ctx.dtype),
@@ -779,6 +796,16 @@ def backward(ctx, d_gate_logits, d_norm_output):
 
         return dx, d_rms_norm_weight, d_moe_gate_weight
 
+class TemporaryVarContext:
+    def __init__(self, norm_output, invar):
+        self.norm_output = norm_output
+        self.invar = invar
+
+    def __enter__(self):
+        FusedNormGateFunc.set_temporary_vars(self.norm_output, self.invar)
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        FusedNormGateFunc.clear_temporary_vars()
 
 def balance_expert_assignment(n, m, k):
     assert k * n % m == 0
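
A minimal usage sketch (an assumption for illustration, not part of the diff): the caller computes the RMSNorm output once, keeps TemporaryVarContext open around the backward pass so FusedNormGateFunc.backward reads the stashed tensors instead of re-running fused_ln.fused_rms_norm, and the context clears them afterwards. Variable names such as x, rms_norm_weight, moe_gate_weight, eps, and compute_loss are placeholders.

# Forward: PyLayer returns gate logits and the (recomputable) norm output.
gate_logits, norm_output = FusedNormGateFunc.apply(x, rms_norm_weight, moe_gate_weight, eps)
loss = compute_loss(gate_logits, norm_output)  # placeholder loss

# Before backward, produce the norm output once and stash it for reuse.
norm_output_rc, invar_rc = fused_ln.fused_rms_norm(x, rms_norm_weight, eps)
with TemporaryVarContext(norm_output_rc, invar_rc):
    loss.backward()  # backward consumes the cached norm_output/invar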