
Commit fe9e9f8

fix bug (#11006)
1 parent 6e6b373 commit fe9e9f8

2 files changed: +2 -2 lines changed

paddlenlp/transformers/deepseek_v2/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -2572,7 +2572,7 @@ def forward(
      hidden_states = self.hnorm(hidden_states)
      nextn_hidden_state = self.enorm(nextn_hidden_state)

-     concat_h = paddle.concat([hidden_states, nextn_hidden_state], axis=-1)
+     concat_h = paddle.concat([nextn_hidden_state, hidden_states], axis=-1)
      hidden_states = LMHeadFunction.apply(concat_h, self.eh_proj.weight, False)

      layer_outputs = super(DeepseekV2MTPLayer, self).forward(
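
Why the order matters: eh_proj is a single linear projection over the concatenated pair, so its weight rows are tied to one fixed layout of the input. A minimal, self-contained Paddle sketch (hypothetical shapes and names, not the PaddleNLP code) showing that swapping the concat order changes the projection output unless the weight halves are swapped to match:

import paddle

hidden_size = 4
h = paddle.randn([1, hidden_size])  # stands in for hnorm(hidden_states)
e = paddle.randn([1, hidden_size])  # stands in for enorm(nextn_hidden_state)
w = paddle.randn([2 * hidden_size, hidden_size])  # stands in for eh_proj.weight

# The projection sees only one flat vector, so [h, e] and [e, h] hit
# different halves of w and generally give different outputs.
out_he = paddle.matmul(paddle.concat([h, e], axis=-1), w)
out_eh = paddle.matmul(paddle.concat([e, h], axis=-1), w)
print(paddle.allclose(out_he, out_eh))  # almost surely False for a random w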

paddlenlp/transformers/deepseek_v2/modeling_pp.py

Lines changed: 1 addition & 1 deletion
@@ -1814,7 +1814,7 @@ def attn_compute_for_fusion(self, args):
      hidden_states = self.hnorm(hidden_states)
      nextn_hidden_state = self.enorm(nextn_hidden_state)

-     hidden_states = self.eh_proj(paddle.concat([hidden_states, nextn_hidden_state], axis=-1))
+     hidden_states = self.eh_proj(paddle.concat([nextn_hidden_state, hidden_states], axis=-1))

      # attention compute
      hidden_states, residual = self.self_attn_compute(hidden_states)
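
Both files feed the same eh_proj weight, one through the Linear layer (modeling_pp.py) and one through LMHeadFunction.apply with eh_proj.weight (modeling.py), so after this change both paths use the same [nextn_hidden_state, hidden_states] order. A small hedged sketch (assumed shapes; a plain matmul stands in for LMHeadFunction.apply) of how the two call styles agree once the concat order matches:

import paddle
import paddle.nn as nn

hidden_size = 4
eh_proj = nn.Linear(2 * hidden_size, hidden_size, bias_attr=False)

h = paddle.randn([1, hidden_size])  # stands in for hnorm(hidden_states)
e = paddle.randn([1, hidden_size])  # stands in for enorm(nextn_hidden_state)
concat_h = paddle.concat([e, h], axis=-1)  # the order both files now use

via_layer = eh_proj(concat_h)                         # modeling_pp.py style call
via_weight = paddle.matmul(concat_h, eh_proj.weight)  # stand-in for LMHeadFunction.apply
print(paddle.allclose(via_layer, via_weight))  # True: same order, same weight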
