Commit 3fee150

fix bug (#10856)
1 parent 831d818 commit 3fee150

2 files changed, with 3 additions and 1 deletion.


paddlenlp/transformers/deepseek_v2/modeling_pp.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -629,7 +629,7 @@ def combine_backward(self, output_grad, async_finish=False, allocate_on_comm_str
             output_combine_grad,
             async_finish=async_finish,
             previous_event=quant_event,
-            allocate_on_comm_stream=allocate_on_comm_stream,
+            allocate_on_comm_stream=allocate_on_comm_stream and quant_event is not None,
         )

         ret = (hidden_states_grad, residual_grad, l_aux_grad, hidden_states_out_grad)
```
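The change gates `allocate_on_comm_stream` on `quant_event` being set, so comm-stream allocation is never requested while `previous_event` is `None`. Below is a minimal, self-contained sketch of that guard pattern; `run_combine` and the `Event` class are hypothetical stand-ins (assumptions), not the real DeepEP/Paddle API used in modeling_pp.py.

```python
# Minimal sketch of the guard pattern in this commit. `run_combine` and
# `Event` are hypothetical stand-ins, not the real API.
from typing import Optional


class Event:
    """Hypothetical stand-in for a comm-stream event such as quant_event."""


def run_combine(grad, async_finish, previous_event, allocate_on_comm_stream):
    # Assumption: allocating on the comm stream without a previous event to
    # order against is invalid, which is the failure mode the fix avoids.
    if allocate_on_comm_stream and previous_event is None:
        raise ValueError("allocate_on_comm_stream requires a previous_event")
    return grad


def combine_backward(output_grad, quant_event: Optional[Event],
                     async_finish: bool = False,
                     allocate_on_comm_stream: bool = False):
    # The fix: only request comm-stream allocation when quant_event exists.
    return run_combine(
        output_grad,
        async_finish=async_finish,
        previous_event=quant_event,
        allocate_on_comm_stream=allocate_on_comm_stream and quant_event is not None,
    )


if __name__ == "__main__":
    # Before the fix, this call path passed allocate_on_comm_stream=True
    # together with previous_event=None; now the flag collapses to False.
    print(combine_backward([1.0], quant_event=None, allocate_on_comm_stream=True))
```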

paddlenlp/transformers/moe_utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -160,6 +160,7 @@ def forward(
             num_experts=num_experts,
             tokens_per_expert=tokens_per_expert,
             padding_multiplex=128,
+            fill_output=True,
         )
     else:
         with paddle.amp.auto_cast(False):
@@ -236,6 +237,7 @@ def backward(
             num_experts,
             tokens_per_expert,
             padding_multiplex=128,
+            fill_output=True,
         )
         return (unzipped_grad, unzipped_scale_grad)
     else:
```
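Both the forward and backward paths now pass `fill_output=True` to the padded op. A plausible reading (an assumption, since the callee is outside this hunk) is that the output buffer is padded per expert to a multiple of `padding_multiplex=128`, and the flag ensures the padding slots are initialized rather than left as uninitialized memory. The NumPy sketch below illustrates that behavior with a hypothetical `unzip_tokens`; it is not the real PaddleNLP op.

```python
# Sketch, under assumptions, of a fill_output-style flag for a padded
# MoE buffer. `unzip_tokens` is hypothetical, not the op the commit touches.
import numpy as np


def unzip_tokens(tokens, tokens_per_expert, padding_multiplex=128,
                 fill_output=True):
    """Scatter tokens into per-expert slots padded to a multiple of
    padding_multiplex. With fill_output=True the padding slots are
    zero-initialized instead of holding whatever was in memory."""
    padded = [-(-n // padding_multiplex) * padding_multiplex
              for n in tokens_per_expert]
    total = sum(padded)
    if fill_output:
        out = np.zeros((total, tokens.shape[1]), dtype=tokens.dtype)
    else:
        # Uninitialized buffer: padding rows may contain stale values.
        out = np.empty((total, tokens.shape[1]), dtype=tokens.dtype)
    src = dst = 0
    for n, p in zip(tokens_per_expert, padded):
        out[dst:dst + n] = tokens[src:src + n]
        src += n
        dst += p
    return out


if __name__ == "__main__":
    toks = np.ones((3, 4), dtype=np.float32)
    out = unzip_tokens(toks, tokens_per_expert=[2, 1])
    print(out.shape)  # (256, 4): two experts, each padded to 128 rows
```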
