Commit 3fee150

fix bug (#10856)
1 parent 831d818 commit 3fee150

2 files changed, with 3 additions and 1 deletion.


paddlenlp/transformers/deepseek_v2/modeling_pp.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -629,7 +629,7 @@ def combine_backward(self, output_grad, async_finish=False, allocate_on_comm_str
             output_combine_grad,
             async_finish=async_finish,
             previous_event=quant_event,
-            allocate_on_comm_stream=allocate_on_comm_stream,
+            allocate_on_comm_stream=allocate_on_comm_stream and quant_event is not None,
         )

         ret = (hidden_states_grad, residual_grad, l_aux_grad, hidden_states_out_grad)
```
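The change gates `allocate_on_comm_stream` on `quant_event` being set, so comm-stream allocation is never requested while `previous_event` is `None`. Below is a minimal, self-contained sketch of that guard pattern; `run_combine` and the `Event` class are hypothetical stand-ins (assumptions), not the real DeepEP/Paddle API used in modeling_pp.py.

```python
# Minimal sketch of the guard pattern in this commit. `run_combine` and
# `Event` are hypothetical stand-ins, not the real API.
from typing import Optional


class Event:
    """Hypothetical stand-in for a comm-stream event such as quant_event."""


def run_combine(grad, async_finish, previous_event, allocate_on_comm_stream):
    # Assumption: allocating on the comm stream without a previous event to
    # order against is invalid, which is the failure mode the fix avoids.
    if allocate_on_comm_stream and previous_event is None:
        raise ValueError("allocate_on_comm_stream requires a previous_event")
    return grad


def combine_backward(output_grad, quant_event: Optional[Event],
                     async_finish: bool = False,
                     allocate_on_comm_stream: bool = False):
    # The fix: only request comm-stream allocation when quant_event exists.
    return run_combine(
        output_grad,
        async_finish=async_finish,
        previous_event=quant_event,
        allocate_on_comm_stream=allocate_on_comm_stream and quant_event is not None,
    )


if __name__ == "__main__":
    # Before the fix, this call path passed allocate_on_comm_stream=True
    # together with previous_event=None; now the flag collapses to False.
    print(combine_backward([1.0], quant_event=None, allocate_on_comm_stream=True))
```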

paddlenlp/transformers/moe_utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -160,6 +160,7 @@ def forward(
             num_experts=num_experts,
             tokens_per_expert=tokens_per_expert,
             padding_multiplex=128,
+            fill_output=True,
         )
     else:
         with paddle.amp.auto_cast(False):
@@ -236,6 +237,7 @@ def backward(
             num_experts,
             tokens_per_expert,
             padding_multiplex=128,
+            fill_output=True,
         )
         return (unzipped_grad, unzipped_scale_grad)
     else:
```
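Both the forward and backward paths now pass `fill_output=True` to the padded op. A plausible reading (an assumption, since the callee is outside this hunk) is that the output buffer is padded per expert to a multiple of `padding_multiplex=128`, and the flag ensures the padding slots are initialized rather than left as uninitialized memory. The NumPy sketch below illustrates that behavior with a hypothetical `unzip_tokens`; it is not the real PaddleNLP op.

```python
# Sketch, under assumptions, of a fill_output-style flag for a padded
# MoE buffer. `unzip_tokens` is hypothetical, not the op the commit touches.
import numpy as np


def unzip_tokens(tokens, tokens_per_expert, padding_multiplex=128,
                 fill_output=True):
    """Scatter tokens into per-expert slots padded to a multiple of
    padding_multiplex. With fill_output=True the padding slots are
    zero-initialized instead of holding whatever was in memory."""
    padded = [-(-n // padding_multiplex) * padding_multiplex
              for n in tokens_per_expert]
    total = sum(padded)
    if fill_output:
        out = np.zeros((total, tokens.shape[1]), dtype=tokens.dtype)
    else:
        # Uninitialized buffer: padding rows may contain stale values.
        out = np.empty((total, tokens.shape[1]), dtype=tokens.dtype)
    src = dst = 0
    for n, p in zip(tokens_per_expert, padded):
        out[dst:dst + n] = tokens[src:src + n]
        src += n
        dst += p
    return out


if __name__ == "__main__":
    toks = np.ones((3, 4), dtype=np.float32)
    out = unzip_tokens(toks, tokens_per_expert=[2, 1])
    print(out.shape)  # (256, 4): two experts, each padded to 128 rows
```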
