
Commit a48c978

fix rebuild_padding and step_paddle
1 parent 978be5b commit a48c978

File tree

3 files changed: +33 -6 lines changed

  fastdeploy/model_executor/ops/npu/rebuild_padding.py
  fastdeploy/model_executor/pre_and_post_process.py
  fastdeploy/worker/npu_model_runner.py


fastdeploy/model_executor/ops/npu/rebuild_padding.py

Lines changed: 6 additions & 1 deletion
@@ -12,7 +12,9 @@ def rebuild_padding(
     padding_offset,
     max_model_len
 ):
-    model_output=paddle.cast(model_output, paddle.float16)
+    # Cast to float16 for NPU kernel as required, then cast back to original dtype
+    original_dtype = model_output.dtype
+    model_output = paddle.cast(model_output, paddle.float16)

     out = core.eager._run_custom_op(
         "rebuild_padding_v2",
@@ -23,5 +25,8 @@ def rebuild_padding(
         max_model_len
     )[0]

+    # Cast back to original dtype to maintain consistency
+    out = paddle.cast(out, original_dtype)
+

     return out
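
The change above keeps the NPU kernel's float16 requirement from changing the dtype seen by the caller. Below is a minimal sketch of the same round-trip pattern, not FastDeploy code: run_fp16_npu_op and npu_op are hypothetical names standing in for the custom-op call.

import paddle

def run_fp16_npu_op(x, npu_op):
    """Run a float16-only kernel while preserving the caller's dtype.

    Mirrors the pattern in rebuild_padding above; `npu_op` stands in for
    core.eager._run_custom_op("rebuild_padding_v2", ...).
    """
    original_dtype = x.dtype
    x = paddle.cast(x, paddle.float16)       # kernel expects float16
    out = npu_op(x)                          # run the float16-only kernel
    return paddle.cast(out, original_dtype)  # hand back the original dtype

# The output dtype matches the input dtype even though the kernel ran in float16.
x = paddle.randn([4, 8], dtype="float32")
y = run_fp16_npu_op(x, lambda t: t)          # identity stand-in for the custom op
assert y.dtype == x.dtype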

fastdeploy/model_executor/pre_and_post_process.py

Lines changed: 5 additions & 0 deletions
@@ -307,12 +307,14 @@ def post_process_normal(
     # In the future, we will abandon this approach.
     if not skip_save_output:
         if sampler_output.logprobs_tensors is None:
+            print("<><><><><>before save_output")
             save_output(
                 sampler_output.sampled_token_ids,
                 model_output.not_need_stop,
                 model_output.mp_rank,
                 save_each_rank, # save_each_rank
             )
+            print("<><><><><>after save_output")
         else:
             save_output_topk(
                 sampler_output.sampled_token_ids,
@@ -322,6 +324,7 @@ def post_process_normal(
                 model_output.not_need_stop,
                 model_output.mp_rank,
             )
+    print("<><><><><>end of this")


 def post_process_specualate(model_output, save_each_rank: bool = False, skip_save_output: bool = False):
@@ -378,7 +381,9 @@ def post_process(
     if speculative_decoding:
         post_process_specualate(model_output, save_each_rank, skip_save_output)
     else:
+        print("<><><><><>before post_process_normal")
         post_process_normal(sampler_output, model_output, share_inputs, block_size, save_each_rank, skip_save_output)
+        print("<><><><><>after post_process_normal")


 def step_cuda(
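
The prints added in this file bracket save_output and post_process_normal to show how far execution gets. As a side note, the same bracketing can be kept in one place with a small context manager; the sketch below is not part of the commit, and trace / TRACE_ENABLED are made-up names.

import time
from contextlib import contextmanager

TRACE_ENABLED = True  # flip to False to silence every marker at once

@contextmanager
def trace(label):
    """Print before/after markers (plus elapsed time) around a block of code."""
    if TRACE_ENABLED:
        print(f"<><><><><>before {label}")
    start = time.perf_counter()
    try:
        yield
    finally:
        if TRACE_ENABLED:
            print(f"<><><><><>after {label} ({time.perf_counter() - start:.3f}s)")

# Usage, mirroring the markers added in this diff:
#     with trace("save_output"):
#         save_output(sampler_output.sampled_token_ids, ...)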

fastdeploy/worker/npu_model_runner.py

Lines changed: 22 additions & 5 deletions
@@ -814,22 +814,39 @@ class at the server level, which is too granular for ModelRunner.
             accept_num=None,
         )
         # Create proper SamplerOutput object from the tensor
-        print("<><><><><>before_sampler_output")
         sampler_output = SamplerOutput(
             sampled_token_ids=next_tokens,
             logprobs_tensors=None,
         )

-        print("<><><><><>before_post_process")
         post_process(sampler_output=sampler_output, model_output=model_output_data, share_inputs=self.share_inputs)
-        print("<><><><><>after_post_process")

         # 7. Updata 'infer_seed' and step_paddle()
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
         self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
-        print("<><><><><>before_step_paddle")
         step_paddle(
-            self.share_inputs,
+            self.share_inputs["stop_flags"],
+            self.share_inputs["seq_lens_this_time"],
+            self.share_inputs["ori_seq_lens_encoder"],
+            self.share_inputs["seq_lens_encoder"],
+            self.share_inputs["seq_lens_decoder"],
+            self.share_inputs["block_tables"],
+            self.share_inputs["encoder_block_lens"],
+            self.share_inputs["is_block_step"],
+            self.share_inputs["step_block_list"],
+            self.share_inputs["step_lens"],
+            self.share_inputs["recover_block_list"],
+            self.share_inputs["recover_lens"],
+            self.share_inputs["need_block_list"],
+            self.share_inputs["need_block_len"],
+            self.share_inputs["used_list_len"],
+            self.share_inputs["free_list"],
+            self.share_inputs["free_list_len"],
+            self.share_inputs["input_ids"],
+            self.share_inputs["pre_ids"],
+            self.share_inputs["step_idx"],
+            self.share_inputs["next_tokens"],
+            self.share_inputs["first_token_ids"],
             self.cache_config.block_size,
             self.cache_config.enc_dec_block_num,
         )
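
step_paddle now receives each shared-input tensor positionally instead of the whole share_inputs dict, so the argument order must match the custom op's signature exactly. Below is a hedged sketch of one way to keep that order in a single place, assuming the positional order is exactly the key order shown in the diff; STEP_PADDLE_KEYS and call_step_paddle are illustrative names, not part of the commit.

# Hypothetical helper: the key order must match step_paddle's positional
# signature, taken here from the argument order in the diff above.
STEP_PADDLE_KEYS = (
    "stop_flags", "seq_lens_this_time", "ori_seq_lens_encoder", "seq_lens_encoder",
    "seq_lens_decoder", "block_tables", "encoder_block_lens", "is_block_step",
    "step_block_list", "step_lens", "recover_block_list", "recover_lens",
    "need_block_list", "need_block_len", "used_list_len", "free_list",
    "free_list_len", "input_ids", "pre_ids", "step_idx", "next_tokens",
    "first_token_ids",
)

def call_step_paddle(step_paddle, share_inputs, block_size, enc_dec_block_num):
    """Unpack the shared-input tensors in the required order and invoke step_paddle."""
    step_paddle(
        *(share_inputs[key] for key in STEP_PADDLE_KEYS),
        block_size,
        enc_dec_block_num,
    )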
