@@ -36,7 +36,6 @@ def __init__(self) -> None:
3636 self .prefill = self .prefill_overlap_mtp
3737 else :
3838 self .prefill = self .prefill_mtp
39-
4039 if self .enable_decode_microbatch_overlap :
4140 self .decode = self .decode_overlap_mtp
4241 else :
@@ -543,7 +542,6 @@ def prefill_overlap_mtp(self, event_pack: OverlapEventPack, prefill_reqs: List[I
543542 run_reqs1 ,
544543 padded_req_num1 ,
545544 ) = padded_overlap_prepare_prefill_inputs (prefill_reqs , is_multimodal = self .is_multimodal )
546- print (micro_input0 , micro_input1 )
547545 with torch .cuda .stream (g_infer_context .get_overlap_stream ()):
548546 micro_output0 , micro_output1 = self .model .microbatch_overlap_prefill (micro_input0 , micro_input1 )
549547 logits0 = micro_output0 .logits
@@ -622,7 +620,6 @@ def prefill_overlap_mtp(self, event_pack: OverlapEventPack, prefill_reqs: List[I
622620
623621 event_pack .notify_forward_and_wait_post_handle ()
624622 sync_event .synchronize ()
625- print (next_token_ids_cpu )
626623
627624 self ._post_handle (
628625 run_reqs = run_reqs ,
@@ -767,6 +764,7 @@ def decode_overlap_mtp(self, event_pack: OverlapEventPack, decode_reqs: List[Inf
767764 g_infer_state_lock .acquire ()
768765 g_infer_context .req_manager .mem_manager .free (need_free_mem_indexes )
769766 g_infer_state_lock .release ()
767+ event_pack .notify_pre_post_handle ()
770768 else :
771769 event_pack .notify_post_handle_and_wait_pre_post_handle ()
772770 event_pack .notify_forward_and_wait_post_handle ()
0 commit comments