Skip to content

Commit 7d6632b

Browse files
committed
Enable autotuner warmup for drafting loops
Signed-off-by: ziyixiong-nv <[email protected]>
1 parent 6970e13 commit 7d6632b

File tree

1 file changed

+6
-4
lines changed

1 file changed

+6
-4
lines changed

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -895,8 +895,6 @@ def _create_warmup_request(
895895
return None
896896

897897
num_extra_decoding_steps = self._get_num_extra_decoding_steps()
898-
if num_extra_decoding_steps > 0:
899-
return None # Disable autotuning for fused drafting loops for now.
900898

901899
if num_gen_requests > self.batch_size:
902900
return None
@@ -909,7 +907,10 @@ def _create_warmup_request(
909907
ctx_requests = []
910908
gen_requests = []
911909

912-
max_seq_len = self.max_seq_len - 1
910+
# For drafting loops, reduce max_seq_len to leave room for extra decoding steps
911+
max_seq_len = self.max_seq_len - 1 - num_extra_decoding_steps
912+
if max_seq_len < 1:
913+
return None # Not enough sequence length for drafting loop
913914
num_full_seqs = 0
914915
num_left_over_tokens = 0
915916

@@ -954,7 +955,8 @@ def _create_warmup_request(
954955
token_nums=ctx_token_nums,
955956
is_gen=False,
956957
max_num_draft_tokens=self.runtime_draft_len,
957-
use_mrope=self.use_mrope)
958+
use_mrope=self.use_mrope,
959+
num_extra_decoding_steps=num_extra_decoding_steps)
958960

959961
if spec_resource_manager is not None:
960962
spec_resource_manager.add_dummy_requests(

0 commit comments

Comments (0)