@@ -64,6 +64,9 @@ def __init__(
64
64
65
65
# Buffers saved before sleep
66
66
self ._sleep_saved_buffers : dict [str , torch .Tensor ] = {}
67
+
68
+ # Scheduled-token counts for which CUDA graph capture has already been triggered at execution time
69
+ self ._token_compiled_cudagraphs : set [int ] = set ()
67
70
68
71
# Torch profiler. Enabled and configured through env vars:
69
72
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
@@ -310,8 +313,8 @@ def compile_or_warm_up_model(self) -> None:
310
313
for size in sorted (warmup_sizes , reverse = True ):
311
314
logger .info ("Compile and warming up model for size %d" , size )
312
315
self .model_runner ._dummy_run (size , skip_eplb = True )
313
- if not self .model_config .enforce_eager :
314
- self .model_runner .capture_model ()
316
+ # if not self.model_config.enforce_eager:
317
+ # self.model_runner.capture_model()
315
318
316
319
# Warm up sampler and preallocate memory buffer for logits and other
317
320
# sampling related tensors of max possible shape to avoid memory
@@ -355,6 +358,12 @@ def execute_model(
355
358
get_pp_group ().recv_tensor_dict (
356
359
all_gather_group = get_tp_group ()))
357
360
361
+ # Capture the model lazily at execution time, the first time a given total scheduled-token count is seen
362
+ if scheduler_output .total_num_scheduled_tokens not in self ._token_compiled_cudagraphs :
363
+ logger .info ("DIEGO: CUDAgraph in execution time for %d input tokens" , scheduler_output .total_num_scheduled_tokens )
364
+ self ._token_compiled_cudagraphs .add (scheduler_output .total_num_scheduled_tokens )
365
+ self .model_runner .capture_model (scheduler_output .total_num_scheduled_tokens )
366
+
358
367
output = self .model_runner .execute_model (scheduler_output ,
359
368
intermediate_tensors )
360
369
0 commit comments