 import copy
 import gc
 import os
-import time
 from contextlib import AbstractContextManager, nullcontext
 from typing import TYPE_CHECKING, Any, Optional

@@ -67,7 +66,7 @@ def __init__(
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}

         # executed cuda graph
-        self._token_compiled_cudagraphs: set[int] = set()
+        self._token_compiled_cudagraphs: set[int] = set(self.compilation_config.cudagraph_capture_sizes)

         # Torch profiler. Enabled and configured through env vars:
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
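Note: with this change _token_compiled_cudagraphs effectively flips meaning. Instead of starting empty and recording sizes whose CUDA graphs were already captured, it now starts out holding every configured capture size and is drained as graphs are captured lazily during execution (see the execute_model hunk below). A minimal sketch of that pending-set lifecycle, using made-up sizes in place of compilation_config.cudagraph_capture_sizes:

    # Sketch only: the set now tracks capture sizes that are still *pending*.
    cudagraph_capture_sizes = [256, 128, 64]            # placeholder values
    pending_capture_sizes: set[int] = set(cudagraph_capture_sizes)

    # Capturing a graph removes its size from the pending set.
    pending_capture_sizes.discard(128)
    assert pending_capture_sizes == {64, 256}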
@@ -359,58 +358,31 @@ def execute_model(
                 get_pp_group().recv_tensor_dict(
                     all_gather_group=get_tp_group()))

-        # logger.info("DIEGO: Executing the model")
-        # Adding capture model in execution time
-        # if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs:
-        #     logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
-        #     self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
-        #     self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
-
-        def compile_cuda_graph(input_size: int):
-            gc.freeze()
-            start_time = time.perf_counter()
-            self.model_runner._dummy_run(input_size,
-                                         capture_attn_cudagraph=False,
-                                         skip_eplb=True)
-            end_time = time.perf_counter()
-            gc.unfreeze()
-            elapsed_time = end_time - start_time
-            logger.info("Graph capturing finished in %.3f secs", elapsed_time)
-
-        # # ATTENTION: This code is duplicated in compile_or_warm_up_model method
-        # # so we should clean this part before creating the vllm PR
-        # # warm up sizes that are not in cudagraph capture sizes,
-        # # but users still want to compile for better performance,
-        # # e.g. for the max-num-batched token size in chunked prefill.
-        # warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
-        # logger.info("Warm up sizes %s", str(warmup_sizes))
-        # if not self.model_config.enforce_eager:
-        #     warmup_sizes = [
-        #         x for x in warmup_sizes if x not in
-        #         self.vllm_config.compilation_config.cudagraph_capture_sizes
-        #     ]
-
-        # warmup_sizes_set = set(warmup_sizes)
-
-        self.cudagraph_batch_sizes_set = set(
-            reversed(self.compilation_config.cudagraph_capture_sizes))
-        # Just compilation with dummy run
-        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in self.cudagraph_batch_sizes_set and scheduler_output.total_num_scheduled_tokens != 0:
+
+        # Initialize next_capture: the graph size to capture on this step, if any
+        next_capture = None
+
+        # Check whether the scheduled token count is a capture size still pending
+        if scheduler_output.total_num_scheduled_tokens in self._token_compiled_cudagraphs:
+            # If so, capture that size now and remove it from _token_compiled_cudagraphs
+            next_capture = scheduler_output.total_num_scheduled_tokens
+            self._token_compiled_cudagraphs.discard(
+                scheduler_output.total_num_scheduled_tokens)
             logger.info(
                 "LAZY DIEGO: CUDAgraph in execution time for %d input tokens",
-                scheduler_output.total_num_scheduled_tokens)
-            self._token_compiled_cudagraphs.add(
-                scheduler_output.total_num_scheduled_tokens)
-            compile_cuda_graph(scheduler_output.total_num_scheduled_tokens)
-        else:
-            next_comp_set = self.cudagraph_batch_sizes_set.difference(self._token_compiled_cudagraphs)
-            if len(next_comp_set) != 0:
-                next_comp = list(next_comp_set)
-                self._token_compiled_cudagraphs.add(next_comp[0])
-                logger.info(
-                    "DELAYED DIEGO: CUDAgraph in execution time for %d input tokens",
-                    next_comp[0])
-                compile_cuda_graph(next_comp[0])
+                next_capture)
+
+        # Otherwise, check whether any capture sizes are still pending
+        elif len(self._token_compiled_cudagraphs) > 0:
+            # If so, take one of the remaining sizes and capture it on this step
+            next_capture = self._token_compiled_cudagraphs.pop()
+            logger.info(
+                "DELAYED DIEGO: CUDAgraph in execution time for %d input tokens",
+                next_capture)
+
+        # If a size was selected, capture its CUDA graph before running the model
+        if next_capture:
+            self.model_runner.capture_model(next_capture)

         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
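Taken together, the two hunks replace up-front CUDA graph capture with a simple per-step scheduler: each execute_model call captures at most one graph, preferring the size that is about to run (the "LAZY" case) and otherwise draining an arbitrary pending size (the "DELAYED" case). A minimal, self-contained sketch of that scheduling policy; the names pending_sizes, capture_graph, and run_model are placeholders for illustration, not the actual vLLM worker APIs:

    # Sketch only: simulates the "at most one CUDA graph capture per step" policy
    # from the diff above, with stand-in functions instead of the real worker calls.

    def capture_graph(size: int) -> None:
        print(f"capturing CUDA graph for {size} tokens")

    def run_model(num_tokens: int) -> None:
        print(f"running model on {num_tokens} scheduled tokens")

    def execute_step(pending_sizes: set[int], num_scheduled_tokens: int) -> None:
        next_capture = None
        if num_scheduled_tokens in pending_sizes:
            # The size about to run is still pending: capture it now ("LAZY").
            pending_sizes.discard(num_scheduled_tokens)
            next_capture = num_scheduled_tokens
        elif pending_sizes:
            # Otherwise spend this step capturing some other pending size ("DELAYED").
            next_capture = pending_sizes.pop()
        if next_capture is not None:
            capture_graph(next_capture)
        run_model(num_scheduled_tokens)

    # Capture work is spread across steps instead of happening at startup.
    pending = {64, 128, 256}   # placeholder for cudagraph_capture_sizes
    for tokens in (128, 37, 37, 512):
        execute_step(pending, tokens)

After the pending set is empty, later steps pay no capture cost, which appears to be the point of the change: faster startup in exchange for capture work during the first few execution steps.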