@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """A GPU worker class."""
-import copy, time
+import copy
 import gc
 import os
+import time
 from contextlib import AbstractContextManager, nullcontext
 from typing import TYPE_CHECKING, Any, Optional
 
@@ -64,9 +65,9 @@ def __init__(
 
         # Buffers saved before sleep
        self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
-
+
         # executed cuda graph
-        self._token_compiled_cudagraphs: set[int] = set()
+        self._token_compiled_cudagraphs: set[int] = set()
 
         # Torch profiler. Enabled and configured through env vars:
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
@@ -364,20 +365,37 @@ def execute_model(
         # logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
         # self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
         # self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
-
+
+        # ATTENTION: this code is duplicated in the compile_or_warm_up_model
+        # method, so we should clean this part up before creating the vLLM PR.
+        # Warm up sizes that are not in the cudagraph capture sizes but that
+        # users still want compiled for better performance, e.g. the
+        # max-num-batched-tokens size used by chunked prefill.
+        warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
+        if not self.model_config.enforce_eager:
+            warmup_sizes = [
+                x for x in warmup_sizes if x not in
+                self.vllm_config.compilation_config.cudagraph_capture_sizes
+            ]
+
         # Just compilation with dummy run
-        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens != 0:
-            logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
-            self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
+        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in warmup_sizes and scheduler_output.total_num_scheduled_tokens != 0:
+            logger.info(
+                "DIEGO: CUDAgraph in execution time for %d input tokens",
+                scheduler_output.total_num_scheduled_tokens)
+            self._token_compiled_cudagraphs.add(
+                scheduler_output.total_num_scheduled_tokens)
             gc.freeze()
             start_time = time.perf_counter()
-            self.model_runner._dummy_run(scheduler_output.total_num_scheduled_tokens, capture_attn_cudagraph=False, skip_eplb=True)
+            self.model_runner._dummy_run(
+                scheduler_output.total_num_scheduled_tokens,
+                capture_attn_cudagraph=False,
+                skip_eplb=True)
             end_time = time.perf_counter()
             gc.unfreeze()
             elapsed_time = end_time - start_time
             logger.info("Graph capturing finished in %.3f secs", elapsed_time)
-
-
+
         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
 
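Note on the warmup-size filtering above: the new hunk narrows the configured compile sizes down to those not already covered by CUDA graph capture, since captured sizes get warmed up anyway. A minimal standalone sketch of that filter, assuming compile_sizes and cudagraph_capture_sizes are plain lists of ints as in CompilationConfig; remaining_warmup_sizes is an illustrative name, not vLLM API:

def remaining_warmup_sizes(compile_sizes: list[int],
                           cudagraph_capture_sizes: list[int],
                           enforce_eager: bool) -> list[int]:
    """Return the compile sizes that still need a compile-only warmup."""
    warmup_sizes = compile_sizes.copy()
    if not enforce_eager:
        # Sizes captured as CUDA graphs are warmed up during capture; skip them.
        warmup_sizes = [
            x for x in warmup_sizes if x not in cudagraph_capture_sizes
        ]
    return warmup_sizes

# Example: with capture sizes [1, 2, 4, 8], only 8192 (e.g. a chunked-prefill
# max-num-batched-tokens value) is left to warm up.
assert remaining_warmup_sizes([1, 2, 4, 8192], [1, 2, 4, 8], False) == [8192]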
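Note on the execute_model change: instead of compiling every warmup size at startup in compile_or_warm_up_model, the worker now compiles lazily, on the first scheduled batch whose token count matches a warmup size, and records that size in _token_compiled_cudagraphs so each size is compiled only once. A sketch of that bookkeeping; LazyCompiler and its dummy_run callback are hypothetical stand-ins for the worker and GPUModelRunner._dummy_run, not vLLM API:

import gc
import time
from typing import Callable

class LazyCompiler:
    def __init__(self, warmup_sizes: list[int],
                 dummy_run: Callable[[int], None]) -> None:
        self.warmup_sizes = set(warmup_sizes)
        self.dummy_run = dummy_run
        self.compiled: set[int] = set()  # token counts already warmed up

    def maybe_warm_up(self, num_tokens: int) -> None:
        if (num_tokens in self.warmup_sizes
                and num_tokens not in self.compiled and num_tokens != 0):
            self.compiled.add(num_tokens)
            # gc.freeze() moves all live objects to a permanent generation so
            # collections triggered during the slow warm-up pass skip them.
            gc.freeze()
            start = time.perf_counter()
            self.dummy_run(num_tokens)  # compile-only pass, no graph capture
            gc.unfreeze()
            print(f"warm-up for {num_tokens} tokens took "
                  f"{time.perf_counter() - start:.3f} secs")

One side effect of this lazy path worth flagging: the first request that hits a given warmup size pays the compilation latency at serving time, which the startup path in compile_or_warm_up_model avoids.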