Skip to content

Commit 65c9031

Browse files
isort changes
Signed-off-by: Diego-Castan <[email protected]>
1 parent 4d8bec5 commit 65c9031

File tree

1 file changed

+28
-10
lines changed

1 file changed

+28
-10
lines changed

vllm/v1/worker/gpu_worker.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33
"""A GPU worker class."""
4-
import copy, time
4+
import copy
55
import gc
66
import os
7+
import time
78
from contextlib import AbstractContextManager, nullcontext
89
from typing import TYPE_CHECKING, Any, Optional
910

@@ -64,9 +65,9 @@ def __init__(
6465

6566
# Buffers saved before sleep
6667
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
67-
68+
6869
# executed cuda graph
69-
self._token_compiled_cudagraphs: set[int] = set()
70+
self._token_compiled_cudagraphs: set[int] = set()
7071

7172
# Torch profiler. Enabled and configured through env vars:
7273
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
@@ -364,20 +365,37 @@ def execute_model(
364365
# logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
365366
# self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
366367
# self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
367-
368+
369+
# ATTENTION: This code is duplicated in compile_or_warm_up_model method
370+
# so we should clean this part before creating the vllm PR
371+
# warm up sizes that are not in cudagraph capture sizes,
372+
# but users still want to compile for better performance,
373+
# e.g. for the max-num-batched token size in chunked prefill.
374+
warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
375+
if not self.model_config.enforce_eager:
376+
warmup_sizes = [
377+
x for x in warmup_sizes if x not in
378+
self.vllm_config.compilation_config.cudagraph_capture_sizes
379+
]
380+
368381
# Just compilation with dummy run
369-
if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens != 0:
370-
logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
371-
self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
382+
if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in warmup_sizes and scheduler_output.total_num_scheduled_tokens != 0:
383+
logger.info(
384+
"DIEGO: CUDAgraph in execution time for %d input tokens",
385+
scheduler_output.total_num_scheduled_tokens)
386+
self._token_compiled_cudagraphs.add(
387+
scheduler_output.total_num_scheduled_tokens)
372388
gc.freeze()
373389
start_time = time.perf_counter()
374-
self.model_runner._dummy_run(scheduler_output.total_num_scheduled_tokens, capture_attn_cudagraph=False, skip_eplb=True)
390+
self.model_runner._dummy_run(
391+
scheduler_output.total_num_scheduled_tokens,
392+
capture_attn_cudagraph=False,
393+
skip_eplb=True)
375394
end_time = time.perf_counter()
376395
gc.unfreeze()
377396
elapsed_time = end_time - start_time
378397
logger.info("Graph capturing finished in %.3f secs", elapsed_time)
379-
380-
398+
381399
output = self.model_runner.execute_model(scheduler_output,
382400
intermediate_tensors)
383401

0 commit comments

Comments (0)