
Commit f61029c

Better logic but WIP
Signed-off-by: Diego-Castan <[email protected]>
1 parent b3fe509 commit f61029c

File tree

2 files changed (+25, -63 lines)


vllm/v1/worker/gpu_model_runner.py

Lines changed: 1 addition & 11 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import dataclasses, inspect
+import dataclasses
 import gc
 import time
 from contextlib import contextmanager
@@ -328,9 +328,6 @@ def __init__(
         if self.cache_config.kv_sharing_fast_prefill:
             self.kv_sharing_fast_prefill_logits_indices = torch.zeros(
                 self.max_num_tokens, dtype=torch.int32, device=self.device)
-
-        # DIEGO: to avoid printing all the stack of calls every single time
-        self.print_stack_calls = True

     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         """
@@ -2578,13 +2575,6 @@ def freeze_gc():
         # This usually takes 5~20 seconds.
         logger.info("Graph capturing finished in %.3f secs, took %.2f GiB",
                     elapsed_time, cuda_graph_size / (1 << 30))
-
-        # DIEGO: print all the stack of calls just the first time
-        # if self.print_stack_calls == True:
-        #     self.print_stack_calls = False
-        #     for frame_info in inspect.stack():
-        #         logger.info(f"DIEGO: File: {frame_info.filename}, Line: {frame_info.lineno}, Function: {frame_info.function}")
-

     def _initialize_single_attn_backend(
         self, kv_cache_spec: KVCacheSpec, layer_names: list[str]
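
For reference, the debug code removed above followed a simple one-shot pattern: dump the full call stack with inspect.stack() the first time graph capture finishes, guarded by a flag so later calls stay quiet. A minimal standalone sketch of that pattern (the module-level flag, logger setup, and function name are illustrative, not part of vLLM):

```python
# One-shot call-stack dump, mirroring the removed print_stack_calls guard.
import inspect
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

_stack_already_logged = False  # plays the role of (not self.print_stack_calls)


def log_call_stack_once() -> None:
    """Log every frame of the current call stack, but only on the first call."""
    global _stack_already_logged
    if _stack_already_logged:
        return
    _stack_already_logged = True
    for frame_info in inspect.stack():
        logger.info("File: %s, Line: %d, Function: %s",
                    frame_info.filename, frame_info.lineno, frame_info.function)


if __name__ == "__main__":
    log_call_stack_once()  # emits one line per stack frame
    log_call_stack_once()  # silent: the flag has been flipped
```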

vllm/v1/worker/gpu_worker.py

Lines changed: 24 additions & 52 deletions
@@ -4,7 +4,6 @@
 import copy
 import gc
 import os
-import time
 from contextlib import AbstractContextManager, nullcontext
 from typing import TYPE_CHECKING, Any, Optional

@@ -67,7 +66,7 @@ def __init__(
         self._sleep_saved_buffers: dict[str, torch.Tensor] = {}

         # executed cuda graph
-        self._token_compiled_cudagraphs: set[int] = set()
+        self._token_compiled_cudagraphs: set[int] = set(self.compilation_config.cudagraph_capture_sizes)

         # Torch profiler. Enabled and configured through env vars:
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
@@ -359,58 +358,31 @@ def execute_model(
                 get_pp_group().recv_tensor_dict(
                     all_gather_group=get_tp_group()))

-        # logger.info("DIEGO: Executing the model")
-        # Adding capture model in execution time
-        # if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs:
-        #     logger.info("DIEGO: CUDAgraph in execution time for %d input tokens", scheduler_output.total_num_scheduled_tokens)
-        #     self._token_compiled_cudagraphs.add(scheduler_output.total_num_scheduled_tokens)
-        #     self.model_runner.capture_model(scheduler_output.total_num_scheduled_tokens)
-
-        def compile_cuda_graph(input_size: int):
-            gc.freeze()
-            start_time = time.perf_counter()
-            self.model_runner._dummy_run(input_size,
-                                         capture_attn_cudagraph=False,
-                                         skip_eplb=True)
-            end_time = time.perf_counter()
-            gc.unfreeze()
-            elapsed_time = end_time - start_time
-            logger.info("Graph capturing finished in %.3f secs", elapsed_time)
-
-        # # ATTENTION: This code is duplicated in compile_or_warm_up_model method
-        # # so we should clean this part before creating the vllm PR
-        # # warm up sizes that are not in cudagraph capture sizes,
-        # # but users still want to compile for better performance,
-        # # e.g. for the max-num-batched token size in chunked prefill.
-        # warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy()
-        # logger.info("Warm up sizes %s", str(warmup_sizes))
-        # if not self.model_config.enforce_eager:
-        #     warmup_sizes = [
-        #         x for x in warmup_sizes if x not in
-        #         self.vllm_config.compilation_config.cudagraph_capture_sizes
-        #     ]
-
-        # warmup_sizes_set = set(warmup_sizes)
-
-        self.cudagraph_batch_sizes_set = set(
-            reversed(self.compilation_config.cudagraph_capture_sizes))
-        # Just compilation with dummy run
-        if scheduler_output.total_num_scheduled_tokens not in self._token_compiled_cudagraphs and scheduler_output.total_num_scheduled_tokens in self.cudagraph_batch_sizes_set and scheduler_output.total_num_scheduled_tokens != 0:
+
+        # Initialize next_capture to None
+        next_capture = None
+
+        # Check if the scheduled token count is one of the pending CUDA graph capture sizes
+        if scheduler_output.total_num_scheduled_tokens in self._token_compiled_cudagraphs:
+            # If so, select it as next_capture and remove it from _token_compiled_cudagraphs
+            next_capture = scheduler_output.total_num_scheduled_tokens
+            self._token_compiled_cudagraphs.discard(
+                scheduler_output.total_num_scheduled_tokens)
             logger.info(
                 "LAZY DIEGO: CUDAgraph in execution time for %d input tokens",
-                scheduler_output.total_num_scheduled_tokens)
-            self._token_compiled_cudagraphs.add(
-                scheduler_output.total_num_scheduled_tokens)
-            compile_cuda_graph(scheduler_output.total_num_scheduled_tokens)
-        else:
-            next_comp_set = self.cudagraph_batch_sizes_set.difference(self._token_compiled_cudagraphs)
-            if len(next_comp_set) != 0:
-                next_comp = list(next_comp_set)
-                self._token_compiled_cudagraphs.add(next_comp[0])
-                logger.info(
-                    "DELAYED DIEGO: CUDAgraph in execution time for %d input tokens",
-                    next_comp[0])
-                compile_cuda_graph(next_comp[0])
+                next_capture)
+
+        # Otherwise, check if any capture sizes are still pending in _token_compiled_cudagraphs
+        elif len(self._token_compiled_cudagraphs) > 0:
+            # If so, pop an arbitrary pending size as next_capture
+            next_capture = self._token_compiled_cudagraphs.pop()
+            logger.info(
+                "DELAYED DIEGO: CUDAgraph in execution time for %d input tokens",
+                next_capture)
+
+        # If a size was selected, have the model runner capture a CUDA graph for it
+        if next_capture:
+            self.model_runner.capture_model(next_capture)

         output = self.model_runner.execute_model(scheduler_output,
                                                  intermediate_tensors)
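
Taken together, the new execute_model logic reduces to a small scheduling rule over the set of pending capture sizes: if this step's scheduled token count is itself a pending size, capture it now (the "LAZY" path); otherwise drain one arbitrary pending size per step (the "DELAYED" path), so capture cost is spread across the first few execution steps instead of being paid up front. A standalone sketch of that rule, assuming a set that behaves like _token_compiled_cudagraphs after this change (the function pick_next_capture and the example sizes are illustrative, not vLLM API):

```python
# Illustrative model of the capture-size selection added to execute_model.
# `pending_sizes` stands in for self._token_compiled_cudagraphs, which now
# starts as set(cudagraph_capture_sizes) and shrinks as graphs get captured.
from typing import Optional


def pick_next_capture(pending_sizes: set[int],
                      num_scheduled_tokens: int) -> Optional[int]:
    """Return the batch size to capture on this step, or None if none is pending."""
    if num_scheduled_tokens in pending_sizes:
        # LAZY path: the current batch size is still pending, so capture it now.
        pending_sizes.discard(num_scheduled_tokens)
        return num_scheduled_tokens
    if pending_sizes:
        # DELAYED path: capture some other pending size, at most one per step.
        return pending_sizes.pop()
    return None


# Example: capture sizes {1, 2, 4, 8} and three steps scheduling 4, 4, 3 tokens.
pending = {1, 2, 4, 8}
for tokens in (4, 4, 3):
    size = pick_next_capture(pending, tokens)
    print(f"tokens={tokens} -> capture size {size}, still pending: {sorted(pending)}")
```

Each call returns at most one size, matching the one-capture-per-execute_model behaviour of the diff above.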
