1
1
# SPDX-License-Identifier: Apache-2.0
2
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
3
4
- import dataclasses
4
+ import dataclasses , inspect
5
5
import gc
6
6
import time
7
7
from contextlib import contextmanager
@@ -328,6 +328,9 @@ def __init__(
328
328
if self .cache_config .kv_sharing_fast_prefill :
329
329
self .kv_sharing_fast_prefill_logits_indices = torch .zeros (
330
330
self .max_num_tokens , dtype = torch .int32 , device = self .device )
331
+
332
+ # DIEGO: flag used to print the call stack only once instead of on every call
333
+ self .print_stack_calls = True
331
334
332
335
def _may_reorder_batch (self , scheduler_output : "SchedulerOutput" ) -> None :
333
336
"""
@@ -2516,7 +2519,7 @@ def profile_run(self) -> None:
2516
2519
self .encoder_cache .clear ()
2517
2520
gc .collect ()
2518
2521
2519
- def capture_model (self , specific_token_num : Optional [int ]) -> None :
2522
+ def capture_model (self , specific_token_num : Optional [int ] = None ) -> None :
2520
2523
if not self .use_cuda_graph :
2521
2524
logger .warning (
2522
2525
"Skipping CUDA graph capture. To turn on CUDA graph capture, "
@@ -2534,7 +2537,7 @@ def freeze_gc():
2534
2537
# Optimize garbage collection during CUDA graph capture.
2535
2538
# Clean up, then freeze all remaining objects from being included
2536
2539
# in future collections.
2537
- gc .collect ()
2540
+ # gc.collect()
2538
2541
should_freeze = not envs .VLLM_ENABLE_CUDAGRAPH_GC
2539
2542
if should_freeze :
2540
2543
gc .freeze ()
@@ -2573,8 +2576,15 @@ def freeze_gc():
2573
2576
elapsed_time = end_time - start_time
2574
2577
cuda_graph_size = start_free_gpu_memory - end_free_gpu_memory
2575
2578
# This usually takes 5~20 seconds.
2576
- logger .info ("Graph capturing finished in %.0f secs, took %.2f GiB" ,
2579
+ logger .info ("Graph capturing finished in %.3f secs, took %.2f GiB" ,
2577
2580
elapsed_time , cuda_graph_size / (1 << 30 ))
2581
+
2582
+ # DIEGO: print the full call stack only the first time
2583
+ # if self.print_stack_calls == True:
2584
+ # self.print_stack_calls = False
2585
+ # for frame_info in inspect.stack():
2586
+ # logger.info(f"DIEGO: File: {frame_info.filename}, Line: {frame_info.lineno}, Function: {frame_info.function}")
2587
+
2578
2588
2579
2589
def _initialize_single_attn_backend (
2580
2590
self , kv_cache_spec : KVCacheSpec , layer_names : list [str ]
0 commit comments