2 files changed: +3 -9 lines changed
@@ -40,11 +40,6 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     return int(max_shared_mem)


-def get_gpu_memory(gpu: int = 0) -> int:
-    """Returns the total memory of the GPU in bytes."""
-    return torch.cuda.get_device_properties(gpu).total_memory
-
-
 def get_cpu_memory() -> int:
     """Returns the total CPU memory of the node in bytes."""
     return psutil.virtual_memory().total
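The removed get_gpu_memory() helper only wrapped the device's total memory. It becomes redundant because the worker (second changed file, below) now calls torch.cuda.mem_get_info(), which returns both the free and the total device memory in a single driver query. A minimal sketch of the equivalence, for illustration only and assuming a CUDA-capable device:

import torch

if torch.cuda.is_available():
    gpu = 0
    # What the removed helper returned: total device memory in bytes.
    total_from_props = torch.cuda.get_device_properties(gpu).total_memory
    # What the worker queries instead: (free, total) in bytes, reported
    # directly by the CUDA driver (cudaMemGetInfo).
    free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info(gpu)
    # The two totals are expected to agree, so the standalone helper is no
    # longer needed, and the free value enables the new profiling path.
    print(total_from_props, free_gpu_memory, total_gpu_memory)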
@@ -13,7 +13,6 @@
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
-from vllm.utils import get_gpu_memory


 class Worker:
@@ -81,7 +80,6 @@ def profile_num_available_blocks(
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
-        torch.cuda.reset_peak_memory_stats()

         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
@@ -90,8 +88,9 @@ def profile_num_available_blocks(
         # Calculate the number of blocks that can be allocated with the
         # profiled peak memory.
         torch.cuda.synchronize()
-        peak_memory = torch.cuda.max_memory_allocated()
-        total_gpu_memory = get_gpu_memory()
+        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        peak_memory = total_gpu_memory - free_gpu_memory
+
         cache_block_size = CacheEngine.get_cache_block_size(
             block_size, self.model_config, self.parallel_config)
         num_gpu_blocks = int(
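Taken together, the change measures peak usage from the driver's point of view (total minus free) rather than from the PyTorch caching allocator's per-process peak statistics, which is also why the reset_peak_memory_stats() call above can go away: the driver-level number already includes memory held outside the allocator, for example by NCCL or other CUDA libraries. A rough sketch of how the profiled numbers could feed the block count follows; the gpu_memory_utilization factor and the final formula are assumptions for illustration, not taken from the diff:

import torch

def estimate_num_gpu_blocks(cache_block_size: int,
                            gpu_memory_utilization: float = 0.90) -> int:
    """Sketch: how many KV-cache blocks fit once the model's profiled
    footprint is subtracted from the allowed memory budget. Assumes the
    dummy forward pass has already run on the current CUDA device."""
    torch.cuda.synchronize()
    # Driver-level view of the device: free and total memory in bytes.
    free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
    # Everything currently resident counts toward the peak, including
    # allocations made outside the PyTorch caching allocator.
    peak_memory = total_gpu_memory - free_gpu_memory
    usable = total_gpu_memory * gpu_memory_utilization - peak_memory
    return max(int(usable // cache_block_size), 0)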