Commit fdb09c7

Authored by youkaichao
[sleep mode] save memory for on-the-fly quantization (vllm-project#24731)
Signed-off-by: youkaichao <[email protected]>
1 parent 7a1c402 commit fdb09c7
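For context, sleep mode is the vLLM feature this allocator backs: the engine releases GPU memory between workloads and restores it on demand, and this commit makes the release path also reclaim buffers left behind by on-the-fly (online) quantization. A rough usage sketch follows, assuming vLLM's documented enable_sleep_mode flag and the sleep()/wake_up() calls; the model name and arguments are illustrative, not part of this commit.

from vllm import LLM

# Hedged sketch: flag and method names as documented for vLLM sleep mode;
# treat the exact arguments as assumptions, not this commit's own test code.
llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)
llm.generate(["Hello, my name is"])
llm.sleep(level=1)   # offload weights to CPU, discard the rest of the pool;
                     # the summary log added below reports how much was freed
llm.wake_up()        # restore weights and resume serving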

File tree: 1 file changed (+31, -6 lines changed)
vllm/device_allocator/cumem.py

Lines changed: 31 additions & 6 deletions
@@ -16,8 +16,11 @@
 
 import torch
 
+from vllm.logger import init_logger
 from vllm.utils import is_pin_memory_available
 
+logger = init_logger(__name__)
+
 
 def find_loaded_library(lib_name) -> Optional[str]:
     """
@@ -165,6 +168,9 @@ def _python_malloc_callback(self, allocation_handle: HandleType) -> None:
         py_d_mem = allocation_handle[2]
         self.pointer_to_data[py_d_mem] = AllocationData(
             allocation_handle, self.current_tag)
+        logger.debug(
+            "Allocated %s bytes for %s with address %s from cumem allocator",
+            allocation_handle[1], self.current_tag, py_d_mem)
         return
 
     def _python_free_callback(self, ptr: int) -> HandleType:
@@ -174,6 +180,9 @@ def _python_free_callback(self, ptr: int) -> HandleType:
         data = self.pointer_to_data.pop(ptr)
         if data.cpu_backup_tensor is not None:
             data.cpu_backup_tensor = None
+        logger.debug(
+            "Freed %s bytes for %s with address %s from cumem allocator",
+            data.handle[1], data.tag, ptr)
         return data.handle
 
     def sleep(
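The two debug hunks above only report bookkeeping the allocator already does: each malloc callback records the allocation handle (whose element 1 is the size in bytes and element 2 the device pointer) under the current tag, and each free callback pops it again. Below is a minimal standalone sketch of that pattern, with illustrative names that are not the vLLM code.

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("cumem_sketch")

# ptr -> (handle, tag); handle[1] is the size in bytes and handle[2] the
# device pointer, mirroring how the diff above indexes allocation_handle.
pointer_to_data: dict[int, tuple] = {}

def on_malloc(handle: tuple, tag: str) -> None:
    ptr = handle[2]
    pointer_to_data[ptr] = (handle, tag)
    # %-style arguments are only formatted when DEBUG is enabled, so the new
    # log lines cost essentially nothing at the default log level.
    logger.debug("Allocated %s bytes for %s with address %s", handle[1], tag, ptr)

def on_free(ptr: int) -> tuple:
    handle, tag = pointer_to_data.pop(ptr)
    logger.debug("Freed %s bytes for %s with address %s", handle[1], tag, ptr)
    return handle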
@@ -197,9 +206,14 @@ def sleep(
 
         assert isinstance(offload_tags, tuple)
 
+        total_bytes = 0
+        backup_bytes = 0
+
         for ptr, data in self.pointer_to_data.items():
             handle = data.handle
+            total_bytes += handle[1]
             if data.tag in offload_tags:
+                backup_bytes += handle[1]
                 size_in_bytes = handle[1]
                 cpu_backup_tensor = torch.empty(
                     size_in_bytes,
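Together with the summary log added in the next hunk, the new accounting is straightforward: every allocation contributes handle[1] bytes to the total, and only allocations whose tag is in offload_tags are also counted as backed up to pinned CPU memory before their GPU pages are unmapped; everything else is discarded. A standalone sketch of the same arithmetic and message, with illustrative names rather than the vLLM code:

def summarize_sleep(allocations: list[tuple[int, str]],
                    offload_tags: tuple[str, ...]) -> str:
    # allocations is a list of (size_in_bytes, tag) pairs.
    total_bytes = sum(size for size, _ in allocations)
    backup_bytes = sum(size for size, tag in allocations if tag in offload_tags)
    gib = 1024 ** 3
    return (f"sleep freed {total_bytes / gib:.2f} GiB in total, of which "
            f"{backup_bytes / gib:.2f} GiB is backed up in CPU and the rest "
            f"{(total_bytes - backup_bytes) / gib:.2f} GiB is discarded directly.")

# e.g. 4 GiB of weights (offload-tagged) plus 2 GiB of KV cache (discarded):
print(summarize_sleep([(4 * 1024**3, "weights"), (2 * 1024**3, "kv_cache")],
                      offload_tags=("weights",)))
# -> sleep freed 6.00 GiB in total, of which 4.00 GiB is backed up in CPU
#    and the rest 2.00 GiB is discarded directly.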
@@ -211,6 +225,12 @@ def sleep(
                 data.cpu_backup_tensor = cpu_backup_tensor
             unmap_and_release(handle)
 
+        logger.info(
+            "CuMemAllocator: sleep freed %.2f GiB memory in total, of which "
+            "%.2f GiB is backed up in CPU and the rest %.2f GiB is discarded "
+            "directly.", total_bytes / 1024**3, backup_bytes / 1024**3,
+            (total_bytes - backup_bytes) / 1024**3)
+
         gc.collect()
         torch.cuda.empty_cache()
 
@@ -267,12 +287,17 @@ def use_memory_pool(self, tag: Optional[str] = None):
             # when using pluggable allocator, see
             # https://github.com/pytorch/pytorch/issues/145168 .
             # if we have some memory allocated and then freed,
-            # the memory will not be released.
-            # right now it is fine, because we only use this allocator
-            # during weight loading and kv cache creation, where we only
-            # allocate memory.
-            # TODO: we need to find a way to release the memory,
-            # i.e. calling torch.cuda.empty_cache()
+            # the memory will not be released, e.g. in online quantization,
+            # where the model is created in higher precision, and then
+            # quantized in lower precision.
+            # Find all unused allocations and manually release them.
+            # TODO: we should expose `empty_cache` method in the memory pool.
+            # TODO: ask for help from PyTorch team to expose this method.
+            allocations = data[0].snapshot()
+            for allocation in allocations:
+                if allocation["allocated_size"] == 0:
+                    handle = self._python_free_callback(allocation["address"])
+                    unmap_and_release(handle)
         self.current_tag = old_tag
 
     def get_current_usage(self) -> int:
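This last hunk is the actual memory saving for on-the-fly quantization: torch.cuda.empty_cache() errors under a pluggable allocator (the PyTorch issue linked above), so freed-but-still-mapped segments, such as the high-precision weights discarded after online quantization, would otherwise stay resident. The new code walks the memory pool's snapshot and releases every segment with no live bytes. A hedged sketch of that loop over a fabricated snapshot list, not the real pool object:

def release_unused_segments(snapshot: list[dict],
                            free_callback,
                            unmap_and_release) -> int:
    """Release every segment that holds no live tensors.

    snapshot mimics the list of dicts returned by the pool's snapshot() in
    the diff above; only the "address" and "allocated_size" keys are assumed.
    """
    released = 0
    for segment in snapshot:
        if segment["allocated_size"] == 0:
            handle = free_callback(segment["address"])   # drop the bookkeeping entry
            unmap_and_release(handle)                     # hand the pages back to CUDA
            released += 1
    return released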
