
Commit 551603f

[core] overhaul memory profiling and fix backward compatibility (#10511)
Signed-off-by: youkaichao <[email protected]>
1 parent efbce85 commit 551603f

8 files changed, +236 −60 lines changed

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+from vllm import LLM, SamplingParams
+
+
+def test_gpu_memory_utilization():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    # makes sure gpu_memory_utilization is per-instance limit,
+    # not a global limit
+    llms = [
+        LLM(model="facebook/opt-125m",
+            gpu_memory_utilization=0.3,
+            enforce_eager=True) for i in range(3)
+    ]
+    for llm in llms:
+        outputs = llm.generate(prompts, sampling_params)
+        for output in outputs:
+            prompt = output.prompt
+            generated_text = output.outputs[0].text
+            print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

tests/entrypoints/llm/test_lazy_outlines.py

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ def run_lmfe(sample_regex):
     llm = LLM(model="facebook/opt-125m",
               enforce_eager=True,
               guided_decoding_backend="lm-format-enforcer",
-              gpu_memory_utilization=0.6)
+              gpu_memory_utilization=0.3)
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
     outputs = llm.generate(
         prompts=[

tests/test_utils.py

Lines changed: 42 additions & 2 deletions

@@ -5,11 +5,13 @@
 from typing import AsyncIterator, Tuple

 import pytest
+import torch

 from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs,
-                        get_open_port, merge_async_iterators, supports_kw)
+                        get_open_port, memory_profiling, merge_async_iterators,
+                        supports_kw)

-from .utils import error_on_warning
+from .utils import error_on_warning, fork_new_process_for_each_test


 @pytest.mark.asyncio
@@ -270,3 +272,41 @@ def test_supports_kw(callable,kw_name,requires_kw_only,
                        requires_kw_only=requires_kw_only,
                        allow_var_kwargs=allow_var_kwargs
                        ) == is_supported
+
+
+@fork_new_process_for_each_test
+def test_memory_profiling():
+    # Fake out some model loading + inference memory usage to test profiling
+    # Memory used by other processes will show up as cuda usage outside of torch
+    from vllm.distributed.device_communicators.cuda_wrapper import (
+        CudaRTLibrary)
+    lib = CudaRTLibrary()
+    # 512 MiB allocation outside of this instance
+    handle1 = lib.cudaMalloc(512 * 1024 * 1024)
+
+    baseline_memory_in_bytes = \
+        torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]
+
+    # load weights
+
+    weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)
+
+    weights_memory_in_bytes = 128 * 1024 * 1024 * 4  # 512 MiB
+
+    with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes,
+                          weights_memory_in_bytes=weights_memory_in_bytes) as result:
+        # make a memory spike, 1 GiB
+        spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
+        del spike
+
+        # Add some extra non-torch memory 256 MiB (simulate NCCL)
+        handle2 = lib.cudaMalloc(256 * 1024 * 1024)
+
+    # Check that the memory usage is within 5% of the expected values
+    non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024)  # noqa
+    torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024)  # noqa
+    assert abs(non_torch_ratio - 1) <= 0.05
+    assert abs(torch_peak_ratio - 1) <= 0.05
+    del weights
+    lib.cudaFree(handle1)
+    lib.cudaFree(handle2)

tests/worker/test_profile.py

Lines changed: 9 additions & 9 deletions

@@ -31,10 +31,6 @@ def test_gpu_memory_profiling():
         is_driver_worker=True,
     )

-    # Load the model so we can profile it
-    worker.init_device()
-    worker.load_model()
-
     # Set 10GiB as the total gpu ram to be device-agnostic
     def mock_mem_info():
         current_usage = torch.cuda.memory_stats(
@@ -46,20 +42,24 @@ def mock_mem_info():

     from unittest.mock import patch
     with patch("torch.cuda.mem_get_info", side_effect=mock_mem_info):
+        # Load the model so we can profile it
+        worker.init_device()
+        worker.load_model()
         gpu_blocks, _ = worker.determine_num_available_blocks()

-    # Peak vram usage by torch should be 0.7077 GiB
+    # Peak vram usage by torch should be 0.47 GiB
+    # Model weights take 0.25 GiB
     # No memory should be allocated outside of torch
     # 9.0 GiB should be the utilization target
-    # 8.2923 GiB should be available for the KV cache
+    # 8.28 GiB should be available for the KV cache
     block_size = CacheEngine.get_cache_block_size(
         engine_config.cache_config, engine_config.model_config,
         engine_config.parallel_config)

-    expected_blocks = (8.2923 * 1024**3) // block_size
+    expected_blocks = (8.28 * 1024**3) // block_size

     # Check within a small tolerance for portability
     # Hardware, kernel, or dependency changes could all affect memory
     # utilization.
-    # A 10 block tolerance here should be about 6MB of wiggle room.
-    assert abs(gpu_blocks - expected_blocks) < 10
+    # A 100 block tolerance here should be about 60MB of wiggle room.
+    assert abs(gpu_blocks - expected_blocks) < 100
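
For reference, here is how the numbers in the updated comments fit together. This is an illustrative check only, not part of the commit; it reads the test as treating the 0.47 GiB torch peak as an increase on top of the 0.25 GiB of weights, which matches how the new profiling helper accounts for memory:

    # arithmetic behind the expected block budget in the test above
    total_gpu_gib = 10.0      # mocked total GPU memory
    utilization_target = 0.9  # gpu_memory_utilization
    weights_gib = 0.25        # model weights
    torch_peak_gib = 0.47     # peak torch increase during profiling

    kv_cache_gib = (total_gpu_gib * utilization_target
                    - weights_gib - torch_peak_gib)
    assert round(kv_cache_gib, 2) == 8.28  # the value used for expected_blocks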

vllm/engine/arg_utils.py

Lines changed: 6 additions & 5 deletions

@@ -487,11 +487,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
            help='The fraction of GPU memory to be used for the model '
            'executor, which can range from 0 to 1. For example, a value of '
            '0.5 would imply 50%% GPU memory utilization. If unspecified, '
-           'will use the default value of 0.9. This is a global gpu memory '
-           'utilization limit, for example if 50%% of the gpu memory is '
-           'already used before vLLM starts and --gpu-memory-utilization is '
-           'set to 0.9, then only 40%% of the gpu memory will be allocated '
-           'to the model executor.')
+           'will use the default value of 0.9. This is a per-instance '
+           'limit, and only applies to the current vLLM instance. '
+           'It does not matter if you have another vLLM instance running '
+           'on the same GPU. For example, if you have two vLLM instances '
+           'running on the same GPU, you can set the GPU memory utilization '
+           'to 0.5 for each instance.')
        parser.add_argument(
            '--num-gpu-blocks-override',
            type=int,
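
To illustrate the per-instance semantics described in the new help text, here is a minimal sketch (not part of the commit) that mirrors the 0.5/0.5 example from the help string; the model name and enforce_eager flag are just convenient placeholders:

    from vllm import LLM

    # Two instances sharing one GPU: each is capped at half of the device,
    # independent of what the other instance (or any other process) uses.
    llm_a = LLM(model="facebook/opt-125m",
                gpu_memory_utilization=0.5,
                enforce_eager=True)
    llm_b = LLM(model="facebook/opt-125m",
                gpu_memory_utilization=0.5,
                enforce_eager=True)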

vllm/utils.py

Lines changed: 123 additions & 2 deletions

@@ -23,10 +23,12 @@
 from asyncio import FIRST_COMPLETED, AbstractEventLoop, Future, Task
 from collections import UserDict, defaultdict
 from collections.abc import Iterable, Mapping
+from dataclasses import dataclass, field
 from functools import lru_cache, partial, wraps
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
-                    Dict, Generic, Hashable, List, Literal, Optional,
-                    OrderedDict, Set, Tuple, Type, TypeVar, Union, overload)
+                    Dict, Generator, Generic, Hashable, List, Literal,
+                    Optional, OrderedDict, Set, Tuple, Type, TypeVar, Union,
+                    overload)
 from uuid import uuid4

 import numpy as np
@@ -1664,3 +1666,122 @@ def kill_process_tree(pid: int):
     # Finally kill the parent
     with contextlib.suppress(ProcessLookupError):
         os.kill(pid, signal.SIGKILL)
+
+
+@dataclass
+class MemorySnapshot:
+    """Memory snapshot."""
+    torch_peak_in_bytes: int = 0
+    torch_memory_in_bytes: int = 0
+    timestamp: float = 0.0
+
+    def measure(self):
+        self.torch_peak_in_bytes = torch.cuda.memory_stats(
+        )["allocated_bytes.all.peak"]
+        self.torch_memory_in_bytes = torch.cuda.memory_stats(
+        )["allocated_bytes.all.current"]
+        self.timestamp = time.time()
+
+    def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
+        """support a - b"""
+        return MemorySnapshot(
+            torch_peak_in_bytes=self.torch_peak_in_bytes -
+            other.torch_peak_in_bytes,
+            torch_memory_in_bytes=self.torch_memory_in_bytes -
+            other.torch_memory_in_bytes,
+            timestamp=self.timestamp - other.timestamp)
+
+
+@dataclass
+class MemoryProfilingResult:
+    """Memory profiling result."""
+    baseline_memory_in_bytes: int = 0
+    non_kv_cache_memory_in_bytes: int = 0
+    torch_peak_increase_in_bytes: int = 0
+    non_torch_increase_in_bytes: int = 0
+    weights_memory_in_bytes: float = 0
+    before_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
+    after_profile: MemorySnapshot = field(default_factory=MemorySnapshot)
+    profile_time: float = 0.0
+
+
+@contextlib.contextmanager
+def memory_profiling(
+    baseline_memory_in_bytes: int, weights_memory_in_bytes: int
+) -> Generator[MemoryProfilingResult, None, None]:
+    """Memory profiling context manager.
+
+    baseline_memory_in_bytes: memory used by all components other than the
+        current vLLM instance. It includes memory used by other processes,
+        memory used by another vLLM instance in the same process, etc. It is
+        usually measured before the current vLLM instance initializes the
+        device, and we assume it stays constant while the current vLLM
+        instance is profiled.
+    weights_memory_in_bytes: memory used by PyTorch when loading the model
+        weights. Note that, before loading the model weights, we also
+        initialize the device and the distributed environment, which may
+        consume some memory. That part is not included in
+        weights_memory_in_bytes because PyTorch does not control it.
+
+    The memory on one GPU can be classified into 3 categories:
+    1. memory used by anything other than the current vLLM instance.
+    2. memory used by torch in the current vLLM instance.
+    3. memory used in the current vLLM instance, but not by torch.
+
+    A quantitative example:
+
+    Before creating the current vLLM instance:
+        category 1: 1 GiB
+        category 2: 0 GiB
+        category 3: 0 GiB
+
+    After creating the current vLLM instance and loading the model
+    (i.e. before profiling):
+        category 1: 1 GiB
+        category 2: 2 GiB (model weights take 2 GiB)
+        category 3: 0.5 GiB (memory used by NCCL)
+
+    During profiling (peak):
+        category 1: 1 GiB
+        category 2: 4 GiB (peak activation tensors take 2 GiB)
+        category 3: 1 GiB (memory used by NCCL + buffers for some attention backends)
+
+    After profiling:
+        category 1: 1 GiB
+        category 2: 3 GiB (after garbage-collecting activation tensors)
+        category 3: 1 GiB (memory used by NCCL + buffers for some attention backends)
+
+    In this case, non-KV-cache memory takes 5 GiB in total:
+    a. 2 GiB used by the model weights (category 2)
+    b. 2 GiB reserved for the peak activation tensors (category 2)
+    c. 1 GiB used by non-torch components (category 3)
+
+    The memory used for loading weights (a.) is given directly by the
+    argument `weights_memory_in_bytes`.
+
+    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]`
+    after profiling gives (b.).
+
+    (c.) is trickier: we measure the total memory used on this GPU
+    (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), then
+    subtract the baseline memory, the memory used by the model weights, and
+    the diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`.
+    """ # noqa
+    torch.cuda.reset_peak_memory_stats()
+
+    result = MemoryProfilingResult()
+
+    result.baseline_memory_in_bytes = baseline_memory_in_bytes
+    # the part of memory used for holding the model weights
+    result.weights_memory_in_bytes = weights_memory_in_bytes
+
+    result.before_profile.measure()
+
+    yield result
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    result.after_profile.measure()
+
+    diff = result.after_profile - result.before_profile
+    result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes
+    current_cuda_memory_bytes = torch.cuda.mem_get_info(
+    )[1] - torch.cuda.mem_get_info()[0]
+    result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes  # noqa
+    result.profile_time = diff.timestamp
+    result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes  # noqa
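
A minimal usage sketch of the new helper (not code from this commit): measure a baseline before allocating anything, track weight memory separately, run the profiling workload inside the context manager, and derive a KV-cache budget from the per-instance utilization target. The toy tensors and the final budget formula are illustrative assumptions, not the worker's actual logic:

    import torch

    from vllm.utils import memory_profiling

    gpu_memory_utilization = 0.9  # per-instance fraction of the GPU

    free, total = torch.cuda.mem_get_info()
    baseline_memory_in_bytes = total - free  # everything already on the GPU

    # stand-in for loading real model weights (64 MiB of float32)
    weights = torch.randn(16, 1024, 1024, device="cuda")
    weights_memory_in_bytes = weights.numel() * weights.element_size()

    with memory_profiling(
            baseline_memory_in_bytes=baseline_memory_in_bytes,
            weights_memory_in_bytes=weights_memory_in_bytes) as result:
        # stand-in for the dummy profiling forward pass
        activations = torch.randn(32, 1024, 1024, device="cuda")
        del activations

    kv_cache_budget = (total * gpu_memory_utilization
                       - result.non_kv_cache_memory_in_bytes)
    print(f"KV cache budget: {kv_cache_budget / 1024**3:.2f} GiB")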

vllm/worker/multi_step_model_runner.py

Lines changed: 2 additions & 1 deletion

@@ -645,7 +645,8 @@ def _advance_step(self, model_input: StatefulModelInput,
         return model_input

     def load_model(self) -> None:
-        return self._base_model_runner.load_model()
+        self._base_model_runner.load_model()
+        self.model_memory_usage = self._base_model_runner.model_memory_usage

     def save_sharded_state(
         self,

0 commit comments

Comments
 (0)