
Commit 251d173

committed: merge ace w/o merged_prefill_for_v1 changes
1 parent 5103668 commit 251d173

31 files changed: +2502 -26 lines

examples/lmcache/README.md

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# LMCache Examples

This folder demonstrates how to use LMCache for disaggregated prefill, CPU offloading, and KV cache sharing.

## 1. Disaggregated Prefill in vLLM v1

This example demonstrates how to run LMCache with disaggregated prefill using NIXL on a single node.

### Prerequisites

- Install [LMCache](https://github.com/LMCache/LMCache). You can simply run `pip install lmcache`.
- Install [NIXL](https://github.com/ai-dynamo/nixl).
- At least 2 GPUs.
- A valid Hugging Face token (`HF_TOKEN`) for Llama 3.1 8B Instruct.

### Usage

Change into the `disagg_prefill_lmcache_v1` folder and run

```bash
bash disagg_example_nixl.sh
```

to launch disaggregated prefill and benchmark the performance.
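
Once the prefiller, decoder, and proxy are all up, you can also send a request to the proxy yourself. The snippet below is only a minimal sketch: the proxy port (9000) and the model name are assumptions, so check `disagg_vllm_launcher.sh` and `proxy.log` for the values your run actually uses.

```python
# Hypothetical client request to the proxy. The port and model name are
# assumptions -- adjust them to match what the launcher actually started.
import requests

response = requests.post(
    "http://localhost:9000/v1/completions",
    json={
        "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
        "prompt": "San Francisco is a",
        "max_tokens": 32,
    },
)
print(response.json())
```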

### Components

#### Server Scripts

- `disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh` - Launches the individual vLLM servers for prefill and decode, and also launches the proxy server.
- `disagg_prefill_lmcache_v1/disagg_proxy_server.py` - FastAPI proxy server that coordinates between the prefiller and the decoder (see the sketch below).
- `disagg_prefill_lmcache_v1/disagg_example_nixl.sh` - Main script to run the example.
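
The proxy's job is to send each incoming request to the prefiller first, so that the KV cache is computed and handed off (via LMCache/NIXL), and then forward the same request to the decoder, which reuses that cache to generate the actual output. The code below is only a minimal sketch of this coordination pattern, not the actual `disagg_proxy_server.py`; the ports, endpoint path, and the `max_tokens=1` prefill trick are assumptions.

```python
# Minimal sketch of a prefill/decode proxy (NOT the actual
# disagg_proxy_server.py). Ports and endpoint paths are assumptions.
import httpx
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse

app = FastAPI()
PREFILLER = "http://localhost:8100"  # assumed prefiller vLLM server
DECODER = "http://localhost:8200"    # assumed decoder vLLM server


@app.post("/v1/completions")
async def proxy_completions(request: Request):
    body = await request.json()

    # 1. Ask the prefiller to process the prompt with max_tokens=1, so it
    #    only computes the KV cache and hands it off through LMCache/NIXL.
    async with httpx.AsyncClient(timeout=None) as client:
        await client.post(f"{PREFILLER}/v1/completions",
                          json=dict(body, max_tokens=1))

    # 2. Forward the original request to the decoder, which reuses the KV
    #    cache and streams the real completion back to the client.
    async def stream_decode():
        async with httpx.AsyncClient(timeout=None) as client:
            async with client.stream("POST", f"{DECODER}/v1/completions",
                                     json=body) as resp:
                async for chunk in resp.aiter_bytes():
                    yield chunk

    return StreamingResponse(stream_decode())


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=9000)  # assumed proxy port
```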

#### Configuration

- `disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml` - Configuration for the prefiller server
- `disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml` - Configuration for the decoder server

#### Log Files

The main script generates several log files:

- `prefiller.log` - Logs from the prefill server
- `decoder.log` - Logs from the decode server
- `proxy.log` - Logs from the proxy server

## 2. CPU Offload Examples

- `python cpu_offload_lmcache.py -v v0` - CPU offloading implementation for vLLM v0
- `python cpu_offload_lmcache.py -v v1` - CPU offloading implementation for vLLM v1

## 3. KV Cache Sharing

The `kv_cache_sharing_lmcache_v1.py` example demonstrates how to share KV caches between vLLM v1 instances (see the sketch below).
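
In that example, every vLLM instance is configured with the LMCache connector in `kv_both` mode, so each instance can both store KV caches into and retrieve them from a shared LMCache backend; the script also launches a standalone LMCache server (`python -m lmcache.experimental.server`) that the instances connect to. The snippet below sketches one such instance only. It is not the full script: the values mirror the diff recorded in `examples/lmcache/a`, and the remote-backend environment variable is an assumption, so refer to the actual file for the exact setup.

```python
# Minimal sketch of ONE instance in the KV cache sharing example (the real
# script runs a store process and a retrieve process on separate GPUs).
# LMCACHE_REMOTE_URL is an assumption about how the shared backend is wired.
import os
import subprocess
import time

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig

port = 8100  # illustrative port for the shared LMCache server

# Enable LMCache's experimental code path and point it at the shared backend.
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
os.environ["LMCACHE_CHUNK_SIZE"] = "256"
os.environ["LMCACHE_REMOTE_URL"] = f"lm://localhost:{port}"  # assumption

# Start the standalone LMCache server that all instances share.
server_proc = subprocess.Popen([
    "python", "-m", "lmcache.experimental.server", "localhost", str(port)
])
time.sleep(5)  # give the server a moment to come up

# kv_role "kv_both" lets the instance both produce and consume shared caches.
ktc = KVTransferConfig.from_cli(
    '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')

os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # a second instance would use GPU 1
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
          kv_transfer_config=ktc,
          max_model_len=8000,
          gpu_memory_utilization=0.8,
          enforce_eager=True)

outputs = llm.generate(["Hello, how are you?" * 1000],
                       SamplingParams(temperature=0, top_p=0.95, max_tokens=10))
print(outputs[0].outputs[0].text)

server_proc.terminate()
```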

## 4. Disaggregated Prefill in vLLM v0

The `disaggregated_prefill_lmcache_v0.py` script provides an example of how to run disaggregated prefill in vLLM v0.

examples/lmcache/a

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
17a18
> from lmcache.experimental.cache_engine import LMCacheEngineBuilder
19d19
< from lmcache.v1.cache_engine import LMCacheEngineBuilder
26a27,28
> # Use experimental features in LMCache
> os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
39,42d40
< MODEL = "/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/"
< #prompts = [
< #    "Hello, how are you?" * 1000,
< #]
44c42
<     "San Francisco is a",
---
>     "Hello, how are you?" * 1000,
49a48,49
>     os.environ["CUDA_VISIBLE_DEVICES"] = "0"
>
53c53
<         '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_producer"}')
---
>         '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
56c56
<     llm = LLM(model=MODEL,
---
>     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
60,61c60
<               tensor_parallel_size=2,
<               enforce_eager=False)
---
>               enforce_eager=True)
66c65
<         print(f"Producer Generated text: {generated_text!r}")
---
>         print(f"Generated text: {generated_text!r}")
76,77c75,78
<     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=20)
<     # sampling_params = SamplingParams(temperature=0, max_tokens=100)
---
>     os.environ["CUDA_VISIBLE_DEVICES"] = "1"
>
>     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
>
79c80
<         '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_consumer"}')
---
>         '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
82c83
<     llm = LLM(model=MODEL,
---
>     llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
86,87c87
<               tensor_parallel_size=2,
<               enforce_eager=False)
---
>               enforce_eager=True)
96c96
<         print(f"Consumer Generated text: {generated_text!r}")
---
>         print(f"Generated text: {generated_text!r}")
103,105c103,106
<     server_proc = subprocess.Popen(
<         ["python", "-m", "lmcache.v1.server", "localhost",
<          str(port)])
---
>     server_proc = subprocess.Popen([
>         "python", "-m", "lmcache.experimental.server", "localhost",
>         str(port)
>     ])
113d113
<
115c115
<     print("libin kvshare store start")
---
>
119d118
<     print("libin kvshare retrieve start")
122,124c121
<     print("libin kvshare retrieve done")
<     store_process.join()
<     retrieve_process.join()
---
>
125a123
>     store_process.join()
Lines changed: 151 additions & 0 deletions
@@ -0,0 +1,151 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of cpu offloading
with LMCache in vLLM v1 or v0.

Usage:

    Specify vLLM version

    -v v0 : Use LMCacheConnector
            model = mistralai/Mistral-7B-Instruct-v0.2
            (Includes enable_chunked_prefill = True)

    -v v1 : Use LMCacheConnectorV1 (default)
            model = meta-llama/Meta-Llama-3.1-8B-Instruct
            (Without enable_chunked_prefill)

Note that `lmcache` is needed to run this example.
Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import argparse
import contextlib
import os
import time
from dataclasses import asdict

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
from vllm.engine.arg_utils import EngineArgs


def setup_environment_variables():
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
    # LMCache is set to use 256 tokens per chunk
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    # Enable local CPU backend in LMCache
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"


@contextlib.contextmanager
def build_llm_with_lmcache(lmcache_connector: str, model: str,
                           vllm_version: str):
    ktc = KVTransferConfig(
        kv_connector=lmcache_connector,
        kv_role="kv_both",
    )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
    if vllm_version == "v0":
        llm_args = EngineArgs(
            model=model,
            kv_transfer_config=ktc,
            max_model_len=8000,
            gpu_memory_utilization=0.8,
            enable_chunked_prefill=True,  # Only in v0
        )
    else:
        llm_args = EngineArgs(
            model=model,
            kv_transfer_config=ktc,
            max_model_len=8000,
            gpu_memory_utilization=0.8,
        )

    llm = LLM(**asdict(llm_args))
    try:
        yield llm
    finally:
        # Clean up lmcache backend
        LMCacheEngineBuilder.destroy(ENGINE_NAME)


def print_output(
    llm: LLM,
    prompt: list[str],
    sampling_params: SamplingParams,
    req_str: str,
):
    # Should be able to see logs like the following:
    # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
    # This indicates that the KV cache has been stored in LMCache.
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
    print(f"Generation took {time.time() - start:.2f} seconds, "
          f"{req_str} request done.")
    print("-" * 50)


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-v",
                        "--version",
                        choices=["v0", "v1"],
                        default="v1",
                        help="Specify vLLM version (default: v1)")
    return parser.parse_args()


def main():
    args = parse_args()

    if args.version == "v0":
        lmcache_connector = "LMCacheConnector"
        model = "mistralai/Mistral-7B-Instruct-v0.2"
    else:
        lmcache_connector = "LMCacheConnectorV1"
        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"

    setup_environment_variables()

    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:

        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
        shared_prompt = "Hello, how are you?" * 1000
        first_prompt = [
            shared_prompt + "Hello, my name is",
        ]
        second_prompt = [
            shared_prompt + "Tell me a very long story",
        ]

        sampling_params = SamplingParams(temperature=0,
                                         top_p=0.95,
                                         max_tokens=10)

        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")

        time.sleep(1)

        # Print the second output
        print_output(llm, second_prompt, sampling_params, "second")


if __name__ == "__main__":
    main()
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,100 @@
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of cpu offloading
with LMCache.

Note that `lmcache` is needed to run this example.
Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
"""
import contextlib
import os
import time

from lmcache.experimental.cache_engine import LMCacheEngineBuilder
from lmcache.integration.vllm.utils import ENGINE_NAME

from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig


def setup_environment_variables():
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
    # LMCache is set to use 256 tokens per chunk
    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
    # Enable local CPU backend in LMCache
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"


@contextlib.contextmanager
def build_llm_with_lmcache():
    ktc = KVTransferConfig.from_cli(
        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
              kv_transfer_config=ktc,
              max_model_len=8000,
              enable_chunked_prefill=True,
              gpu_memory_utilization=0.8)

    try:
        yield llm
    finally:
        # Clean up lmcache backend
        LMCacheEngineBuilder.destroy(ENGINE_NAME)


def print_output(
    llm: LLM,
    prompt: list[str],
    sampling_params: SamplingParams,
    req_str: str,
):
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
    for output in outputs:
        generated_text = output.outputs[0].text
        print(f"Generated text: {generated_text!r}")
    print(f"Generation took {time.time() - start:.2f} seconds, "
          f"{req_str} request done.")
    print("-" * 50)


def main():
    setup_environment_variables()

    with build_llm_with_lmcache() as llm:

        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
        shared_prompt = "Hello, how are you?" * 1000
        first_prompt = [
            shared_prompt + "Hello, my name is",
        ]
        second_prompt = [
            shared_prompt + "Tell me a very long story",
        ]

        sampling_params = SamplingParams(temperature=0,
                                         top_p=0.95,
                                         max_tokens=10)

        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")

        time.sleep(1)

        # Print the second output
        print_output(llm, second_prompt, sampling_params, "second")


if __name__ == "__main__":
    main()
