Skip to content

Commit c814ddb

Browse files
ksuma2109 (Suma Kasa) and co-authors authored
Add tests for LMCache - functionality and integration tests (#2942)
Co-authored-by: Suma Kasa <sumakasa@amazon.com>
1 parent 444b0ef commit c814ddb

File tree

7 files changed

+230
-3
lines changed

7 files changed

+230
-3
lines changed

.github/workflows/integration.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,12 @@ jobs:
203203
- test: TestStatefulModel_g6
204204
instance: g6
205205
failure-prefix: lmi
206+
- test: TestVllmLmcache_g6
207+
instance: g6
208+
failure-prefix: lmi
209+
- test: TestVllmLmcachePerformance_g6
210+
instance: g6
211+
failure-prefix: lmi
206212
# P4D instance tests
207213
- test: TestVllm_p4d
208214
instance: p4d

serving/docker/lmi-container-requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,5 @@ peft
3434
llmcompressor
3535
https://vllm-wheels.s3.us-west-2.amazonaws.com/d3ab240f39219df0175ec662416f630d7bf273d8/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
3636
xgrammar
37-
flashinfer-python==0.4.1
37+
flashinfer-python==0.4.1
38+
lmcache

tests/integration/llm/client.py

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import math
99
import json
1010
import shutil
11+
import time
1112
from random import randrange
1213
import numpy as np
1314
from datetime import datetime
@@ -239,6 +240,21 @@ def get_model_name():
239240
"seq_length": [256],
240241
"tokenizer": "MiniMaxAI/MiniMax-M2",
241242
},
243+
"llama3-8b-lmcache-cpu": {
244+
"batch_size": [1, 4],
245+
"seq_length": [256],
246+
"tokenizer": "TheBloke/Llama-3-8B-fp16"
247+
},
248+
"llama3-8b-lmcache-local-storage": {
249+
"batch_size": [1, 4],
250+
"seq_length": [256],
251+
"tokenizer": "TheBloke/Llama-3-8B-fp16"
252+
},
253+
"llama3-8b-no-lmcache": {
254+
"batch_size": [1, 4],
255+
"seq_length": [256],
256+
"tokenizer": "TheBloke/Llama-3-8B-fp16"
257+
},
242258
}
243259

244260
vllm_neo_model_spec = {
@@ -1786,6 +1802,91 @@ def test_text_embedding_model(model, model_spec):
17861802
awscurl_run(req, spec.get("tokenizer"), batch_size)
17871803

17881804

1805+
def test_handler_lmcache(model, model_spec, is_baseline):
    """Smoke-test LMCache prefix reuse for one model.

    Sends two requests that share a long common prefix and logs the
    latency of each plus the warmup/test speedup ratio. No assertion is
    made on the ratio; the check is that both requests complete.

    Args:
        model: model key expected to exist in ``model_spec``.
        model_spec: mapping of model key -> spec dict.
        is_baseline: accepted for caller compatibility; currently unused.
    """
    modelspec_checker(model, model_spec)
    spec = model_spec[args.model]
    if "worker" in spec:
        check_worker_number(spec["worker"])

    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
share common prefixes or context, as the cached states can be reused across different queries."""
    shared_prefix = " ".join([paragraph] * 40)
    gen_params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}

    def _timed_request(prompt_suffix):
        # Send a single request and return its wall-clock latency in seconds.
        req = {
            "inputs": shared_prefix + prompt_suffix,
            "parameters": gen_params
        }
        begin = time.time()
        send_json(req)
        return time.time() - begin

    LOGGER.info("Warmup: Populating cache with shared prefix")
    warmup_time = _timed_request(" Warmup?")
    # Give the cache backend a moment to persist the warmup entries.
    time.sleep(1)

    LOGGER.info("Test: Sending request with shared prefix")
    test_time = _timed_request(" Test query?")

    # Guard against a zero-duration test request when computing the ratio.
    speedup = warmup_time / test_time if test_time > 0 else 0
    LOGGER.info(
        f"Warmup time: {warmup_time:.2f}s, Test time: {test_time:.2f}s, Speedup: {speedup:.2f}x"
    )
1840+
1841+
1842+
def test_handler_lmcache_performance(model, model_spec):
    """Measure serving latency under concurrent shared-prefix load.

    Warms the KV cache with one request, then fires 100 concurrent
    requests that reuse the same long prefix via ``awscurl_run`` and
    logs P50/P90 time-to-first-byte and TPS from ``benchmark.json``.

    Args:
        model: model key expected to exist in ``model_spec``.
        model_spec: mapping of model key -> spec dict.
    """
    modelspec_checker(model, model_spec)
    spec = model_spec[args.model]
    if "worker" in spec:
        check_worker_number(spec["worker"])

    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
share common prefixes or context, as the cached states can be reused across different queries."""
    shared_prefix = " ".join([paragraph] * 40)
    gen_params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}
    concurrency = 100

    # Single warmup request populates the cache before the measured run.
    LOGGER.info("Warmup: Populating LMCache with shared prefix")
    send_json({
        "inputs": shared_prefix + " Warmup?",
        "parameters": gen_params
    })
    time.sleep(2)

    # Each request shares the long prefix but has a distinct suffix.
    shared_reqs = []
    for i in range(concurrency):
        shared_reqs.append({
            "inputs": shared_prefix + f" Query {i}?",
            "parameters": gen_params
        })

    LOGGER.info(
        f"Performance test: {concurrency} concurrent requests with shared prefix"
    )
    awscurl_run(shared_reqs,
                spec.get("tokenizer"),
                concurrency=concurrency,
                num_run=1,
                json_results=True,
                dataset=True)

    # awscurl_run writes aggregate latency metrics to benchmark.json.
    with open("benchmark.json") as f:
        metrics = json.load(f)
    p50_ttft = metrics["p50TimeToFirstByte"]
    p90_ttft = metrics["p90TimeToFirstByte"]
    tps = metrics["tps"]
    LOGGER.info(
        f"Results: P50 TTFT={p50_ttft:.2f}ms, P90 TTFT={p90_ttft:.2f}ms, TPS={tps:.2f}"
    )
1888+
1889+
17891890
def test_handler_stateful(model, model_spec):
17901891
if model not in model_spec:
17911892
raise ValueError(
@@ -1899,6 +2000,10 @@ def run(raw_args):
18992000
test_handler_rolling_batch_chat(args.model, vllm_chat_model_spec)
19002001
elif args.handler == "vllm_tool":
19012002
test_handler_rolling_batch_tool(args.model, vllm_tool_model_spec)
2003+
elif args.handler == "vllm_lmcache":
2004+
test_handler_lmcache(args.model, vllm_model_spec, False)
2005+
elif args.handler == "vllm_lmcache_performance":
2006+
test_handler_lmcache_performance(args.model, vllm_model_spec)
19022007
elif args.handler == "vllm_neo":
19032008
test_handler_rolling_batch(args.model, vllm_neo_model_spec)
19042009
elif args.handler == "handler_performance":

tests/integration/llm/prepare.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,6 +468,35 @@
468468
"option.max_model_len": 16384,
469469
"option.gpu_memory_utilization": "0.9",
470470
},
471+
"llama3-8b-lmcache-cpu": {
472+
"option.model_id":
473+
"s3://djl-llm/llama-3-8b-instruct-hf/",
474+
"option.tensor_parallel_degree":
475+
4,
476+
"lmcache_config_file":
477+
"lmcache_cpu.yaml",
478+
"option.kv_transfer_config":
479+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
480+
},
481+
"llama3-8b-lmcache-local-storage": {
482+
"option.model_id":
483+
"s3://djl-llm/llama-3-8b-instruct-hf/",
484+
"option.tensor_parallel_degree":
485+
4,
486+
"lmcache_config_file":
487+
"lmcache_local_storage.yaml",
488+
"option.kv_transfer_config":
489+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
490+
},
491+
"llama3-8b-lmcache-missing-role": {
492+
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
493+
"option.tensor_parallel_degree": 4,
494+
"option.kv_transfer_config": '{"kv_connector":"LMCacheConnectorV1"}',
495+
},
496+
"llama3-8b-no-lmcache": {
497+
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
498+
"option.tensor_parallel_degree": 4,
499+
},
471500
}
472501

473502
vllm_neo_model_list = {
@@ -799,11 +828,19 @@
799828
def write_model_artifacts(properties,
800829
requirements=None,
801830
adapter_ids=[],
802-
adapter_names=[]):
831+
adapter_names=[],
832+
lmcache_config_file=None):
803833
model_path = "models/test"
804834
if os.path.exists(model_path):
805835
shutil.rmtree(model_path)
806836
os.makedirs(model_path, exist_ok=True)
837+
838+
if lmcache_config_file:
839+
source_config = os.path.join("lmcache_configs", lmcache_config_file)
840+
dest_config = os.path.join(model_path, lmcache_config_file)
841+
if os.path.exists(source_config):
842+
shutil.copy2(source_config, dest_config)
843+
807844
with open(os.path.join(model_path, "serving.properties"), "w") as f:
808845
for key, value in properties.items():
809846
f.write(f"{key}={value}\n")
@@ -935,9 +972,14 @@ def build_vllm_async_model(model):
935972
adapter_ids = options.pop("adapter_ids", [])
936973
adapter_names = options.pop("adapter_names", [])
937974

975+
lmcache_config_file = options.pop("lmcache_config_file", None)
976+
if lmcache_config_file:
977+
options["option.lmcache_config_file"] = lmcache_config_file
978+
938979
write_model_artifacts(options,
939980
adapter_ids=adapter_ids,
940-
adapter_names=adapter_names)
981+
adapter_names=adapter_names,
982+
lmcache_config_file=lmcache_config_file)
941983

942984

943985
def build_vllm_async_model_with_custom_handler(model, handler_type="success"):
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
# 256 Tokens per KV Chunk
chunk_size: 256
# Enable CPU memory backend
local_cpu: true  # default
# 5GB of Pinned CPU memory
max_local_cpu_size: 5.0  # default
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
# 256 Tokens per KV Chunk
chunk_size: 256
# Enable Disk backend
local_disk: "file:///tmp/lmcache/"
# 5GB of Disk memory
max_local_disk_size: 5.0

# Disable page cache
# This should be turned on for better performance if most local CPU memory is used
extra_config: {'use_odirect': True}

tests/integration/tests.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -637,6 +637,63 @@ def test_custom_formatter_load_error(self):
637637
r.launch()
638638

639639

640+
@pytest.mark.vllm
@pytest.mark.gpu_4
class TestVllmLmcache_g6:
    """Functional tests for vLLM + LMCache backends on a g6 instance."""

    def test_lmcache_cpu(self):
        # CPU-memory cache backend; config file is staged under the model dir.
        model = 'llama3-8b-lmcache-cpu'
        with Runner('lmi', model) as r:
            prepare.build_vllm_async_model(model)
            env = ["LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_cpu.yaml"]
            r.launch(env_vars=env)
            client.run(f"vllm_lmcache {model}".split())

    def test_lmcache_local_storage(self):
        # Local-disk cache backend; config file is staged under the model dir.
        model = 'llama3-8b-lmcache-local-storage'
        with Runner('lmi', model) as r:
            prepare.build_vllm_async_model(model)
            env = [
                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_local_storage.yaml"
            ]
            r.launch(env_vars=env)
            client.run(f"vllm_lmcache {model}".split())

    def test_lmcache_missing_role(self):
        # kv_transfer_config without kv_role must fail at launch time.
        model = 'llama3-8b-lmcache-missing-role'
        with Runner('lmi', model) as r:
            prepare.build_vllm_async_model(model)
            with pytest.raises(Exception):
                r.launch()
665+
666+
667+
@pytest.mark.vllm
@pytest.mark.gpu_4
class TestVllmLmcachePerformance_g6:
    """Performance comparison runs: no cache vs. LMCache CPU/disk backends."""

    def test_lmcache_performance_baseline(self):
        # Baseline launch with no LMCache configuration.
        model = 'llama3-8b-no-lmcache'
        with Runner('lmi', model) as r:
            prepare.build_vllm_async_model(model)
            r.launch()
            client.run(f"vllm_lmcache_performance {model}".split())

    def test_lmcache_performance_cpu(self):
        # Same workload with the CPU-memory cache backend enabled.
        model = 'llama3-8b-lmcache-cpu'
        with Runner('lmi', model) as r:
            prepare.build_vllm_async_model(model)
            env = ["LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_cpu.yaml"]
            r.launch(env_vars=env)
            client.run(f"vllm_lmcache_performance {model}".split())

    def test_lmcache_performance_local_storage(self):
        # Same workload with the local-disk cache backend enabled.
        model = 'llama3-8b-lmcache-local-storage'
        with Runner('lmi', model) as r:
            prepare.build_vllm_async_model(model)
            env = [
                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_local_storage.yaml"
            ]
            r.launch(env_vars=env)
            client.run(f"vllm_lmcache_performance {model}".split())
695+
696+
640697
@pytest.mark.gpu_4
641698
class TestTextEmbedding_g6:
642699

0 commit comments

Comments
 (0)