Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@ jobs:
- test: TestStatefulModel_g6
instance: g6
failure-prefix: lmi
- test: TestVllmLmcache_g6
instance: g6
failure-prefix: lmi
- test: TestVllmLmcachePerformance_g6
instance: g6
failure-prefix: lmi
# P4D instance tests
- test: TestVllm_p4d
instance: p4d
Expand Down
3 changes: 2 additions & 1 deletion serving/docker/lmi-container-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ peft
llmcompressor
https://vllm-wheels.s3.us-west-2.amazonaws.com/d3ab240f39219df0175ec662416f630d7bf273d8/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
xgrammar
flashinfer-python==0.4.1
flashinfer-python==0.4.1
lmcache
105 changes: 105 additions & 0 deletions tests/integration/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import math
import json
import shutil
import time
from random import randrange
import numpy as np
from datetime import datetime
Expand Down Expand Up @@ -239,6 +240,21 @@ def get_model_name():
"seq_length": [256],
"tokenizer": "MiniMaxAI/MiniMax-M2",
},
"llama3-8b-lmcache-cpu": {
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "TheBloke/Llama-3-8B-fp16"
},
"llama3-8b-lmcache-local-storage": {
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "TheBloke/Llama-3-8B-fp16"
},
"llama3-8b-no-lmcache": {
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "TheBloke/Llama-3-8B-fp16"
},
}

vllm_neo_model_spec = {
Expand Down Expand Up @@ -1786,6 +1802,91 @@ def test_text_embedding_model(model, model_spec):
awscurl_run(req, spec.get("tokenizer"), batch_size)


def test_handler_lmcache(model, model_spec, is_baseline):
    """Smoke-test KV-cache reuse (LMCache) on a running vLLM endpoint.

    Sends one warmup request carrying a long shared prefix to populate the
    KV cache, then a second request with the same prefix, and logs the
    speedup of the second request over the first. Purely informational:
    no assertion is made, so an absent speedup does not fail the test.

    Args:
        model: Model key to validate against ``model_spec``.
        model_spec: Mapping of model keys to test specifications.
        is_baseline: True when the server runs without LMCache enabled;
            only affects logging, since no speedup is expected then.
    """
    modelspec_checker(model, model_spec)
    spec = model_spec[args.model]
    if "worker" in spec:
        check_worker_number(spec["worker"])

    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
share common prefixes or context, as the cached states can be reused across different queries."""
    # Repeat the paragraph so the shared prefix is long enough for
    # KV-cache reuse to be measurable.
    shared_prefix = " ".join([paragraph] * 40)
    # temperature=0 plus a fixed seed keeps generations deterministic.
    params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}

    # Record the run mode so baseline logs are distinguishable in CI output.
    LOGGER.info("LMCache handler test (baseline=%s)", is_baseline)

    warmup_req = {"inputs": shared_prefix + " Warmup?", "parameters": params}
    LOGGER.info("Warmup: Populating cache with shared prefix")
    start = time.time()
    send_json(warmup_req)
    warmup_time = time.time() - start
    # Brief pause so cache writes settle before the timed request.
    time.sleep(1)

    test_req = {"inputs": shared_prefix + " Test query?", "parameters": params}
    LOGGER.info("Test: Sending request with shared prefix")
    start = time.time()
    send_json(test_req)
    test_time = time.time() - start

    # Guard against division by zero on a (pathological) instant response.
    speedup = warmup_time / test_time if test_time > 0 else 0
    LOGGER.info(
        f"Warmup time: {warmup_time:.2f}s, Test time: {test_time:.2f}s, Speedup: {speedup:.2f}x"
    )


def test_handler_lmcache_performance(model, model_spec, concurrency=100):
    """Benchmark KV-cache reuse (LMCache) under concurrent shared-prefix load.

    Populates the cache with one warmup request carrying a long shared
    prefix, then fires ``concurrency`` simultaneous requests that all share
    that prefix via awscurl, and logs TTFT/TPS metrics read back from
    awscurl's ``benchmark.json`` output. Purely informational: no
    assertion is made on the resulting metrics.

    Args:
        model: Model key to validate against ``model_spec``.
        model_spec: Mapping of model keys to test specifications.
        concurrency: Number of concurrent shared-prefix requests to send
            (default 100, the previously hard-coded value).
    """
    modelspec_checker(model, model_spec)
    spec = model_spec[args.model]
    if "worker" in spec:
        check_worker_number(spec["worker"])

    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
share common prefixes or context, as the cached states can be reused across different queries."""
    # Repeat the paragraph so the shared prefix is long enough for
    # KV-cache reuse to be measurable.
    shared_prefix = " ".join([paragraph] * 40)
    # temperature=0 plus a fixed seed keeps generations deterministic.
    params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}

    warmup_req = {"inputs": shared_prefix + " Warmup?", "parameters": params}
    LOGGER.info("Warmup: Populating LMCache with shared prefix")
    send_json(warmup_req)
    # Give the cache a moment to settle before the concurrent benchmark.
    time.sleep(2)

    shared_reqs = [{
        "inputs": shared_prefix + f" Query {i}?",
        "parameters": params
    } for i in range(concurrency)]

    LOGGER.info(
        f"Performance test: {concurrency} concurrent requests with shared prefix"
    )
    awscurl_run(shared_reqs,
                spec.get("tokenizer"),
                concurrency=concurrency,
                num_run=1,
                json_results=True,
                dataset=True)

    # awscurl_run writes aggregate metrics to benchmark.json in the CWD.
    with open("benchmark.json") as f:
        metrics = json.load(f)
    p50_ttft = metrics["p50TimeToFirstByte"]
    p90_ttft = metrics["p90TimeToFirstByte"]
    tps = metrics["tps"]
    LOGGER.info(
        f"Results: P50 TTFT={p50_ttft:.2f}ms, P90 TTFT={p90_ttft:.2f}ms, TPS={tps:.2f}"
    )


def test_handler_stateful(model, model_spec):
if model not in model_spec:
raise ValueError(
Expand Down Expand Up @@ -1899,6 +2000,10 @@ def run(raw_args):
test_handler_rolling_batch_chat(args.model, vllm_chat_model_spec)
elif args.handler == "vllm_tool":
test_handler_rolling_batch_tool(args.model, vllm_tool_model_spec)
elif args.handler == "vllm_lmcache":
test_handler_lmcache(args.model, vllm_model_spec, False)
elif args.handler == "vllm_lmcache_performance":
test_handler_lmcache_performance(args.model, vllm_model_spec)
elif args.handler == "vllm_neo":
test_handler_rolling_batch(args.model, vllm_neo_model_spec)
elif args.handler == "handler_performance":
Expand Down
46 changes: 44 additions & 2 deletions tests/integration/llm/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,35 @@
"option.max_model_len": 16384,
"option.gpu_memory_utilization": "0.9",
},
"llama3-8b-lmcache-cpu": {
"option.model_id":
"s3://djl-llm/llama-3-8b-instruct-hf/",
"option.tensor_parallel_degree":
4,
"lmcache_config_file":
"lmcache_cpu.yaml",
"option.kv_transfer_config":
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
},
"llama3-8b-lmcache-local-storage": {
"option.model_id":
"s3://djl-llm/llama-3-8b-instruct-hf/",
"option.tensor_parallel_degree":
4,
"lmcache_config_file":
"lmcache_local_storage.yaml",
"option.kv_transfer_config":
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
},
"llama3-8b-lmcache-missing-role": {
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
"option.tensor_parallel_degree": 4,
"option.kv_transfer_config": '{"kv_connector":"LMCacheConnectorV1"}',
},
"llama3-8b-no-lmcache": {
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
"option.tensor_parallel_degree": 4,
},
}

vllm_neo_model_list = {
Expand Down Expand Up @@ -799,11 +828,19 @@
def write_model_artifacts(properties,
requirements=None,
adapter_ids=[],
adapter_names=[]):
adapter_names=[],
lmcache_config_file=None):
model_path = "models/test"
if os.path.exists(model_path):
shutil.rmtree(model_path)
os.makedirs(model_path, exist_ok=True)

if lmcache_config_file:
source_config = os.path.join("lmcache_configs", lmcache_config_file)
dest_config = os.path.join(model_path, lmcache_config_file)
if os.path.exists(source_config):
shutil.copy2(source_config, dest_config)

with open(os.path.join(model_path, "serving.properties"), "w") as f:
for key, value in properties.items():
f.write(f"{key}={value}\n")
Expand Down Expand Up @@ -935,9 +972,14 @@ def build_vllm_async_model(model):
adapter_ids = options.pop("adapter_ids", [])
adapter_names = options.pop("adapter_names", [])

lmcache_config_file = options.pop("lmcache_config_file", None)
if lmcache_config_file:
options["option.lmcache_config_file"] = lmcache_config_file

write_model_artifacts(options,
adapter_ids=adapter_ids,
adapter_names=adapter_names)
adapter_names=adapter_names,
lmcache_config_file=lmcache_config_file)


def build_vllm_async_model_with_custom_handler(model, handler_type="success"):
Expand Down
6 changes: 6 additions & 0 deletions tests/integration/lmcache_configs/lmcache_cpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# 256 Tokens per KV Chunk
chunk_size: 256
# Enable CPU memory backend
local_cpu: true # default
# 5GB of Pinned CPU memory
max_local_cpu_size: 5.0 # default
10 changes: 10 additions & 0 deletions tests/integration/lmcache_configs/lmcache_local_storage.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# 256 Tokens per KV Chunk
chunk_size: 256
# Enable Disk backend
local_disk: "file:///tmp/lmcache/"
# 5GB of Disk memory
max_local_disk_size: 5.0

# Bypass the OS page cache via O_DIRECT.
# Enable this for better performance when most local CPU memory is already in use.
extra_config: {'use_odirect': True}
57 changes: 57 additions & 0 deletions tests/integration/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,63 @@ def test_custom_formatter_load_error(self):
r.launch()


@pytest.mark.vllm
@pytest.mark.gpu_4
class TestVllmLmcache_g6:
    """Integration tests for vLLM with LMCache KV-cache offloading on g6."""

    def test_lmcache_cpu(self):
        """Endpoint serves requests with the pinned-CPU-memory LMCache backend."""
        model = 'llama3-8b-lmcache-cpu'
        env = ["LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_cpu.yaml"]
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch(env_vars=env)
            client.run("vllm_lmcache llama3-8b-lmcache-cpu".split())

    def test_lmcache_local_storage(self):
        """Endpoint serves requests with the local-disk LMCache backend."""
        model = 'llama3-8b-lmcache-local-storage'
        env = [
            "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_local_storage.yaml"
        ]
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch(env_vars=env)
            client.run("vllm_lmcache llama3-8b-lmcache-local-storage".split())

    def test_lmcache_missing_role(self):
        """Launch must fail when kv_transfer_config omits kv_role."""
        model = 'llama3-8b-lmcache-missing-role'
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            with pytest.raises(Exception):
                runner.launch()


@pytest.mark.vllm
@pytest.mark.gpu_4
class TestVllmLmcachePerformance_g6:
    """Performance comparisons for vLLM with and without LMCache on g6."""

    def test_lmcache_performance_baseline(self):
        """Baseline benchmark with LMCache disabled."""
        model = 'llama3-8b-no-lmcache'
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch()
            client.run("vllm_lmcache_performance llama3-8b-no-lmcache".split())

    def test_lmcache_performance_cpu(self):
        """Benchmark with the pinned-CPU-memory LMCache backend."""
        model = 'llama3-8b-lmcache-cpu'
        env = ["LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_cpu.yaml"]
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch(env_vars=env)
            client.run(
                "vllm_lmcache_performance llama3-8b-lmcache-cpu".split())

    def test_lmcache_performance_local_storage(self):
        """Benchmark with the local-disk LMCache backend."""
        model = 'llama3-8b-lmcache-local-storage'
        env = [
            "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_local_storage.yaml"
        ]
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch(env_vars=env)
            client.run(
                "vllm_lmcache_performance llama3-8b-lmcache-local-storage".
                split())


@pytest.mark.gpu_4
class TestTextEmbedding_g6:

Expand Down
Loading