diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index 3c7f3c1bb..fd597c84f 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -203,6 +203,12 @@ jobs:
           - test: TestStatefulModel_g6
             instance: g6
             failure-prefix: lmi
+          - test: TestVllmLmcache_g6
+            instance: g6
+            failure-prefix: lmi
+          - test: TestVllmLmcachePerformance_g6
+            instance: g6
+            failure-prefix: lmi
           # P4D instance tests
           - test: TestVllm_p4d
             instance: p4d
diff --git a/serving/docker/lmi-container-requirements.txt b/serving/docker/lmi-container-requirements.txt
index a2e55eae4..8f3affe2c 100644
--- a/serving/docker/lmi-container-requirements.txt
+++ b/serving/docker/lmi-container-requirements.txt
@@ -34,4 +34,5 @@ peft
 llmcompressor
 https://vllm-wheels.s3.us-west-2.amazonaws.com/d3ab240f39219df0175ec662416f630d7bf273d8/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 xgrammar
-flashinfer-python==0.4.1
\ No newline at end of file
+flashinfer-python==0.4.1
+lmcache
\ No newline at end of file
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 2030ae5ec..6c0bbe03e 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -8,6 +8,7 @@
 import math
 import json
 import shutil
+import time
 from random import randrange
 import numpy as np
 from datetime import datetime
@@ -239,6 +240,21 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "MiniMaxAI/MiniMax-M2",
     },
+    "llama3-8b-lmcache-cpu": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-3-8B-fp16"
+    },
+    "llama3-8b-lmcache-local-storage": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-3-8B-fp16"
+    },
+    "llama3-8b-no-lmcache": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-3-8B-fp16"
+    },
 }
 
 vllm_neo_model_spec = {
@@ -1786,6 +1802,108 @@ def test_text_embedding_model(model, model_spec):
         awscurl_run(req, spec.get("tokenizer"), batch_size)
 
 
+def _lmcache_shared_prefix(repeat=40):
+    """Return the long prompt prefix shared by the LMCache test requests.
+
+    The base paragraph is repeated ``repeat`` times and joined with single
+    spaces so that successive requests share a large common prefix.
+    """
+    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
+    the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
+    can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
+    typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
+    As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
+    However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
+    for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
+    storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
+    share common prefixes or context, as the cached states can be reused across different queries."""
+    return " ".join([paragraph] * repeat)
+
+
+def test_handler_lmcache(model, model_spec, is_baseline=False):
+    """Time a warmup request and a follow-up request sharing one prefix.
+
+    Both latencies and their ratio are logged; nothing is asserted on them.
+
+    :param model: model name handed to ``modelspec_checker``.
+    :param model_spec: spec table; the entry for ``args.model`` is used.
+    :param is_baseline: only used to label the log output.
+    """
+    modelspec_checker(model, model_spec)
+    spec = model_spec[args.model]
+    if "worker" in spec:
+        check_worker_number(spec["worker"])
+
+    shared_prefix = _lmcache_shared_prefix()
+    params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}
+
+    warmup_req = {"inputs": shared_prefix + " Warmup?", "parameters": params}
+    LOGGER.info("Warmup: Populating cache with shared prefix")
+    # perf_counter() is monotonic, unlike time(), so clock adjustments cannot
+    # distort the measured durations.
+    start = time.perf_counter()
+    send_json(warmup_req)
+    warmup_time = time.perf_counter() - start
+    time.sleep(1)
+
+    test_req = {"inputs": shared_prefix + " Test query?", "parameters": params}
+    LOGGER.info("Test: Sending request with shared prefix")
+    start = time.perf_counter()
+    send_json(test_req)
+    test_time = time.perf_counter() - start
+
+    speedup = warmup_time / test_time if test_time > 0 else 0
+    LOGGER.info(f"Run type: {'baseline' if is_baseline else 'lmcache'}")
+    LOGGER.info(
+        f"Warmup time: {warmup_time:.2f}s, Test time: {test_time:.2f}s, Speedup: {speedup:.2f}x"
+    )
+
+
+def test_handler_lmcache_performance(model, model_spec):
+    """Run concurrent shared-prefix requests and log latency/throughput.
+
+    After one warmup request, ``awscurl_run`` is invoked with 100 requests
+    that share the prefix; metrics are then read from ``benchmark.json``
+    and logged. Nothing is asserted on the metric values.
+    """
+    modelspec_checker(model, model_spec)
+    spec = model_spec[args.model]
+    if "worker" in spec:
+        check_worker_number(spec["worker"])
+
+    shared_prefix = _lmcache_shared_prefix()
+    params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}
+    concurrency = 100
+    warmup_req = {"inputs": shared_prefix + " Warmup?", "parameters": params}
+    LOGGER.info("Warmup: Populating LMCache with shared prefix")
+    send_json(warmup_req)
+    time.sleep(2)
+
+    shared_reqs = [{
+        "inputs": shared_prefix + f" Query {i}?",
+        "parameters": params
+    } for i in range(concurrency)]
+
+    LOGGER.info(
+        f"Performance test: {concurrency} concurrent requests with shared prefix"
+    )
+    awscurl_run(shared_reqs,
+                spec.get("tokenizer"),
+                concurrency=concurrency,
+                num_run=1,
+                json_results=True,
+                dataset=True)
+
+    with open("benchmark.json", encoding="utf-8") as f:
+        metrics = json.load(f)
+    p50_ttft = metrics["p50TimeToFirstByte"]
+    p90_ttft = metrics["p90TimeToFirstByte"]
+    tps = metrics["tps"]
+    LOGGER.info(
+        f"Results: P50 TTFT={p50_ttft:.2f}ms, P90 TTFT={p90_ttft:.2f}ms, TPS={tps:.2f}"
+    )
+
+
 def test_handler_stateful(model, model_spec):
     if model not in model_spec:
         raise ValueError(
@@ -1899,6 +2017,10 @@ def run(raw_args):
         test_handler_rolling_batch_chat(args.model, vllm_chat_model_spec)
     elif args.handler == "vllm_tool":
         test_handler_rolling_batch_tool(args.model, vllm_tool_model_spec)
+    elif args.handler == "vllm_lmcache":
+        test_handler_lmcache(args.model, vllm_model_spec, False)
+    elif args.handler == "vllm_lmcache_performance":
+        test_handler_lmcache_performance(args.model, vllm_model_spec)
     elif args.handler == "vllm_neo":
         test_handler_rolling_batch(args.model, vllm_neo_model_spec)
     elif args.handler == "handler_performance":
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index f003a655e..5dc0c7c9d 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -468,6 +468,35 @@
         "option.max_model_len": 16384,
         "option.gpu_memory_utilization": "0.9",
     },
+    "llama3-8b-lmcache-cpu": {
+        "option.model_id":
+        "s3://djl-llm/llama-3-8b-instruct-hf/",
+        "option.tensor_parallel_degree":
+        4,
+        "lmcache_config_file":
+        "lmcache_cpu.yaml",
+        "option.kv_transfer_config":
+        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
+    },
+    "llama3-8b-lmcache-local-storage": {
+        "option.model_id":
+        "s3://djl-llm/llama-3-8b-instruct-hf/",
+        "option.tensor_parallel_degree":
+        4,
+        "lmcache_config_file":
+        "lmcache_local_storage.yaml",
+        "option.kv_transfer_config":
+        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
+    },
+    "llama3-8b-lmcache-missing-role": {
+        "option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
+        "option.tensor_parallel_degree": 4,
+        "option.kv_transfer_config": '{"kv_connector":"LMCacheConnectorV1"}',
+    },
+    "llama3-8b-no-lmcache": {
+        "option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
+        "option.tensor_parallel_degree": 4,
+    },
 }
 
 vllm_neo_model_list = {
@@ -799,11 +828,23 @@
 def write_model_artifacts(properties,
                           requirements=None,
                           adapter_ids=[],
-                          adapter_names=[]):
+                          adapter_names=[],
+                          lmcache_config_file=None):
     model_path = "models/test"
     if os.path.exists(model_path):
         shutil.rmtree(model_path)
     os.makedirs(model_path, exist_ok=True)
+
+    if lmcache_config_file:
+        # Fail fast instead of silently writing a model directory without
+        # the LMCache config the caller asked for.
+        source_config = os.path.join("lmcache_configs", lmcache_config_file)
+        if not os.path.exists(source_config):
+            raise FileNotFoundError(
+                f"LMCache config file not found: {source_config}")
+        shutil.copy2(source_config,
+                     os.path.join(model_path, lmcache_config_file))
+
     with open(os.path.join(model_path, "serving.properties"), "w") as f:
         for key, value in properties.items():
             f.write(f"{key}={value}\n")
@@ -935,9 +976,14 @@ def build_vllm_async_model(model):
     adapter_ids = options.pop("adapter_ids", [])
     adapter_names = options.pop("adapter_names", [])
 
+    lmcache_config_file = options.pop("lmcache_config_file", None)
+    if lmcache_config_file:
+        options["option.lmcache_config_file"] = lmcache_config_file
+
     write_model_artifacts(options,
                           adapter_ids=adapter_ids,
-                          adapter_names=adapter_names)
+                          adapter_names=adapter_names,
+                          lmcache_config_file=lmcache_config_file)
 
 
 def build_vllm_async_model_with_custom_handler(model, handler_type="success"):
diff --git a/tests/integration/lmcache_configs/lmcache_cpu.yaml b/tests/integration/lmcache_configs/lmcache_cpu.yaml
new file mode 100644
index 000000000..969a15c31
--- /dev/null
+++ b/tests/integration/lmcache_configs/lmcache_cpu.yaml
@@ -0,0 +1,6 @@
+# 256 Tokens per KV Chunk
+chunk_size: 256
+# Enable CPU memory backend
+local_cpu: true # default
+# 5GB of Pinned CPU memory
+max_local_cpu_size: 5.0 # default
diff --git a/tests/integration/lmcache_configs/lmcache_local_storage.yaml b/tests/integration/lmcache_configs/lmcache_local_storage.yaml
new file mode 100644
index 000000000..aa301565e
--- /dev/null
+++ b/tests/integration/lmcache_configs/lmcache_local_storage.yaml
@@ -0,0 +1,10 @@
+# 256 Tokens per KV Chunk
+chunk_size: 256
+# Enable Disk backend
+local_disk: "file:///tmp/lmcache/"
+# 5GB of Disk memory
+max_local_disk_size: 5.0
+
+# Disable page cache
+# This should be turned on for better performance if most local CPU memory is used
+extra_config: {'use_odirect': True}
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index 1835f92ae..f2e0266dc 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -637,6 +637,65 @@ def test_custom_formatter_load_error(self):
                 r.launch()
 
 
+@pytest.mark.vllm
+@pytest.mark.gpu_4
+class TestVllmLmcache_g6:
+    """vLLM + LMCache launch and request tests (CPU and local-disk configs)."""
+
+    def test_lmcache_cpu(self):
+        with Runner('lmi', 'llama3-8b-lmcache-cpu') as r:
+            prepare.build_vllm_async_model("llama3-8b-lmcache-cpu")
+            r.launch(env_vars=[
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_cpu.yaml"
+            ])
+            client.run("vllm_lmcache llama3-8b-lmcache-cpu".split())
+
+    def test_lmcache_local_storage(self):
+        with Runner('lmi', 'llama3-8b-lmcache-local-storage') as r:
+            prepare.build_vllm_async_model("llama3-8b-lmcache-local-storage")
+            r.launch(env_vars=[
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_local_storage.yaml"
+            ])
+            client.run("vllm_lmcache llama3-8b-lmcache-local-storage".split())
+
+    def test_lmcache_missing_role(self):
+        with Runner('lmi', 'llama3-8b-lmcache-missing-role') as r:
+            prepare.build_vllm_async_model("llama3-8b-lmcache-missing-role")
+            with pytest.raises(Exception):
+                r.launch()
+
+
+@pytest.mark.vllm
+@pytest.mark.gpu_4
+class TestVllmLmcachePerformance_g6:
+    """Shared-prefix load tests run with and without an LMCache config."""
+
+    def test_lmcache_performance_baseline(self):
+        with Runner('lmi', 'llama3-8b-no-lmcache') as r:
+            prepare.build_vllm_async_model("llama3-8b-no-lmcache")
+            r.launch()
+            client.run("vllm_lmcache_performance llama3-8b-no-lmcache".split())
+
+    def test_lmcache_performance_cpu(self):
+        with Runner('lmi', 'llama3-8b-lmcache-cpu') as r:
+            prepare.build_vllm_async_model("llama3-8b-lmcache-cpu")
+            r.launch(env_vars=[
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_cpu.yaml"
+            ])
+            client.run(
+                "vllm_lmcache_performance llama3-8b-lmcache-cpu".split())
+
+    def test_lmcache_performance_local_storage(self):
+        with Runner('lmi', 'llama3-8b-lmcache-local-storage') as r:
+            prepare.build_vllm_async_model("llama3-8b-lmcache-local-storage")
+            r.launch(env_vars=[
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_local_storage.yaml"
+            ])
+            client.run(
+                "vllm_lmcache_performance llama3-8b-lmcache-local-storage".
+                split())
+
+
 @pytest.mark.gpu_4
 class TestTextEmbedding_g6:
 