Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,12 @@ jobs:
- test: TestStatefulModel_g6
instance: g6
failure-prefix: lmi
- test: TestVllmLmcache_g6
instance: g6
failure-prefix: lmi
- test: TestVllmLmcachePerformance_g6
instance: g6
failure-prefix: lmi
# P4D instance tests
- test: TestVllm_p4d
instance: p4d
Expand Down
3 changes: 2 additions & 1 deletion serving/docker/lmi-container-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ peft
llmcompressor
https://vllm-wheels.s3.us-west-2.amazonaws.com/d3ab240f39219df0175ec662416f630d7bf273d8/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
xgrammar
flashinfer-python==0.4.1
flashinfer-python==0.4.1
lmcache
105 changes: 105 additions & 0 deletions tests/integration/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import math
import json
import shutil
import time
from random import randrange
import numpy as np
from datetime import datetime
Expand Down Expand Up @@ -239,6 +240,21 @@ def get_model_name():
"seq_length": [256],
"tokenizer": "MiniMaxAI/MiniMax-M2",
},
"llama3-8b-lmcache-cpu": {
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "TheBloke/Llama-3-8B-fp16"
},
"llama3-8b-lmcache-local-storage": {
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "TheBloke/Llama-3-8B-fp16"
},
"llama3-8b-no-lmcache": {
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "TheBloke/Llama-3-8B-fp16"
},
}

vllm_neo_model_spec = {
Expand Down Expand Up @@ -1786,6 +1802,91 @@ def test_text_embedding_model(model, model_spec):
awscurl_run(req, spec.get("tokenizer"), batch_size)


def test_handler_lmcache(model, model_spec, is_baseline):
    """Smoke-test KV-cache reuse (LMCache) on a running vLLM endpoint.

    Sends one warmup request carrying a long shared prefix to populate the
    KV cache, then a second request with the same prefix, and logs the
    speedup of the second request over the first. Purely informational:
    no assertion is made, so an absent speedup does not fail the test.

    Args:
        model: Model key to validate against ``model_spec``.
        model_spec: Mapping of model keys to test specifications.
        is_baseline: True when the server runs without LMCache enabled;
            only affects logging, since no speedup is expected then.
    """
    modelspec_checker(model, model_spec)
    spec = model_spec[args.model]
    if "worker" in spec:
        check_worker_number(spec["worker"])

    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
share common prefixes or context, as the cached states can be reused across different queries."""
    # Repeat the paragraph so the shared prefix is long enough for
    # KV-cache reuse to be measurable.
    shared_prefix = " ".join([paragraph] * 40)
    # temperature=0 plus a fixed seed keeps generations deterministic.
    params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}

    # Record the run mode so baseline logs are distinguishable in CI output.
    LOGGER.info("LMCache handler test (baseline=%s)", is_baseline)

    warmup_req = {"inputs": shared_prefix + " Warmup?", "parameters": params}
    LOGGER.info("Warmup: Populating cache with shared prefix")
    start = time.time()
    send_json(warmup_req)
    warmup_time = time.time() - start
    # Brief pause so cache writes settle before the timed request.
    time.sleep(1)

    test_req = {"inputs": shared_prefix + " Test query?", "parameters": params}
    LOGGER.info("Test: Sending request with shared prefix")
    start = time.time()
    send_json(test_req)
    test_time = time.time() - start

    # Guard against division by zero on a (pathological) instant response.
    speedup = warmup_time / test_time if test_time > 0 else 0
    LOGGER.info(
        f"Warmup time: {warmup_time:.2f}s, Test time: {test_time:.2f}s, Speedup: {speedup:.2f}x"
    )


def test_handler_lmcache_performance(model, model_spec, concurrency=100):
    """Benchmark KV-cache reuse (LMCache) under concurrent shared-prefix load.

    Populates the cache with one warmup request carrying a long shared
    prefix, then fires ``concurrency`` simultaneous requests that all share
    that prefix via awscurl, and logs TTFT/TPS metrics read back from
    awscurl's ``benchmark.json`` output. Purely informational: no
    assertion is made on the resulting metrics.

    Args:
        model: Model key to validate against ``model_spec``.
        model_spec: Mapping of model keys to test specifications.
        concurrency: Number of concurrent shared-prefix requests to send
            (default 100, the previously hard-coded value).
    """
    modelspec_checker(model, model_spec)
    spec = model_spec[args.model]
    if "worker" in spec:
        check_worker_number(spec["worker"])

    paragraph = """In the realm of artificial intelligence and machine learning, large language models have revolutionized
the way we process and understand natural language. These sophisticated neural networks, trained on vast amounts of text data,
can generate coherent responses, answer questions, and even engage in creative tasks. The architecture underlying these models
typically involves transformer networks with attention mechanisms that allow the model to focus on relevant parts of the input.
As these models scale to billions of parameters, they demonstrate emergent capabilities that weren't explicitly programmed.
However, this scaling comes with significant computational costs, particularly in terms of memory and processing power required
for inference. Key-value caching mechanisms have become essential for optimizing the performance of these models during generation,
storing intermediate computations to avoid redundant calculations. This is especially important in scenarios where multiple requests
share common prefixes or context, as the cached states can be reused across different queries."""
    # Repeat the paragraph so the shared prefix is long enough for
    # KV-cache reuse to be measurable.
    shared_prefix = " ".join([paragraph] * 40)
    # temperature=0 plus a fixed seed keeps generations deterministic.
    params = {"max_new_tokens": 50, "temperature": 0, "seed": 42}

    warmup_req = {"inputs": shared_prefix + " Warmup?", "parameters": params}
    LOGGER.info("Warmup: Populating LMCache with shared prefix")
    send_json(warmup_req)
    # Give the cache a moment to settle before the concurrent benchmark.
    time.sleep(2)

    shared_reqs = [{
        "inputs": shared_prefix + f" Query {i}?",
        "parameters": params
    } for i in range(concurrency)]

    LOGGER.info(
        f"Performance test: {concurrency} concurrent requests with shared prefix"
    )
    awscurl_run(shared_reqs,
                spec.get("tokenizer"),
                concurrency=concurrency,
                num_run=1,
                json_results=True,
                dataset=True)

    # awscurl_run writes aggregate metrics to benchmark.json in the CWD.
    with open("benchmark.json") as f:
        metrics = json.load(f)
    p50_ttft = metrics["p50TimeToFirstByte"]
    p90_ttft = metrics["p90TimeToFirstByte"]
    tps = metrics["tps"]
    LOGGER.info(
        f"Results: P50 TTFT={p50_ttft:.2f}ms, P90 TTFT={p90_ttft:.2f}ms, TPS={tps:.2f}"
    )


def test_handler_stateful(model, model_spec):
if model not in model_spec:
raise ValueError(
Expand Down Expand Up @@ -1899,6 +2000,10 @@ def run(raw_args):
test_handler_rolling_batch_chat(args.model, vllm_chat_model_spec)
elif args.handler == "vllm_tool":
test_handler_rolling_batch_tool(args.model, vllm_tool_model_spec)
elif args.handler == "vllm_lmcache":
test_handler_lmcache(args.model, vllm_model_spec, False)
elif args.handler == "vllm_lmcache_performance":
test_handler_lmcache_performance(args.model, vllm_model_spec)
elif args.handler == "vllm_neo":
test_handler_rolling_batch(args.model, vllm_neo_model_spec)
elif args.handler == "handler_performance":
Expand Down
46 changes: 44 additions & 2 deletions tests/integration/llm/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,35 @@
"option.max_model_len": 16384,
"option.gpu_memory_utilization": "0.9",
},
"llama3-8b-lmcache-cpu": {
"option.model_id":
"s3://djl-llm/llama-3-8b-instruct-hf/",
"option.tensor_parallel_degree":
4,
"lmcache_config_file":
"lmcache_cpu.yaml",
"option.kv_transfer_config":
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
},
"llama3-8b-lmcache-local-storage": {
"option.model_id":
"s3://djl-llm/llama-3-8b-instruct-hf/",
"option.tensor_parallel_degree":
4,
"lmcache_config_file":
"lmcache_local_storage.yaml",
"option.kv_transfer_config":
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
},
"llama3-8b-lmcache-missing-role": {
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
"option.tensor_parallel_degree": 4,
"option.kv_transfer_config": '{"kv_connector":"LMCacheConnectorV1"}',
},
"llama3-8b-no-lmcache": {
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
"option.tensor_parallel_degree": 4,
},
}

vllm_neo_model_list = {
Expand Down Expand Up @@ -799,11 +828,19 @@
def write_model_artifacts(properties,
requirements=None,
adapter_ids=[],
adapter_names=[]):
adapter_names=[],
lmcache_config_file=None):
model_path = "models/test"
if os.path.exists(model_path):
shutil.rmtree(model_path)
os.makedirs(model_path, exist_ok=True)

if lmcache_config_file:
source_config = os.path.join("lmcache_configs", lmcache_config_file)
dest_config = os.path.join(model_path, lmcache_config_file)
if os.path.exists(source_config):
shutil.copy2(source_config, dest_config)

with open(os.path.join(model_path, "serving.properties"), "w") as f:
for key, value in properties.items():
f.write(f"{key}={value}\n")
Expand Down Expand Up @@ -935,9 +972,14 @@ def build_vllm_async_model(model):
adapter_ids = options.pop("adapter_ids", [])
adapter_names = options.pop("adapter_names", [])

lmcache_config_file = options.pop("lmcache_config_file", None)
if lmcache_config_file:
options["option.lmcache_config_file"] = lmcache_config_file

write_model_artifacts(options,
adapter_ids=adapter_ids,
adapter_names=adapter_names)
adapter_names=adapter_names,
lmcache_config_file=lmcache_config_file)


def build_vllm_async_model_with_custom_handler(model, handler_type="success"):
Expand Down
6 changes: 6 additions & 0 deletions tests/integration/lmcache_configs/lmcache_cpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# 256 Tokens per KV Chunk
chunk_size: 256
# Enable CPU memory backend
local_cpu: true # default
# 5GB of Pinned CPU memory
max_local_cpu_size: 5.0 # default
10 changes: 10 additions & 0 deletions tests/integration/lmcache_configs/lmcache_local_storage.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# 256 Tokens per KV Chunk
chunk_size: 256
# Enable Disk backend
local_disk: "file:///tmp/lmcache/"
# 5GB of Disk memory
max_local_disk_size: 5.0

# Bypass the OS page cache via O_DIRECT.
# Enable this for better performance when most local CPU memory is already in use.
extra_config: {'use_odirect': True}
57 changes: 57 additions & 0 deletions tests/integration/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,63 @@ def test_custom_formatter_load_error(self):
r.launch()


@pytest.mark.vllm
@pytest.mark.gpu_4
class TestVllmLmcache_g6:
    """Integration tests for vLLM with LMCache KV-cache offloading on g6."""

    def test_lmcache_cpu(self):
        """Endpoint serves requests with the pinned-CPU-memory LMCache backend."""
        model = 'llama3-8b-lmcache-cpu'
        env = ["LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_cpu.yaml"]
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch(env_vars=env)
            client.run("vllm_lmcache llama3-8b-lmcache-cpu".split())

    def test_lmcache_local_storage(self):
        """Endpoint serves requests with the local-disk LMCache backend."""
        model = 'llama3-8b-lmcache-local-storage'
        env = [
            "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_local_storage.yaml"
        ]
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch(env_vars=env)
            client.run("vllm_lmcache llama3-8b-lmcache-local-storage".split())

    def test_lmcache_missing_role(self):
        """Launch must fail when kv_transfer_config omits kv_role."""
        model = 'llama3-8b-lmcache-missing-role'
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            with pytest.raises(Exception):
                runner.launch()


@pytest.mark.vllm
@pytest.mark.gpu_4
class TestVllmLmcachePerformance_g6:
    """Performance comparisons for vLLM with and without LMCache on g6."""

    def test_lmcache_performance_baseline(self):
        """Baseline benchmark with LMCache disabled."""
        model = 'llama3-8b-no-lmcache'
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch()
            client.run("vllm_lmcache_performance llama3-8b-no-lmcache".split())

    def test_lmcache_performance_cpu(self):
        """Benchmark with the pinned-CPU-memory LMCache backend."""
        model = 'llama3-8b-lmcache-cpu'
        env = ["LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_cpu.yaml"]
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch(env_vars=env)
            client.run(
                "vllm_lmcache_performance llama3-8b-lmcache-cpu".split())

    def test_lmcache_performance_local_storage(self):
        """Benchmark with the local-disk LMCache backend."""
        model = 'llama3-8b-lmcache-local-storage'
        env = [
            "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_local_storage.yaml"
        ]
        with Runner('lmi', model) as runner:
            prepare.build_vllm_async_model(model)
            runner.launch(env_vars=env)
            client.run(
                "vllm_lmcache_performance llama3-8b-lmcache-local-storage".
                split())


@pytest.mark.gpu_4
class TestTextEmbedding_g6:

Expand Down
Loading