Commit 2404918

Authored by YuhanLiu11 <yliu738@wisc.edu>
[Feat] Adding a tutorial for using vLLM v1 in production stack (#390)
* Adding vLLM v1 tutorial
* Bump helm chart version
* Fixing yaml file format
* Fixing yaml file format
* Fix yaml format

Signed-off-by: YuhanLiu11 <yliu738@wisc.edu>
1 parent 0210014 commit 2404918

File tree: 5 files changed, +189 −1 lines

helm/Chart.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.1
+version: 0.1.2

 maintainers:
 - name: apostac
```

helm/templates/deployment-vllm-multi.yaml

Lines changed: 12 additions & 0 deletions
```diff
@@ -118,8 +118,18 @@ spec:
 {{- end }}
 {{- if $modelSpec.lmcacheConfig }}
 {{- if $modelSpec.lmcacheConfig.enabled }}
+{{- if hasKey $modelSpec.vllmConfig "v1" }}
+{{- if eq (toString $modelSpec.vllmConfig.v1) "1" }}
+- "--kv-transfer-config"
+- '{"kv_connector":"LMCacheConnectorV1","kv_role":"kv_both"}'
+{{- else }}
 - "--kv-transfer-config"
 - '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
+{{- end }}
+{{- else }}
+- "--kv-transfer-config"
+- '{"kv_connector":"LMCacheConnector","kv_role":"kv_both"}'
+{{- end }}
 {{- end }}
 {{- end }}
 {{- if $modelSpec.chatTemplate }}
@@ -139,6 +149,8 @@ spec:
             value: /tmp
           {{- end }}
           {{- with $modelSpec.vllmConfig}}
+          - name: LMCACHE_LOG_LEVEL
+            value: "DEBUG"
           {{- if hasKey . "v1" }}
           - name: VLLM_USE_V1
             value: {{ default 0 $modelSpec.vllmConfig.v1 | quote }}
```
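In plain terms, the nested conditionals above select `LMCacheConnectorV1` only when `vllmConfig.v1` is set to `1`, and fall back to the original `LMCacheConnector` otherwise. The same decision, sketched in Python (the helper name is mine, not part of the chart):

```python
import json

def kv_transfer_args(vllm_config: dict) -> list[str]:
    # Mirrors the Helm conditionals: the V1 connector is chosen only
    # when vllmConfig has "v1" and its string value equals "1".
    if str(vllm_config.get("v1")) == "1":
        connector = "LMCacheConnectorV1"
    else:
        connector = "LMCacheConnector"
    return [
        "--kv-transfer-config",
        json.dumps({"kv_connector": connector, "kv_role": "kv_both"}),
    ]
```

Either way, the engine is started with a `--kv-transfer-config` flag; only the connector name changes.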

tutorials/14-vllm-v1.md

Lines changed: 104 additions & 0 deletions
# Tutorial: Running vLLM with v1 Configuration

## Introduction

This tutorial demonstrates how to deploy vLLM with the v1 configuration enabled. The v1 configuration uses LMCacheConnectorV1 for KV cache management, which provides improved performance and stability for certain workloads.

## Prerequisites

- A Kubernetes cluster with GPU support
- Helm installed on your local machine
- Completion of the following tutorials:
  - [00-install-kubernetes-env.md](00-install-kubernetes-env.md)
  - [01-minimal-helm-installation.md](01-minimal-helm-installation.md)

## Step 1: Understanding the Configuration

The configuration file `values-14-vllm-v1.yaml` includes several important settings:

1. Model configuration:
   - Uses the Llama-3.1-8B-Instruct model
   - Single replica deployment
   - Resource requirements: 6 CPU, 16Gi memory, 1 GPU
   - 50Gi persistent storage

2. vLLM configuration:
   - v1 mode enabled (`v1: 1`)
   - bfloat16 precision
   - Maximum sequence length of 4096 tokens
   - GPU memory utilization set to 80%

3. LMCache configuration:
   - KV cache offloading enabled
   - 20GB CPU offloading buffer size

4. Cache server configuration:
   - Single replica cache server
   - Naive serialization/deserialization
   - Resource limits: 2 CPU, 10Gi memory

Feel free to change the above parameters to fit your own scenario.
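The list above maps onto the values file added in this commit; an abridged excerpt of the key entries (indentation reconstructed from the chart's values layout):

```yaml
servingEngineSpec:
  modelSpec:
    - name: "llama3"
      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
      replicaCount: 1
      vllmConfig:
        v1: 1                  # enables vLLM v1 / LMCacheConnectorV1
        dtype: "bfloat16"
        maxModelLen: 4096
        extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]
      lmcacheConfig:
        enabled: true
        cpuOffloadingBufferSize: "20"   # GB of CPU offloading buffer
```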
## Step 2: Deploying the Stack

1. First, ensure you're in the correct directory:

   ```bash
   cd production-stack
   ```

2. Deploy the stack using Helm:

   ```bash
   helm install vllm helm/ -f tutorials/assets/values-14-vllm-v1.yaml
   ```

3. Verify the deployment:

   ```bash
   kubectl get pods
   ```

   You should see:

   - A vLLM pod for the Llama model
   - A cache server pod

## Step 3: Verifying the Configuration

1. Check the vLLM pod logs to verify the v1 configuration:

   ```bash
   kubectl logs -f <vllm-pod-name>
   ```

   Look for the following log message:

   ```log
   INFO 04-29 12:12:25 [factory.py:64] Creating v1 connector with name: LMCacheConnectorV1
   ```

2. Forward the router service port:

   ```bash
   kubectl port-forward svc/vllm-router-service 30080:80
   ```

## Step 4: Testing the Deployment

Send a test request to verify the deployment:

```bash
curl -X POST http://localhost:30080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "prompt": "Explain the benefits of using v1 configuration in vLLM.",
    "max_tokens": 100
  }'
```
Note that you need to send a prompt longer than 256 tokens (the chunk size configured in LMCache) in order to reuse the KV cache.
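To exercise this, a small Python sketch (helper names are mine; it assumes the Step 3 port-forward to `localhost:30080` is still active) builds a prompt comfortably over 256 tokens and sends it twice, so the second request can hit the LMCache-backed KV cache:

```python
import json
import time
import urllib.request

def build_long_prompt(min_words: int = 400) -> str:
    # ~400 words comfortably exceeds 256 tokens for typical tokenizers
    filler = "The quick brown fox jumps over the lazy dog. "
    words: list[str] = []
    while len(words) < min_words:
        words.extend(filler.split())
    return " ".join(words[:min_words])

def timed_completion(prompt: str,
                     url: str = "http://localhost:30080/v1/completions"):
    """Send one completion request and return (response, elapsed seconds)."""
    payload = {
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "prompt": prompt,
        "max_tokens": 100,
    }
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    start = time.perf_counter()
    with urllib.request.urlopen(req) as resp:
        body = json.load(resp)
    return body, time.perf_counter() - start

# Usage against a live stack:
#   prompt = build_long_prompt()
#   _, cold = timed_completion(prompt)  # first pass fills the KV cache
#   _, warm = timed_completion(prompt)  # prefix can now come from LMCache
```

If KV reuse is working, the second (warm) request should show a noticeably shorter time-to-first-token for the shared prefix.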
## Conclusion

This tutorial demonstrated how to deploy vLLM with the v1 configuration enabled. The v1 configuration provides improved KV cache management through LMCacheConnectorV1, which can lead to better performance for certain workloads. You can adjust the configuration parameters in the values file to optimize for your specific use case.

tutorials/assets/values-06-shared-storage.yaml

Lines changed: 11 additions & 0 deletions
```diff
@@ -55,3 +55,14 @@ cacheserverSpec:
   labels:
     environment: "cacheserver"
     release: "cacheserver"
+
+routerSpec:
+  resources:
+    requests:
+      cpu: "1"
+      memory: "2G"
+    limits:
+      cpu: "1"
+      memory: "2G"
+  routingLogic: "session"
+  sessionKey: "x-user-id"
```
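The `routerSpec` above enables session-sticky routing keyed on the `x-user-id` header. A minimal Python sketch (helper name is mine; endpoint and model reuse the tutorial's port-forward setup) of how requests would carry the session key:

```python
import json
import urllib.request

def session_request(prompt: str, user_id: str,
                    url: str = "http://localhost:30080/v1/completions"):
    """Build a completion request carrying the router's session key."""
    payload = {
        "model": "meta-llama/Llama-3.1-8B-Instruct",
        "prompt": prompt,
        "max_tokens": 50,
    }
    return urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={
            "Content-Type": "application/json",
            # sessionKey: "x-user-id" -- same value => same backend pod
            "x-user-id": user_id,
        },
    )

# Requests sharing a user id are routed to the same serving engine,
# so that engine's KV cache can be reused across the session.
first = session_request("Hello!", "alice")
second = session_request("Hello again!", "alice")
# urllib.request.urlopen(first) would send it (requires the running stack)
```

Pinning a user's requests to one engine is what lets the per-engine KV cache pay off across a conversation.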
tutorials/assets/values-14-vllm-v1.yaml

Lines changed: 61 additions & 0 deletions (new file)

```yaml
servingEngineSpec:
  runtimeClassName: ""
  modelSpec:
    - name: "llama3"
      repository: "lmcache/vllm-openai"
      tag: "2025-04-18"
      modelURL: "meta-llama/Llama-3.1-8B-Instruct"
      replicaCount: 1

      requestCPU: 6
      requestMemory: "16Gi"
      requestGPU: 1

      pvcStorage: "50Gi"
      pvcAccessMode:
        - ReadWriteOnce

      vllmConfig:
        enableChunkedPrefill: false
        enablePrefixCaching: false
        maxModelLen: 4096
        dtype: "bfloat16"
        v1: 1
        extraArgs: ["--disable-log-requests", "--gpu-memory-utilization", "0.8"]

      lmcacheConfig:
        enabled: true
        cpuOffloadingBufferSize: "20"

      hf_token: <your-hf-token>

cacheserverSpec:
  # -- Number of replicas
  replicaCount: 1

  # -- Container port
  containerPort: 8080

  # -- Service port
  servicePort: 81

  # -- Serializer/Deserializer type
  serde: "naive"

  # -- Cache server image (reusing the vllm image)
  repository: "lmcache/vllm-openai"
  tag: "2025-04-18"

  # TODO (Jiayi): please adjust this once we have evictor
  # -- router resource requests and limits
  resources:
    requests:
      cpu: "2"
      memory: "8G"
    limits:
      cpu: "2"
      memory: "10G"

  # -- Customized labels for the cache server deployment
  labels:
    environment: "cacheserver"
    release: "cacheserver"
```
