Bugfix/482 helm rayspec fix (#483)

ahinsutime · YuhanLiu11 · web-flow · commit 6e3c06f4aa39 · 2025-06-05T14:10:42.000-07:00
* [Bugfix] Fixed raySpec handling that prevented multiple deployments from being specified with modelSpec.

Signed-off-by: ahinsutime <ahinsutime@gmail.com>

* [Doc] Updated tutorial document and values for raySpec.

Signed-off-by: ahinsutime <ahinsutime@gmail.com>

* [Bugfix] Updated helm chart version to sync with changes due to bugfix.

Signed-off-by: ahinsutime <ahinsutime@gmail.com>

* [Doc] Added guideline to deploy both ray cluster and deployments. Fixed typos. Added more example values.

Signed-off-by: ahinsutime <ahinsutime@gmail.com>

* [Bugfix] Fixed configmap conflicts by distinguishing configmap names.

Signed-off-by: ahinsutime <ahinsutime@gmail.com>

---------

Signed-off-by: ahinsutime <ahinsutime@gmail.com>
Co-authored-by: Yuhan Liu <32589867+YuhanLiu11@users.noreply.github.com>
diff --git a/helm/Chart.yaml b/helm/Chart.yaml
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.3
+version: 0.1.4
 
 maintainers:
   - name: apostac
diff --git a/helm/templates/deployment-vllm-multi.yaml b/helm/templates/deployment-vllm-multi.yaml
@@ -1,5 +1,6 @@
-{{- if and .Values.servingEngineSpec.enableEngine (not (hasKey .Values.servingEngineSpec "raySpec")) -}}
+{{- if .Values.servingEngineSpec.enableEngine -}}
 {{- range $modelSpec := .Values.servingEngineSpec.modelSpec }}
+{{- if not (hasKey $modelSpec "raySpec") }}
 {{- $kv_role := "kv_both" }}
 {{- $kv_rank := 0 }}
 {{- $kv_parallel_size := 1 }}
@@ -411,3 +412,4 @@ data:
 ---
 {{- end }}
 {{- end }}
+{{- end }}
diff --git a/helm/templates/ray-cluster.yaml b/helm/templates/ray-cluster.yaml
@@ -1,5 +1,6 @@
-{{- if and .Values.servingEngineSpec.enableEngine (hasKey .Values.servingEngineSpec "raySpec")}}
+{{- if .Values.servingEngineSpec.enableEngine }}
 {{- range $modelSpec := .Values.servingEngineSpec.modelSpec }}
+{{- if (hasKey $modelSpec "raySpec") }}
 {{- with $ -}}
 apiVersion: ray.io/v1
 kind: RayCluster
@@ -154,10 +155,10 @@ spec:
                 command: ["/bin/bash", "-c", "echo TBD"]
             resources:
               limits:
-                cpu: {{ default "2" .Values.servingEngineSpec.raySpec.headNode.requestCPU }}
-                memory: {{ default "8Gi" .Values.servingEngineSpec.raySpec.headNode.requestMemory }}
-                {{- if hasKey .Values.servingEngineSpec.raySpec.headNode "requestGPU" }}
-                nvidia.com/gpu: {{ .Values.servingEngineSpec.raySpec.headNode.requestGPU }}
+                cpu: {{ default "2" $modelSpec.raySpec.headNode.requestCPU }}
+                memory: {{ default "8Gi" $modelSpec.raySpec.headNode.requestMemory }}
+                {{- if hasKey $modelSpec.raySpec.headNode "requestGPU" }}
+                nvidia.com/gpu: {{ $modelSpec.raySpec.headNode.requestGPU }}
                 {{- end }}
             startupProbe:
               exec:
@@ -192,10 +193,10 @@ spec:
         volumes:
           - name: wait-script
             configMap:
-              name: wait-for-ray-script
+              name: "{{$modelSpec.name}}-wait-for-ray-script"
           - name: vllm-script
             configMap:
-              name: vllm-start-script
+              name: "{{$modelSpec.name}}-vllm-start-script"
           {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumes") }}
           {{- if hasKey $modelSpec "pvcStorage" }}
           - name: {{ .Release.Name }}-storage
@@ -400,10 +401,10 @@ spec:
           volumes:
             - name: wait-script
               configMap:
-                name: wait-for-ray-script
+                name: "{{$modelSpec.name}}-wait-for-ray-script"
             - name: vllm-script
               configMap:
-                name: vllm-start-script
+                name: "{{$modelSpec.name}}-vllm-start-script"
             {{- if or (hasKey $modelSpec "pvcStorage") (and $modelSpec.vllmConfig (hasKey $modelSpec.vllmConfig "tensorParallelSize")) (hasKey $modelSpec "chatTemplate") (hasKey $modelSpec "extraVolumes") }}
             {{- if hasKey $modelSpec "pvcStorage" }}
             - name: {{ .Release.Name }}-storage
@@ -466,7 +467,7 @@ spec:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: wait-for-ray-script
+  name: "{{$modelSpec.name}}-wait-for-ray-script"
 data:
   wait_for_ray.py: |
     import ray
@@ -499,7 +500,7 @@ data:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: vllm-start-script
+  name: "{{$modelSpec.name}}-vllm-start-script"
 data:
   vllm-entrypoint.sh: |
     #!/bin/bash
@@ -618,3 +619,4 @@ data:
 ---
 {{- end }}
 {{- end }}
+{{- end }}
diff --git a/tutorials/00-a-install-multinode-kubernetes-env.md b/tutorials/00-a-install-multinode-kubernetes-env.md
@@ -107,12 +107,9 @@ Before you begin, ensure the following:
    ```
 
 4. **Explanation:**
-   - Downloads, installs and configures v1.32 version of cri-o container runtime for your Kubernetes cluster.
-
-5. **Explanation:**
    This script downloads version v1.32 of [`cri-o`](https://github.com/cri-o/packaging/blob/main/README.md#distributions-using-deb-packages), one of the container runtimes for Kubernetes, which manages pods on your cluster.
 
-6. Repeat steps 1 to 2 on your other bare-metal server, which will serve as a worker node.
+5. Repeat steps 1 to 2 on your other bare-metal server, which will serve as a worker node.
 
 ### Step 3: Setting up a control plane node
 
diff --git a/tutorials/15-basic-pipeline-parallel.md b/tutorials/15-basic-pipeline-parallel.md
@@ -32,13 +32,13 @@ This tutorial provides a step-by-step guide for configuring and deploying the vL
 
 ## Step 2: Preparing the Configuration File
 
-1. Locate the example configuration file [`tutorials/assets/values-15-minimal-pipeline-parallel-example.yaml`](assets/values-15-minimal-pipeline-parallel-example.yaml).
+1. Locate the example configuration file [`tutorials/assets/values-15-a-minimal-pipeline-parallel-example-raycluster.yaml`](assets/values-15-a-minimal-pipeline-parallel-example-raycluster.yaml).
 
 2. Open the file and update the following fields:
 
 - Write your actual huggingface token in `hf_token: <YOUR HF TOKEN>` in the yaml file.
 
-### Explanation of Key Items in `values-15-minimal-pipeline-parallel-example.yaml`
+### Explanation of Key Items in `values-15-a-minimal-pipeline-parallel-example-raycluster.yaml`
 
 - **`raySpec`**: Required when using KubeRay to enable pipeline parallelism.
 - **`headNode`**: Specifies the resource requirements for the Kuberay head node and must be defined accordingly:
@@ -74,11 +74,6 @@ In the following example, we configure a total of two Ray nodes each equipped wi
 ```yaml
 servingEngineSpec:
   runtimeClassName: ""
-  raySpec:
-    headNode:
-      requestCPU: 2
-      requestMemory: "20Gi"
-      requestGPU: 2
   modelSpec:
   - name: "distilgpt2"
     repository: "vllm/vllm-openai"
@@ -97,6 +92,12 @@ servingEngineSpec:
 
     shmSize: "20Gi"
 
+    raySpec:
+      headNode:
+        requestCPU: 2
+        requestMemory: "20Gi"
+        requestGPU: 2
+
     hf_token: <YOUR HF TOKEN>
 ```
 
@@ -307,3 +308,5 @@ TEST SUITE: None
 ## Conclusion
 
 In this tutorial, you configured and deployed the vLLM serving engine with support for pipeline parallelism across multiple GPUs within a multi-node Kubernetes environment using KubeRay. Additionally, you learned how to verify the deployment and monitor the associated pods to ensure proper operation. For further customization and configuration options, please consult the `values.yaml` file and the Helm chart documentation.
+
+To deploy both a Ray cluster and standard Kubernetes deployments using a single Helm release, please refer to the example configuration file available at [`tutorials/assets/values-15-b-minimal-pipeline-parallel-example-multiple-modelspec.yaml`](assets/values-15-b-minimal-pipeline-parallel-example-multiple-modelspec.yaml).
diff --git a/tutorials/assets/values-15-a-minimal-pipeline-parallel-example-raycluster.yaml b/tutorials/assets/values-15-a-minimal-pipeline-parallel-example-raycluster.yaml
@@ -1,24 +1,25 @@
 servingEngineSpec:
   runtimeClassName: ""
-  raySpec:
-    headNode:
-      requestCPU: 2
-      requestMemory: "20Gi"
-      requestGPU: 2
   modelSpec:
-  - name: "distilgpt2"
+  - name: "distilgpt2-raycluster"
     repository: "vllm/vllm-openai"
     tag: "latest"
     modelURL: "distilbert/distilgpt2"
 
     replicaCount: 1
 
-    requestCPU: 2
+    requestCPU: 1
     requestMemory: "20Gi"
-    requestGPU: 2
+    requestGPU: 1
 
     vllmConfig:
-      tensorParallelSize: 2
+      tensorParallelSize: 1
       pipelineParallelSize: 2
 
     shmSize: "20Gi"
+
+    raySpec:
+      headNode:
+        requestCPU: 1
+        requestMemory: "20Gi"
+        requestGPU: 1
diff --git a/tutorials/assets/values-15-b-minimal-pipeline-parallel-example-multiple-modelspec.yaml b/tutorials/assets/values-15-b-minimal-pipeline-parallel-example-multiple-modelspec.yaml
@@ -0,0 +1,40 @@
+servingEngineSpec:
+  runtimeClassName: ""
+  modelSpec:
+  - name: "distilgpt2-raycluster"
+    repository: "vllm/vllm-openai"
+    tag: "latest"
+    modelURL: "distilbert/distilgpt2"
+
+    replicaCount: 1
+
+    requestCPU: 1
+    requestMemory: "20Gi"
+    requestGPU: 1
+
+    vllmConfig:
+      tensorParallelSize: 1
+      pipelineParallelSize: 2
+
+    shmSize: "20Gi"
+
+    raySpec:
+      headNode:
+        requestCPU: 1
+        requestMemory: "20Gi"
+        requestGPU: 1
+  - name: "opt125m-deployment"
+    repository: "vllm/vllm-openai"
+    tag: "latest"
+    modelURL: "facebook/opt-125m"
+
+    replicaCount: 1
+
+    requestCPU: 1
+    requestMemory: "20Gi"
+    requestGPU: 1
+
+    vllmConfig:
+      tensorParallelSize: 1
+
+    shmSize: "20Gi"