Added Ray-Serve Config For LLMs (#3517) (#3767)

kevin85421 · Blaze-DSP · DPatel_7 · web-flow · commit 983927dd0341 · 2025-06-13T12:43:15.000+08:00
Co-authored-by: Blaze-DSP &lt;111348803+Blaze-DSP@users.noreply.github.com&gt;
Co-authored-by: DPatel_7 &lt;dpatel@gocommotion.com&gt;
Co-authored-by: Seiji Eicher &lt;seiji@anyscale.com&gt;
diff --git a/ray-operator/config/samples/ray-service.llm-serve.yaml b/ray-operator/config/samples/ray-service.llm-serve.yaml
@@ -0,0 +1,98 @@
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: ray-serve-llm
+spec:
+  serveConfigV2: |
+    applications:
+    - name: llms
+      import_path: ray.serve.llm:build_openai_app
+      route_prefix: "/"
+      args:
+        llm_configs:
+        - model_loading_config:
+            model_id: qwen2.5-7b-instruct
+            model_source: Qwen/Qwen2.5-7B-Instruct
+          engine_kwargs:
+            dtype: bfloat16
+            max_model_len: 1024
+            device: auto
+            gpu_memory_utilization: 0.75
+          deployment_config:
+            autoscaling_config:
+              min_replicas: 1
+              max_replicas: 4
+              target_ongoing_requests: 64
+            max_ongoing_requests: 128
+  rayClusterConfig:
+    rayVersion: "2.46.0"
+    headGroupSpec:
+      rayStartParams:
+        num-cpus: "0"
+        num-gpus: "0"
+      template:
+        spec:
+          containers:
+          - name: ray-head
+            image: rayproject/ray-llm:2.46.0-py311-cu124
+            ports:
+            - containerPort: 8000
+              name: serve
+              protocol: TCP
+            - containerPort: 8080
+              name: metrics
+              protocol: TCP
+            - containerPort: 6379
+              name: gcs
+              protocol: TCP
+            - containerPort: 8265
+              name: dashboard
+              protocol: TCP
+            - containerPort: 10001
+              name: client
+              protocol: TCP
+            resources:
+              limits:
+                cpu: 2
+                memory: 4Gi
+              requests:
+                cpu: 2
+                memory: 4Gi
+    workerGroupSpecs:
+    - replicas: 1
+      minReplicas: 1
+      maxReplicas: 1
+      numOfHosts: 1
+      groupName: gpu-group
+      rayStartParams:
+        num-gpus: "4"
+      template:
+        spec:
+          containers:
+          - name: ray-worker
+            image: rayproject/ray-llm:2.46.0-py311-cu124
+            env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token
+                  key: hf_token
+            resources:
+              limits:
+                cpu: 32
+                memory: 32Gi
+                nvidia.com/gpu: "4"
+              requests:
+                cpu: 32
+                memory: 32Gi
+                nvidia.com/gpu: "4"
+
+---
+
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token
+type: Opaque
+stringData:
+  hf_token: <your-hf-access-token-value>