vllm-project · zerofishnoodles · Jun 19, 2025 · Jun 26, 2025 · Jun 20, 2025 · Jun 21, 2025
diff --git a/operator/config/default/kustomization.yaml b/operator/config/default/kustomization.yaml
@@ -1,5 +1,5 @@
 # Adds namespace to all resources.
-namespace: production-stack-system
+namespace: default
 
 # Value of this field is prepended to the
 # names of all resources, e.g. a deployment named
@@ -18,6 +18,7 @@ resources:
   - ../crd
   - ../rbac
   - ../manager
+  - ../storage
   # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
   # crd/kustomization.yaml
   #- ../webhook

diff --git a/operator/config/manager/deployment.yaml b/operator/config/manager/deployment.yaml
@@ -1,7 +1,7 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: production-stack-controller-manager
+  name: controller-manager
   namespace: production-stack-system
   labels:
     app.kubernetes.io/name: production-stack
@@ -86,7 +86,12 @@ spec:
             requests:
               cpu: 10m
               memory: 64Mi
-          volumeMounts: []
-      volumes: []
+          volumeMounts:
+            - name: shared-pvc-storage
+              mountPath: /data/shared-pvc-storage
+      volumes:
+        - name: shared-pvc-storage
+          persistentVolumeClaim:
+            claimName: production-stack-shared-pvc-storage-claim
       serviceAccountName: production-stack-controller-manager
       terminationGracePeriodSeconds: 10
diff --git a/operator/config/manager/kustomization.yaml b/operator/config/manager/kustomization.yaml
@@ -1,3 +1,8 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+images:
+- name: controller
+  newName: controller
+  newTag: latest
 resources:
-  - namespace.yaml
-  - deployment.yaml
+- deployment.yaml
diff --git a/operator/config/rbac/role.yaml b/operator/config/rbac/role.yaml
@@ -8,6 +8,8 @@ rules:
   - ""
   resources:
   - configmaps
+  - persistentvolumeclaims
+  - persistentvolumes
   - secrets
   - serviceaccounts
   - services
@@ -39,6 +41,18 @@ rules:
   - patch
   - update
   - watch
+- apiGroups:
+  - networking.k8s.io
+  resources:
+  - ingresses
+  verbs:
+  - create
+  - delete
+  - get
+  - list
+  - patch
+  - update
+  - watch
 - apiGroups:
   - production-stack.vllm.ai
   resources:

diff --git a/operator/config/samples/production-stack_v1alpha1_loraadapter.yaml b/operator/config/samples/production-stack_v1alpha1_loraadapter.yaml
@@ -6,20 +6,17 @@ metadata:
     app.kubernetes.io/managed-by: kustomize
   name: loraadapter-sample
 spec:
-  baseModel: "llama3-8b-instr" # Use the model name with your specified model name in engineSpec
-  # If you want to use vllm api key, uncomment the following section, you can either use secret or directly set the value
-  # Option 1: Secret reference
+  baseModel: "llama-3.1-8b-instruct" # Must match the "model" label set on the target VLLMRuntime
   # vllmApiKey:
   #   secretName: "vllm-api-key"
   #   secretKey: "VLLM_API_KEY"
-
-  # Option 2: Direct value
-  # vllmApiKey:
-  #   value: "abc123"
   adapterSource:
-    type: "local" # (local, huggingface, s3) for now we only support local
+    type: "huggingface" # Supported source types: local, huggingface
     adapterName: "llama-3.1-nemoguard-8b-topic-control" # This will be the adapter ID
-    adapterPath: "/data/lora-adapters/llama-3.1-nemoguard-8b-topic-control" # This will be the path to the adapter in the persistent volume
+    repository: "nvidia/llama-3.1-nemoguard-8b-topic-control"
+    credentialsSecretRef:
+      name: "huggingface-credentials"
+      key: "hf_token"
   loraAdapterDeploymentConfig:
     algorithm: "default" # for now we only support default algorithm
     replicas: 1 # if not specified, by default algorithm, the lora adapter will be applied to all llama3-8b models, if specified, the lora adapter will only be applied to the specified number of replicas
diff --git a/operator/config/samples/production-stack_v1alpha1_vllmrouter.yaml b/operator/config/samples/production-stack_v1alpha1_vllmrouter.yaml
@@ -16,7 +16,7 @@ spec:
   serviceDiscovery: k8s
 
   # Label selector for vLLM runtime pods
-  k8sLabelSelector: "app=vllmruntime-sample"
+  k8sLabelSelector: "model=llama-3.1-8b-instruct"
 
   # Routing strategy (roundrobin or session)
   routingLogic: roundrobin

diff --git a/operator/config/samples/production-stack_v1alpha1_vllmruntime.yaml b/operator/config/samples/production-stack_v1alpha1_vllmruntime.yaml
@@ -4,12 +4,13 @@ metadata:
   labels:
     app.kubernetes.io/name: production-stack
     app.kubernetes.io/managed-by: kustomize
+    model: "llama-3.1-8b-instruct"
   name: vllmruntime-sample
 spec:
   # Model configuration
   model:
-    modelURL: "meta-llama/Llama-3.1-8B"
-    enableLoRA: false
+    modelURL: "meta-llama/Llama-3.1-8B-Instruct"
+    enableLoRA: true
     enableTool: false
     toolCallParser: ""
     maxModelLen: 4096
@@ -60,7 +61,7 @@ spec:
       pullSecretName: ""
 
     # Number of replicas
-    replicas: 1
+    replicas: 2
 
     # Deployment strategy
     deploymentStrategy: "Recreate"
diff --git a/operator/config/storage/kustomization.yaml b/operator/config/storage/kustomization.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+- pvc.yaml
diff --git a/operator/config/storage/pvc.yaml b/operator/config/storage/pvc.yaml
@@ -0,0 +1,33 @@
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: shared-pvc-storage
+  labels:
+    type: local
+    app: production-stack
+    component: shared-pvc-storage
+spec:
+  storageClassName: ""
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  hostPath:
+    path: /data/shared-pvc-storage
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: shared-pvc-storage-claim
+  namespace: default
+  labels:
+    app: production-stack
+    component: shared-pvc-storage
+spec:
+  storageClassName: ""
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 100Gi
diff --git a/operator/internal/controller/loraadapter_controller.go b/operator/internal/controller/loraadapter_controller.go
@@ -63,6 +63,8 @@ type LoraAdapterReconciler struct {
 // +kubebuilder:rbac:groups=production-stack.vllm.ai,resources=loraadapters/finalizers,verbs=update
 // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
 // +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch
+// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses,verbs=get;list;watch;create;update;patch;delete
 
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.

diff --git a/operator/internal/controller/vllmrouter_controller.go b/operator/internal/controller/vllmrouter_controller.go
@@ -194,8 +194,9 @@ func (r *VLLMRouterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 
 // deploymentForVLLMRouter returns a VLLMRouter Deployment object
 func (r *VLLMRouterReconciler) deploymentForVLLMRouter(router *servingv1alpha1.VLLMRouter) *appsv1.Deployment {
-	labels := map[string]string{
-		"app": router.Name,
+	labels := map[string]string{"app": router.Name}
+	for k, v := range router.Labels {
+		labels[k] = v
 	}
 
 	// Add user-defined environment variables

diff --git a/operator/internal/controller/vllmruntime_controller.go b/operator/internal/controller/vllmruntime_controller.go
@@ -50,6 +50,8 @@ type VLLMRuntimeReconciler struct {
 // +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=get;list;watch;create;update;patch;delete
 
 // Reconcile is part of the main kubernetes reconciliation loop which aims to
 // move the current state of the cluster closer to the desired state.
@@ -150,8 +152,9 @@ func (r *VLLMRuntimeReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 
 // deploymentForVLLMRuntime returns a VLLMRuntime Deployment object
 func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *productionstackv1alpha1.VLLMRuntime) *appsv1.Deployment {
-	labels := map[string]string{
-		"app": vllmRuntime.Name,
+	labels := map[string]string{"app": vllmRuntime.Name}
+	for k, v := range vllmRuntime.Labels {
+		labels[k] = v
 	}
 
 	// Define probes
@@ -178,11 +181,11 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production
 				Scheme: corev1.URISchemeHTTP,
 			},
 		},
-		InitialDelaySeconds: 240,
-		PeriodSeconds:       10,
+		InitialDelaySeconds: 300,
+		PeriodSeconds:       20,
 		TimeoutSeconds:      3,
 		SuccessThreshold:    1,
-		FailureThreshold:    3,
+		FailureThreshold:    10,
 	}
 
 	// Build command line arguments
@@ -260,6 +263,15 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production
 		})
 	}
 
+	if vllmRuntime.Spec.Model.EnableLoRA {
+		env = append(env,
+			corev1.EnvVar{
+				Name:  "VLLM_ALLOW_RUNTIME_LORA_UPDATING",
+				Value: "True",
+			},
+		)
+	}
+
 	// LM Cache configuration
 	if vllmRuntime.Spec.LMCacheConfig.Enabled {
 		env = append(env,
@@ -424,6 +436,22 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production
 							Resources:      resources,
 							ReadinessProbe: readinessProbe,
 							LivenessProbe:  livenessProbe,
+							VolumeMounts: []corev1.VolumeMount{
+								{
+									Name:      "shared-pvc-storage",
+									MountPath: "/data/shared-pvc-storage",
+								},
+							},
+						},
+					},
+					Volumes: []corev1.Volume{
+						{
+							Name: "shared-pvc-storage",
+							VolumeSource: corev1.VolumeSource{
+								PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
+									ClaimName: "production-stack-shared-pvc-storage-claim",
+								},
+							},
 						},
 					},
 				},