diff --git a/operator/config/default/kustomization.yaml b/operator/config/default/kustomization.yaml index b092f90d4..db5a1903b 100644 --- a/operator/config/default/kustomization.yaml +++ b/operator/config/default/kustomization.yaml @@ -1,5 +1,5 @@ # Adds namespace to all resources. -namespace: production-stack-system +namespace: default # Value of this field is prepended to the # names of all resources, e.g. a deployment named @@ -18,6 +18,7 @@ resources: - ../crd - ../rbac - ../manager + - ../storage # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in # crd/kustomization.yaml #- ../webhook diff --git a/operator/config/manager/deployment.yaml b/operator/config/manager/deployment.yaml index 092443619..e1507bf64 100644 --- a/operator/config/manager/deployment.yaml +++ b/operator/config/manager/deployment.yaml @@ -1,7 +1,7 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: production-stack-controller-manager + name: controller-manager namespace: production-stack-system labels: app.kubernetes.io/name: production-stack @@ -86,7 +86,12 @@ spec: requests: cpu: 10m memory: 64Mi - volumeMounts: [] - volumes: [] + volumeMounts: + - name: shared-pvc-storage + mountPath: /data/shared-pvc-storage + volumes: + - name: shared-pvc-storage + persistentVolumeClaim: + claimName: production-stack-shared-pvc-storage-claim serviceAccountName: production-stack-controller-manager terminationGracePeriodSeconds: 10 diff --git a/operator/config/manager/kustomization.yaml b/operator/config/manager/kustomization.yaml index ac10fc9f6..4821ee983 100644 --- a/operator/config/manager/kustomization.yaml +++ b/operator/config/manager/kustomization.yaml @@ -1,3 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +images: +- name: controller + newName: controller + newTag: latest resources: - - namespace.yaml - - deployment.yaml +- deployment.yaml diff --git a/operator/config/rbac/role.yaml b/operator/config/rbac/role.yaml 
index a2f2d5f46..739675fef 100644 --- a/operator/config/rbac/role.yaml +++ b/operator/config/rbac/role.yaml @@ -8,6 +8,8 @@ rules: - "" resources: - configmaps + - persistentvolumeclaims + - persistentvolumes - secrets - serviceaccounts - services @@ -39,6 +41,18 @@ rules: - patch - update - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - production-stack.vllm.ai resources: diff --git a/operator/config/samples/production-stack_v1alpha1_loraadapter.yaml b/operator/config/samples/production-stack_v1alpha1_loraadapter.yaml index 960017ab8..0224d588e 100644 --- a/operator/config/samples/production-stack_v1alpha1_loraadapter.yaml +++ b/operator/config/samples/production-stack_v1alpha1_loraadapter.yaml @@ -6,20 +6,17 @@ metadata: app.kubernetes.io/managed-by: kustomize name: loraadapter-sample spec: - baseModel: "llama3-8b-instr" # Use the model name with your specified model name in engineSpec - # If you want to use vllm api key, uncomment the following section, you can either use secret or directly set the value - # Option 1: Secret reference + baseModel: "llama-3.1-8b-instruct" # Must match the value of the "model" label specified on your VLLMRuntime # vllmApiKey: # secretName: "vllm-api-key" # secretKey: "VLLM_API_KEY" - - # Option 2: Direct value - # vllmApiKey: - # value: "abc123" adapterSource: - type: "local" # (local, huggingface, s3) for now we only support local + type: "huggingface" # (local, huggingface) adapterName: "llama-3.1-nemoguard-8b-topic-control" # This will be the adapter ID - adapterPath: "/data/lora-adapters/llama-3.1-nemoguard-8b-topic-control" # This will be the path to the adapter in the persistent volume + repository: "nvidia/llama-3.1-nemoguard-8b-topic-control" + credentialsSecretRef: + name: "huggingface-credentials" + key: "hf_token" loraAdapterDeploymentConfig: algorithm: "default" # for now we only support default algorithm 
replicas: 1 # if not specified, by default algorithm, the lora adapter will be applied to all llama3-8b models, if specified, the lora adapter will only be applied to the specified number of replicas diff --git a/operator/config/samples/production-stack_v1alpha1_vllmrouter.yaml b/operator/config/samples/production-stack_v1alpha1_vllmrouter.yaml index 80995f0c9..807853f1d 100644 --- a/operator/config/samples/production-stack_v1alpha1_vllmrouter.yaml +++ b/operator/config/samples/production-stack_v1alpha1_vllmrouter.yaml @@ -16,7 +16,7 @@ spec: serviceDiscovery: k8s # Label selector for vLLM runtime pods - k8sLabelSelector: "app=vllmruntime-sample" + k8sLabelSelector: "model=llama-3.1-8b-instruct" # Routing strategy (roundrobin or session) routingLogic: roundrobin diff --git a/operator/config/samples/production-stack_v1alpha1_vllmruntime.yaml b/operator/config/samples/production-stack_v1alpha1_vllmruntime.yaml index b5be819bb..053f79fc8 100644 --- a/operator/config/samples/production-stack_v1alpha1_vllmruntime.yaml +++ b/operator/config/samples/production-stack_v1alpha1_vllmruntime.yaml @@ -4,12 +4,13 @@ metadata: labels: app.kubernetes.io/name: production-stack app.kubernetes.io/managed-by: kustomize + model: "llama-3.1-8b-instruct" name: vllmruntime-sample spec: # Model configuration model: - modelURL: "meta-llama/Llama-3.1-8B" - enableLoRA: false + modelURL: "meta-llama/Llama-3.1-8B-Instruct" + enableLoRA: true enableTool: false toolCallParser: "" maxModelLen: 4096 @@ -60,7 +61,7 @@ spec: pullSecretName: "" # Number of replicas - replicas: 1 + replicas: 2 # Deployment strategy deploymentStrategy: "Recreate" diff --git a/operator/config/storage/kustomization.yaml b/operator/config/storage/kustomization.yaml new file mode 100644 index 000000000..7bfd4518a --- /dev/null +++ b/operator/config/storage/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- pvc.yaml diff --git a/operator/config/storage/pvc.yaml 
b/operator/config/storage/pvc.yaml new file mode 100644 index 000000000..386baee16 --- /dev/null +++ b/operator/config/storage/pvc.yaml @@ -0,0 +1,33 @@ +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: shared-pvc-storage + labels: + type: local + app: production-stack + component: shared-pvc-storage +spec: + storageClassName: "" + capacity: + storage: 100Gi + accessModes: + - ReadWriteMany + hostPath: + path: /data/shared-pvc-storage +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: shared-pvc-storage-claim + namespace: default + labels: + app: production-stack + component: shared-pvc-storage +spec: + storageClassName: "" + accessModes: + - ReadWriteMany + resources: + requests: + storage: 100Gi diff --git a/operator/internal/controller/loraadapter_controller.go b/operator/internal/controller/loraadapter_controller.go index 50a361378..5f6866c27 100644 --- a/operator/internal/controller/loraadapter_controller.go +++ b/operator/internal/controller/loraadapter_controller.go @@ -63,6 +63,8 @@ type LoraAdapterReconciler struct { // +kubebuilder:rbac:groups=production-stack.vllm.ai,resources=loraadapters/finalizers,verbs=update // +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch +// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. 
diff --git a/operator/internal/controller/vllmrouter_controller.go b/operator/internal/controller/vllmrouter_controller.go index d9352f5ce..13c2e4b43 100644 --- a/operator/internal/controller/vllmrouter_controller.go +++ b/operator/internal/controller/vllmrouter_controller.go @@ -194,8 +194,9 @@ func (r *VLLMRouterReconciler) Reconcile(ctx context.Context, req ctrl.Request) // deploymentForVLLMRouter returns a VLLMRouter Deployment object func (r *VLLMRouterReconciler) deploymentForVLLMRouter(router *servingv1alpha1.VLLMRouter) *appsv1.Deployment { - labels := map[string]string{ - "app": router.Name, + labels := map[string]string{"app": router.Name} + for k, v := range router.Labels { + labels[k] = v } // Add user-defined environment variables diff --git a/operator/internal/controller/vllmruntime_controller.go b/operator/internal/controller/vllmruntime_controller.go index 72ca8d672..dfc39f397 100644 --- a/operator/internal/controller/vllmruntime_controller.go +++ b/operator/internal/controller/vllmruntime_controller.go @@ -50,6 +50,8 @@ type VLLMRuntimeReconciler struct { // +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. 
@@ -150,8 +152,9 @@ func (r *VLLMRuntimeReconciler) Reconcile(ctx context.Context, req ctrl.Request) // deploymentForVLLMRuntime returns a VLLMRuntime Deployment object func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *productionstackv1alpha1.VLLMRuntime) *appsv1.Deployment { - labels := map[string]string{ - "app": vllmRuntime.Name, + labels := map[string]string{"app": vllmRuntime.Name} + for k, v := range vllmRuntime.Labels { + labels[k] = v } // Define probes @@ -178,11 +181,11 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production Scheme: corev1.URISchemeHTTP, }, }, - InitialDelaySeconds: 240, - PeriodSeconds: 10, + InitialDelaySeconds: 300, + PeriodSeconds: 20, TimeoutSeconds: 3, SuccessThreshold: 1, - FailureThreshold: 3, + FailureThreshold: 10, } // Build command line arguments @@ -260,6 +263,15 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production }) } + if vllmRuntime.Spec.Model.EnableLoRA { + env = append(env, + corev1.EnvVar{ + Name: "VLLM_ALLOW_RUNTIME_LORA_UPDATING", + Value: "True", + }, + ) + } + // LM Cache configuration if vllmRuntime.Spec.LMCacheConfig.Enabled { env = append(env, @@ -424,6 +436,22 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production Resources: resources, ReadinessProbe: readinessProbe, LivenessProbe: livenessProbe, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "shared-pvc-storage", + MountPath: "/data/shared-pvc-storage", + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "shared-pvc-storage", + VolumeSource: corev1.VolumeSource{ + PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{ + ClaimName: "production-stack-shared-pvc-storage-claim", + }, + }, }, }, },