Skip to content
Draft
3 changes: 2 additions & 1 deletion operator/config/default/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Adds namespace to all resources.
namespace: production-stack-system
namespace: default

# Value of this field is prepended to the
# names of all resources, e.g. a deployment named
Expand All @@ -18,6 +18,7 @@ resources:
- ../crd
- ../rbac
- ../manager
- ../storage
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
# crd/kustomization.yaml
#- ../webhook
Expand Down
11 changes: 8 additions & 3 deletions operator/config/manager/deployment.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: production-stack-controller-manager
name: controller-manager
namespace: production-stack-system
labels:
app.kubernetes.io/name: production-stack
Expand Down Expand Up @@ -86,7 +86,12 @@ spec:
requests:
cpu: 10m
memory: 64Mi
volumeMounts: []
volumes: []
volumeMounts:
- name: shared-pvc-storage
mountPath: /data/shared-pvc-storage
volumes:
- name: shared-pvc-storage
persistentVolumeClaim:
claimName: production-stack-shared-pvc-storage-claim
serviceAccountName: production-stack-controller-manager
terminationGracePeriodSeconds: 10
9 changes: 7 additions & 2 deletions operator/config/manager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
images:
- name: controller
newName: controller
newTag: latest
resources:
- namespace.yaml
- deployment.yaml
- deployment.yaml
14 changes: 14 additions & 0 deletions operator/config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ rules:
- ""
resources:
- configmaps
- persistentvolumeclaims
- persistentvolumes
- secrets
- serviceaccounts
- services
Expand Down Expand Up @@ -39,6 +41,18 @@ rules:
- patch
- update
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- production-stack.vllm.ai
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,17 @@ metadata:
app.kubernetes.io/managed-by: kustomize
name: loraadapter-sample
spec:
baseModel: "llama3-8b-instr" # Use the model name with your specified model name in engineSpec
# If you want to use vllm api key, uncomment the following section, you can either use secret or directly set the value
# Option 1: Secret reference
baseModel: "llama-3.1-8b-instruct" # Use the model name given by the "model" label on your VLLMRuntime
# vllmApiKey:
# secretName: "vllm-api-key"
# secretKey: "VLLM_API_KEY"

# Option 2: Direct value
# vllmApiKey:
# value: "abc123"
adapterSource:
type: "local" # (local, huggingface, s3) for now we only support local
type: "huggingface" # (local, huggingface)
adapterName: "llama-3.1-nemoguard-8b-topic-control" # This will be the adapter ID
adapterPath: "/data/lora-adapters/llama-3.1-nemoguard-8b-topic-control" # This will be the path to the adapter in the persistent volume
repository: "nvidia/llama-3.1-nemoguard-8b-topic-control"
credentialsSecretRef:
name: "huggingface-credentials"
key: "hf_token"
loraAdapterDeploymentConfig:
algorithm: "default" # Only the "default" algorithm is supported for now
replicas: 1 # If not specified, the default algorithm applies the LoRA adapter to all llama3-8b models; if specified, the adapter is applied only to the specified number of replicas
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
serviceDiscovery: k8s

# Label selector for vLLM runtime pods
k8sLabelSelector: "app=vllmruntime-sample"
k8sLabelSelector: "model=llama-3.1-8b-instruct"

# Routing strategy (roundrobin or session)
routingLogic: roundrobin
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ metadata:
labels:
app.kubernetes.io/name: production-stack
app.kubernetes.io/managed-by: kustomize
model: "llama-3.1-8b-instruct"
name: vllmruntime-sample
spec:
# Model configuration
model:
modelURL: "meta-llama/Llama-3.1-8B"
enableLoRA: false
modelURL: "meta-llama/Llama-3.1-8B-Instruct"
enableLoRA: true
enableTool: false
toolCallParser: ""
maxModelLen: 4096
Expand Down Expand Up @@ -60,7 +61,7 @@ spec:
pullSecretName: ""

# Number of replicas
replicas: 1
replicas: 2

# Deployment strategy
deploymentStrategy: "Recreate"
4 changes: 4 additions & 0 deletions operator/config/storage/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- pvc.yaml
33 changes: 33 additions & 0 deletions operator/config/storage/pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
apiVersion: v1
kind: PersistentVolume
metadata:
name: shared-pvc-storage
labels:
type: local
app: production-stack
component: shared-pvc-storage
spec:
storageClassName: ""
capacity:
storage: 100Gi
accessModes:
- ReadWriteMany
hostPath:
path: /data/shared-pvc-storage
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: shared-pvc-storage-claim
namespace: default
labels:
app: production-stack
component: shared-pvc-storage
spec:
storageClassName: ""
accessModes:
- ReadWriteMany
resources:
requests:
storage: 100Gi
2 changes: 2 additions & 0 deletions operator/internal/controller/loraadapter_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ type LoraAdapterReconciler struct {
// +kubebuilder:rbac:groups=production-stack.vllm.ai,resources=loraadapters/finalizers,verbs=update
// +kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch
// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=networking.k8s.io,resources=ingresses,verbs=get;list;watch;create;update;patch;delete

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
Expand Down
5 changes: 3 additions & 2 deletions operator/internal/controller/vllmrouter_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,9 @@ func (r *VLLMRouterReconciler) Reconcile(ctx context.Context, req ctrl.Request)

// deploymentForVLLMRouter returns a VLLMRouter Deployment object
func (r *VLLMRouterReconciler) deploymentForVLLMRouter(router *servingv1alpha1.VLLMRouter) *appsv1.Deployment {
labels := map[string]string{
"app": router.Name,
labels := map[string]string{"app": router.Name}
for k, v := range router.Labels {
labels[k] = v
}

// Add user-defined environment variables
Expand Down
38 changes: 33 additions & 5 deletions operator/internal/controller/vllmruntime_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ type VLLMRuntimeReconciler struct {
// +kubebuilder:rbac:groups=core,resources=configmaps,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=services,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=core,resources=persistentvolumes,verbs=get;list;watch;create;update;patch;delete

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
Expand Down Expand Up @@ -150,8 +152,9 @@ func (r *VLLMRuntimeReconciler) Reconcile(ctx context.Context, req ctrl.Request)

// deploymentForVLLMRuntime returns a VLLMRuntime Deployment object
func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *productionstackv1alpha1.VLLMRuntime) *appsv1.Deployment {
labels := map[string]string{
"app": vllmRuntime.Name,
labels := map[string]string{"app": vllmRuntime.Name}
for k, v := range vllmRuntime.Labels {
labels[k] = v
}

// Define probes
Expand All @@ -178,11 +181,11 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production
Scheme: corev1.URISchemeHTTP,
},
},
InitialDelaySeconds: 240,
PeriodSeconds: 10,
InitialDelaySeconds: 300,
PeriodSeconds: 20,
TimeoutSeconds: 3,
SuccessThreshold: 1,
FailureThreshold: 3,
FailureThreshold: 10,
}

// Build command line arguments
Expand Down Expand Up @@ -260,6 +263,15 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production
})
}

if vllmRuntime.Spec.Model.EnableLoRA {
env = append(env,
corev1.EnvVar{
Name: "VLLM_ALLOW_RUNTIME_LORA_UPDATING",
Value: "True",
},
)
}

// LM Cache configuration
if vllmRuntime.Spec.LMCacheConfig.Enabled {
env = append(env,
Expand Down Expand Up @@ -424,6 +436,22 @@ func (r *VLLMRuntimeReconciler) deploymentForVLLMRuntime(vllmRuntime *production
Resources: resources,
ReadinessProbe: readinessProbe,
LivenessProbe: livenessProbe,
VolumeMounts: []corev1.VolumeMount{
{
Name: "shared-pvc-storage",
MountPath: "/data/shared-pvc-storage",
},
},
},
},
Volumes: []corev1.Volume{
{
Name: "shared-pvc-storage",
VolumeSource: corev1.VolumeSource{
PersistentVolumeClaim: &corev1.PersistentVolumeClaimVolumeSource{
ClaimName: "production-stack-shared-pvc-storage-claim",
},
},
},
},
},
Expand Down
Loading