feat(deployment): add startupProbe for slow model loading (#809)

noalimoy · rootfs · web-flow · commit 970f34987170 · 2025-12-10T11:53:22.000-05:00
Add a startupProbe configuration to handle slow container startup caused by ML model loading (embeddings, classifiers, LoRA). Total startup time can reach ~60 seconds. Without a startupProbe, the liveness/readiness probes start checking at 30s, and failed liveness checks may restart the container before it becomes ready. Changes: - Add startupProbe to Kubernetes deployment manifest - Add startupProbe to Helm chart (values.yaml + template) - Increase memory limit from 6Gi to 7Gi for full model set - Drop the explicit CPU limit from the Kubernetes deployment manifest The startupProbe allows up to 300 seconds (periodSeconds 10 x failureThreshold 30) for startup; liveness/readiness probes begin checking only after it succeeds. Fixes #784 Signed-off-by: noalimoy <nlimoy@redhat.com> Co-authored-by: Huamin Chen <rootfs@users.noreply.github.com>
diff --git a/deploy/helm/semantic-router/templates/deployment.yaml b/deploy/helm/semantic-router/templates/deployment.yaml
@@ -110,6 +110,15 @@ spec:
         - name: models-volume
           mountPath: /app/models
         {{- end }}
+        {{- if .Values.startupProbe.enabled }}
+        # Startup probe for slow-starting containers (model loading takes ~60s with Gemma)
+        startupProbe:
+          tcpSocket:
+            port: {{ .Values.service.grpc.targetPort }}
+          periodSeconds: {{ .Values.startupProbe.periodSeconds }}
+          timeoutSeconds: {{ .Values.startupProbe.timeoutSeconds }}
+          failureThreshold: {{ .Values.startupProbe.failureThreshold }}
+        {{- end }}
         {{- if .Values.livenessProbe.enabled }}
         livenessProbe:
           tcpSocket:
diff --git a/deploy/helm/semantic-router/values.yaml b/deploy/helm/semantic-router/values.yaml
@@ -121,7 +121,7 @@ ingress:
 resources:
   # -- Resource limits
   limits:
-    memory: "6Gi"
+    memory: "7Gi"
     cpu: "2"
   # -- Resource requests
   requests:
@@ -202,6 +202,19 @@ tolerations: []
 # Affinity rules
 affinity: {}
 
+# Startup probe configuration (for slow-starting containers with model loading)
+# This probe runs FIRST and disables liveness/readiness probes until it succeeds
+# Required when using the Gemma embedding model, which takes ~35s to load (total startup can reach ~60s)
+startupProbe:
+  # -- Enable startup probe
+  enabled: true
+  # -- Period seconds
+  periodSeconds: 10
+  # -- Timeout seconds
+  timeoutSeconds: 5
+  # -- Failure threshold (periodSeconds * failureThreshold = 300s max startup time with the defaults)
+  failureThreshold: 30
+
 # Liveness probe configuration
 livenessProbe:
   # -- Enable liveness probe
diff --git a/deploy/kubernetes/ai-gateway/semantic-router/deployment.yaml b/deploy/kubernetes/ai-gateway/semantic-router/deployment.yaml
@@ -137,6 +137,14 @@ spec:
               readOnly: true
             - name: models-volume
               mountPath: /app/models
+          # Startup probe for slow-starting containers (model loading takes ~60s with Gemma)
+          # This probe runs FIRST and disables liveness/readiness probes until it succeeds
+          startupProbe:
+            tcpSocket:
+              port: 50051
+            periodSeconds: 10
+            timeoutSeconds: 5
+            failureThreshold: 30  # 10s * 30 = 300s max startup time
           livenessProbe:
             tcpSocket:
               port: 50051
@@ -152,14 +160,13 @@ spec:
             periodSeconds: 30
             timeoutSeconds: 10
             failureThreshold: 3
-          # Significantly reduced resource requirements for kind cluster
+          # Resource requirements - memory limit raised to 7Gi for Gemma model loading (~7Gi peak); no CPU limit is set
           resources:
             requests:
-              memory: "3Gi"  # Reduced from 8Gi
-              cpu: "1"  # Reduced from 2
+              memory: "3Gi"
+              cpu: "1"
             limits:
-              memory: "6Gi"  # Reduced from 12Gi
-              cpu: "2"  # Reduced from 4
+              memory: "7Gi"
       volumes:
         - name: config-volume
           configMap: