Skip to content

Commit 970f349

Browse files
noalimoy and rootfs authored
feat(deployment): add startupProbe for slow model loading (#809)
Add startupProbe configuration to handle slow container startup caused by ML model loading (embeddings, classifiers, LoRA). Total startup time can reach ~60 seconds. Without startupProbe, liveness/readiness probes start checking at 30s and may kill the pod before it becomes ready.

Changes:
- Add startupProbe to Kubernetes deployment manifest
- Add startupProbe to Helm chart (values.yaml + template)
- Increase memory limit to 7Gi for full model set

The startupProbe allows up to 300 seconds for startup before liveness/readiness probes begin checking.

Fixes #784

Signed-off-by: noalimoy <[email protected]>
Co-authored-by: Huamin Chen <[email protected]>
1 parent 249812f commit 970f349

File tree

3 files changed

+35
-6
lines changed

3 files changed

+35
-6
lines changed

deploy/helm/semantic-router/templates/deployment.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,15 @@ spec:
110110
- name: models-volume
111111
mountPath: /app/models
112112
{{- end }}
113+
{{- if .Values.startupProbe.enabled }}
114+
# Startup probe for slow-starting containers (model loading takes ~60s with Gemma)
115+
startupProbe:
116+
tcpSocket:
117+
port: {{ .Values.service.grpc.targetPort }}
118+
periodSeconds: {{ .Values.startupProbe.periodSeconds }}
119+
timeoutSeconds: {{ .Values.startupProbe.timeoutSeconds }}
120+
failureThreshold: {{ .Values.startupProbe.failureThreshold }}
121+
{{- end }}
113122
{{- if .Values.livenessProbe.enabled }}
114123
livenessProbe:
115124
tcpSocket:

deploy/helm/semantic-router/values.yaml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ ingress:
121121
resources:
122122
# -- Resource limits
123123
limits:
124-
memory: "6Gi"
124+
memory: "7Gi"
125125
cpu: "2"
126126
# -- Resource requests
127127
requests:
@@ -202,6 +202,19 @@ tolerations: []
202202
# Affinity rules
203203
affinity: {}
204204

205+
# Startup probe configuration (for slow-starting containers with model loading)
206+
# This probe runs FIRST and disables liveness/readiness probes until it succeeds
207+
# Required when using the Gemma embedding model, which alone takes ~35s to load (total startup can reach ~60s)
208+
startupProbe:
209+
# -- Enable startup probe
210+
enabled: true
211+
# -- Period seconds
212+
periodSeconds: 10
213+
# -- Timeout seconds
214+
timeoutSeconds: 5
215+
# -- Failure threshold
216+
failureThreshold: 30
217+
205218
# Liveness probe configuration
206219
livenessProbe:
207220
# -- Enable liveness probe

deploy/kubernetes/ai-gateway/semantic-router/deployment.yaml

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,14 @@ spec:
137137
readOnly: true
138138
- name: models-volume
139139
mountPath: /app/models
140+
# Startup probe for slow-starting containers (model loading takes ~60s with Gemma)
141+
# This probe runs FIRST and disables liveness/readiness probes until it succeeds
142+
startupProbe:
143+
tcpSocket:
144+
port: 50051
145+
periodSeconds: 10
146+
timeoutSeconds: 5
147+
failureThreshold: 30 # 10s * 30 = 300s max startup time
140148
livenessProbe:
141149
tcpSocket:
142150
port: 50051
@@ -152,14 +160,13 @@ spec:
152160
periodSeconds: 30
153161
timeoutSeconds: 10
154162
failureThreshold: 3
155-
# Significantly reduced resource requirements for kind cluster
163+
# Resource requirements - increased memory for Gemma model loading (~7Gi peak)
156164
resources:
157165
requests:
158-
memory: "3Gi" # Reduced from 8Gi
159-
cpu: "1" # Reduced from 2
166+
memory: "3Gi"
167+
cpu: "1"
160168
limits:
161-
memory: "6Gi" # Reduced from 12Gi
162-
cpu: "2" # Reduced from 4
169+
memory: "7Gi"
163170
volumes:
164171
- name: config-volume
165172
configMap:

0 commit comments

Comments
 (0)