Skip to content

Commit 6cbb5df

Browse files
authored
Use longer exec probe timeouts for Head pods (#2353)
Signed-off-by: Andrew Sy Kim <[email protected]>
1 parent fb7a486 commit 6cbb5df

File tree

3 files changed

+39
-10
lines changed

3 files changed

+39
-10
lines changed

ray-operator/controllers/ray/common/pod.go

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -264,9 +264,14 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
264264
}
265265

266266
if rayContainer.LivenessProbe == nil {
267+
probeTimeout := utils.DefaultLivenessProbeTimeoutSeconds
268+
if rayNodeType == rayv1.HeadNode {
269+
probeTimeout = utils.DefaultHeadLivenessProbeTimeoutSeconds
270+
}
271+
267272
rayContainer.LivenessProbe = &corev1.Probe{
268273
InitialDelaySeconds: utils.DefaultLivenessProbeInitialDelaySeconds,
269-
TimeoutSeconds: utils.DefaultLivenessProbeTimeoutSeconds,
274+
TimeoutSeconds: int32(probeTimeout),
270275
PeriodSeconds: utils.DefaultLivenessProbePeriodSeconds,
271276
SuccessThreshold: utils.DefaultLivenessProbeSuccessThreshold,
272277
FailureThreshold: utils.DefaultLivenessProbeFailureThreshold,
@@ -275,9 +280,13 @@ func initLivenessAndReadinessProbe(rayContainer *corev1.Container, rayNodeType r
275280
}
276281

277282
if rayContainer.ReadinessProbe == nil {
283+
probeTimeout := utils.DefaultReadinessProbeTimeoutSeconds
284+
if rayNodeType == rayv1.HeadNode {
285+
probeTimeout = utils.DefaultHeadReadinessProbeTimeoutSeconds
286+
}
278287
rayContainer.ReadinessProbe = &corev1.Probe{
279288
InitialDelaySeconds: utils.DefaultReadinessProbeInitialDelaySeconds,
280-
TimeoutSeconds: utils.DefaultReadinessProbeTimeoutSeconds,
289+
TimeoutSeconds: int32(probeTimeout),
281290
PeriodSeconds: utils.DefaultReadinessProbePeriodSeconds,
282291
SuccessThreshold: utils.DefaultReadinessProbeSuccessThreshold,
283292
FailureThreshold: utils.DefaultReadinessProbeFailureThreshold,

ray-operator/controllers/ray/common/pod_test.go

Lines changed: 17 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1128,7 +1128,7 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
11281128
assert.Nil(t, rayContainer.LivenessProbe.Exec)
11291129
assert.Nil(t, rayContainer.ReadinessProbe.Exec)
11301130

1131-
// Test 2: User does not define a custom probe. KubeRay will inject Exec probe.
1131+
// Test 2: User does not define a custom probe. KubeRay will inject Exec probe for worker pod.
11321132
// Here we test the case where the Ray Pod originates from RayServiceCRD,
11331133
// implying that an additional serve health check will be added to the readiness probe.
11341134
rayContainer.LivenessProbe = nil
@@ -1138,4 +1138,20 @@ func TestInitLivenessAndReadinessProbe(t *testing.T) {
11381138
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
11391139
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
11401140
assert.True(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
1141+
assert.Equal(t, int32(2), rayContainer.LivenessProbe.TimeoutSeconds)
1142+
assert.Equal(t, int32(2), rayContainer.ReadinessProbe.TimeoutSeconds)
1143+
1144+
// Test 3: User does not define a custom probe. KubeRay will inject Exec probe for head pod.
1145+
// Here we test the case where the Ray Pod originates from RayServiceCRD,
1146+
// implying that an additional serve health check will be added to the readiness probe.
1147+
rayContainer.LivenessProbe = nil
1148+
rayContainer.ReadinessProbe = nil
1149+
initLivenessAndReadinessProbe(rayContainer, rayv1.HeadNode, utils.RayServiceCRD)
1150+
assert.NotNil(t, rayContainer.LivenessProbe.Exec)
1151+
assert.NotNil(t, rayContainer.ReadinessProbe.Exec)
1152+
// head pod should not have Ray Serve proxy health probes
1153+
assert.False(t, strings.Contains(strings.Join(rayContainer.LivenessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
1154+
assert.False(t, strings.Contains(strings.Join(rayContainer.ReadinessProbe.Exec.Command, " "), utils.RayServeProxyHealthPath))
1155+
assert.Equal(t, int32(5), rayContainer.LivenessProbe.TimeoutSeconds)
1156+
assert.Equal(t, int32(5), rayContainer.ReadinessProbe.TimeoutSeconds)
11411157
}

ray-operator/controllers/ray/utils/constant.go

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -150,17 +150,21 @@ const (
150150
// Ray FT default readiness probe values
151151
DefaultReadinessProbeInitialDelaySeconds = 10
152152
DefaultReadinessProbeTimeoutSeconds = 2
153-
DefaultReadinessProbePeriodSeconds = 5
154-
DefaultReadinessProbeSuccessThreshold = 1
155-
DefaultReadinessProbeFailureThreshold = 10
156-
ServeReadinessProbeFailureThreshold = 1
153+
// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
154+
DefaultHeadReadinessProbeTimeoutSeconds = 5
155+
DefaultReadinessProbePeriodSeconds = 5
156+
DefaultReadinessProbeSuccessThreshold = 1
157+
DefaultReadinessProbeFailureThreshold = 10
158+
ServeReadinessProbeFailureThreshold = 1
157159

158160
// Ray FT default liveness probe values
159161
DefaultLivenessProbeInitialDelaySeconds = 30
160162
DefaultLivenessProbeTimeoutSeconds = 2
161-
DefaultLivenessProbePeriodSeconds = 5
162-
DefaultLivenessProbeSuccessThreshold = 1
163-
DefaultLivenessProbeFailureThreshold = 120
163+
// Probe timeout for Head pod needs to be longer as it queries two endpoints (api/local_raylet_healthz & api/gcs_healthz)
164+
DefaultHeadLivenessProbeTimeoutSeconds = 5
165+
DefaultLivenessProbePeriodSeconds = 5
166+
DefaultLivenessProbeSuccessThreshold = 1
167+
DefaultLivenessProbeFailureThreshold = 120
164168

165169
// Ray health check related configurations
166170
// Note: Since the Raylet process and the dashboard agent process are fate-sharing,

0 commit comments

Comments
 (0)