Skip to content

Commit 2997db6

Browse files
committed
Implement conductor startup/liveness probes
This makes the following changes:
- The readiness probe is removed because it is only used when the Pod is associated with a Service
- A startup and liveness probe is added which does the following:
  - loads the ironic config
  - loads the DB API
  - looks for the configured host in the currently online hosts
- This script achieves the following in terms of a check:
  - The script can access the database with the configuration
  - The list of online conductors is filtered by updated_at, so if this conductor has not sent its heartbeat within [conductor]heartbeat_timeout then the probe will fail
- The startup probe is run every 2 seconds since conductor will generally start quickly
- The liveness probe is run every 30 seconds

Jira: OSPRH-22659
1 parent d458faa commit 2997db6

File tree

3 files changed

+50
-31
lines changed

3 files changed

+50
-31
lines changed

internal/controller/ironicconductor_controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -946,6 +946,7 @@ func (r *IronicConductorReconciler) generateServiceConfigMaps(
946946
"runlogwatch.sh": "/common/bin/runlogwatch.sh",
947947
"pxe-init.sh": "/common/bin/pxe-init.sh",
948948
"init.sh": "/ironicconductor/bin/init.sh",
949+
"live_check": "/ironicconductor/bin/live_check",
949950
},
950951
Labels: cmLabels,
951952
},

internal/ironicconductor/statefulset.go

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,14 @@ func StatefulSet(
5353
runAsUser := int64(0)
5454

5555
livenessProbe := &corev1.Probe{
56-
TimeoutSeconds: 5,
57-
PeriodSeconds: 30,
58-
InitialDelaySeconds: 5,
56+
TimeoutSeconds: 5,
57+
// [conductor]heartbeat_timeout is set to 120 so make PeriodSeconds
58+
// more frequent to catch an offline conductor earlier
59+
PeriodSeconds: 30,
5960
}
60-
readinessProbe := &corev1.Probe{
61-
TimeoutSeconds: 5,
62-
PeriodSeconds: 30,
63-
InitialDelaySeconds: 5,
61+
startupProbe := &corev1.Probe{
62+
FailureThreshold: 30,
63+
PeriodSeconds: 2,
6464
}
6565
dnsmasqLivenessProbe := &corev1.Probe{
6666
TimeoutSeconds: 10,
@@ -89,24 +89,15 @@ func StatefulSet(
8989
// https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
9090
//
9191

92-
if instance.Spec.RPCTransport == "json-rpc" {
93-
livenessProbe.TCPSocket = &corev1.TCPSocketAction{
94-
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(8089)},
95-
}
96-
readinessProbe.TCPSocket = &corev1.TCPSocketAction{
97-
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(8089)},
98-
}
99-
} else {
100-
livenessProbe.Exec = &corev1.ExecAction{
101-
Command: []string{
102-
"/bin/true",
103-
},
104-
}
105-
readinessProbe.Exec = &corev1.ExecAction{
106-
Command: []string{
107-
"/bin/true",
108-
},
109-
}
92+
livenessProbe.Exec = &corev1.ExecAction{
93+
Command: []string{
94+
"/usr/local/bin/container-scripts/live_check",
95+
},
96+
}
97+
startupProbe.Exec = &corev1.ExecAction{
98+
Command: []string{
99+
"/usr/local/bin/container-scripts/live_check",
100+
},
110101
}
111102

112103
httpbootLivenessProbe.TCPSocket = &corev1.TCPSocketAction{
@@ -188,12 +179,11 @@ func StatefulSet(
188179
SecurityContext: &corev1.SecurityContext{
189180
RunAsUser: &runAsUser,
190181
},
191-
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
192-
VolumeMounts: conductorVolumeMounts,
193-
Resources: instance.Spec.Resources,
194-
ReadinessProbe: readinessProbe,
195-
LivenessProbe: livenessProbe,
196-
// StartupProbe: startupProbe,
182+
Env: env.MergeEnvs([]corev1.EnvVar{}, envVars),
183+
VolumeMounts: conductorVolumeMounts,
184+
Resources: instance.Spec.Resources,
185+
LivenessProbe: livenessProbe,
186+
StartupProbe: startupProbe,
197187
}
198188
httpbootContainer := corev1.Container{
199189
Name: "httpboot",
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/bin/env python3
2+
#
3+
# Copyright 2025 Red Hat Inc.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License"); you may
6+
# not use this file except in compliance with the License. You may obtain
7+
# a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13+
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14+
# License for the specific language governing permissions and limitations
15+
# under the License.
16+
17+
from ironic.common import config
18+
import ironic.conf
19+
CONF = ironic.conf.CONF
20+
21+
config.parse_args([])
22+
from ironic.db.sqlalchemy import api
23+
24+
dbapi = api.get_backend()
25+
if CONF.host in dbapi.get_online_conductors():
26+
print(f'{CONF.host} is online')
27+
else:
28+
raise Exception(f'{CONF.host} is offline')

0 commit comments

Comments
 (0)