Skip to content

Commit 89cf2c2

Browse files
authored
Merge pull request #1315 from nebius/release-1.21.10/fix-wait-controller
Remove controller service DNS check from worker wait logic to support failover scenarios
2 parents d9a82b9 + 2b2ca7c commit 89cf2c2

File tree

5 files changed: 8 additions and 45 deletions.

images/worker/wait-for-controller.sh

Lines changed: 2 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,13 @@
11
#!/bin/bash
22

33
echo "Waiting for Slurm controller to be ready..."
4-
controller_service="${CONTROLLER_SERVICE}"
5-
controller_port="${CONTROLLER_PORT:-6817}" # Default to 6817 if not set
64
max_attempts=60
75
attempt=0
86

97
# Create symlink to slurm configs (same as worker entrypoint)
108
echo "Creating symlink to slurm configs..."
119
rm -rf /etc/slurm && ln -s /mnt/jail/etc/slurm /etc/slurm
1210

13-
# Wait for controller service to be resolvable via DNS
14-
echo "Checking controller service DNS resolution..."
15-
attempt=0
16-
while [ $attempt -lt $max_attempts ]; do
17-
if timeout 1 bash -c "</dev/tcp/$controller_service/$controller_port" >/dev/null 2>&1; then
18-
echo "Controller service is reachable on port $controller_port"
19-
break
20-
fi
21-
echo "Attempt $((attempt + 1))/$max_attempts: Waiting for controller service TCP port $controller_port..."
22-
attempt=$((attempt + 1))
23-
sleep 5
24-
done
25-
26-
if ! timeout 1 bash -c "</dev/tcp/$controller_service/$controller_port" >/dev/null 2>&1; then
27-
echo "ERROR: Controller service TCP port $controller_port not reachable after $max_attempts attempts"
28-
exit 1
29-
fi
30-
3111
# Now try to ping the controller using scontrol
3212
echo "Checking slurmctld readiness..."
3313
attempt=0
@@ -38,10 +18,10 @@ while [ $attempt -lt $max_attempts ]; do
3818
echo "Running: scontrol ping"
3919
if scontrol_output=$(scontrol ping 2>&1); then
4020
echo "Controller is ready!"
41-
echo "scontrol ping output: $scontrol_output"
21+
echo -e "scontrol ping output:\n$scontrol_output"
4222
exit 0
4323
else
44-
echo "scontrol ping failed with output: $scontrol_output"
24+
echo -e "scontrol ping failed with output:\n$scontrol_output"
4525
fi
4626

4727
attempt=$((attempt + 1))

internal/controller/clustercontroller/worker.go

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -234,7 +234,6 @@ func (r SlurmClusterReconciler) ReconcileWorkers(
234234
&clusterValues.NodeWorker,
235235
clusterValues.SlurmTopologyConfigMapRefName,
236236
clusterValues.WorkerFeatures,
237-
clusterValues.NodeController.ContainerSlurmctld.Port,
238237
)
239238
if err != nil {
240239
stepLogger.Error(err, "Failed to render")

internal/render/worker/container.go

Lines changed: 2 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -47,24 +47,15 @@ func renderContainerToolkitValidation(container *values.Container) corev1.Contai
4747
}
4848

4949
// RenderContainerWaitForController renders init [corev1.Container] that waits for controller readiness
50-
func RenderContainerWaitForController(container *values.Container, controllerPort int32) corev1.Container {
50+
func RenderContainerWaitForController(container *values.Container) corev1.Container {
5151
return corev1.Container{
5252
Name: consts.ContainerNameWaitForController,
5353
Image: container.Image,
5454
ImagePullPolicy: container.ImagePullPolicy,
5555
Command: []string{
5656
"/opt/bin/slurm/wait-for-controller.sh",
5757
},
58-
Env: []corev1.EnvVar{
59-
{
60-
Name: "CONTROLLER_SERVICE",
61-
Value: fmt.Sprintf("%s-%d", consts.ComponentTypeController, 0),
62-
},
63-
{
64-
Name: "CONTROLLER_PORT",
65-
Value: strconv.FormatInt(int64(controllerPort), 10),
66-
},
67-
},
58+
Env: []corev1.EnvVar{},
6859
VolumeMounts: []corev1.VolumeMount{
6960
common.RenderVolumeMountJail(),
7061
common.RenderVolumeMountMungeSocket(),

internal/render/worker/statefulset.go

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,6 @@ func RenderStatefulSet(
3030
worker *values.SlurmWorker,
3131
slurmTopologyConfigMapRefName string,
3232
workerFeatures []slurmv1.WorkerFeature,
33-
controllerPort int32,
3433
) (kruisev1b1.StatefulSet, error) {
3534
labels := common.RenderLabels(consts.ComponentTypeWorker, clusterName)
3635
matchLabels := common.RenderMatchLabels(consts.ComponentTypeWorker, clusterName)
@@ -53,7 +52,7 @@ func RenderStatefulSet(
5352
common.RenderContainerMunge(&worker.ContainerMunge),
5453
}
5554
if worker.WaitForController != nil && *worker.WaitForController {
56-
initContainers = append(initContainers, RenderContainerWaitForController(&worker.ContainerSlurmd, controllerPort))
55+
initContainers = append(initContainers, RenderContainerWaitForController(&worker.ContainerSlurmd))
5756
}
5857
if clusterType == consts.ClusterTypeGPU {
5958
initContainers = append(initContainers, renderContainerToolkitValidation(&worker.ContainerToolkitValidation))

internal/render/worker/statefulset_test.go

Lines changed: 3 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,6 @@ func Test_RenderStatefulSet(t *testing.T) {
5252
},
5353
},
5454
}
55-
controllerPort := int32(6817)
5655

5756
createWorker := func() *values.SlurmWorker {
5857
return &values.SlurmWorker{
@@ -131,7 +130,7 @@ func Test_RenderStatefulSet(t *testing.T) {
131130
for _, tt := range tests {
132131
t.Run(tt.name, func(t *testing.T) {
133132
result, err := worker.RenderStatefulSet(
134-
testNamespace, testCluster, tt.clusterType, nodeFilter, tt.secrets, volumeSource, tt.worker, testTopologyConfig, nil, controllerPort,
133+
testNamespace, testCluster, tt.clusterType, nodeFilter, tt.secrets, volumeSource, tt.worker, testTopologyConfig, nil,
135134
)
136135
assert.NoError(t, err)
137136

@@ -169,25 +168,20 @@ func Test_RenderStatefulSet(t *testing.T) {
169168
}
170169

171170
func Test_RenderContainerWaitForController(t *testing.T) {
172-
controllerPort := int32(6817)
173171
container := &values.Container{
174172
NodeContainer: slurmv1.NodeContainer{
175173
Image: "test-image",
176174
ImagePullPolicy: corev1.PullIfNotPresent,
177175
},
178176
}
179177

180-
result := worker.RenderContainerWaitForController(container, controllerPort)
178+
result := worker.RenderContainerWaitForController(container)
181179

182180
assert.Equal(t, consts.ContainerNameWaitForController, result.Name)
183181
assert.Equal(t, container.Image, result.Image)
184182
assert.Equal(t, container.ImagePullPolicy, result.ImagePullPolicy)
185183
assert.Equal(t, []string{"/opt/bin/slurm/wait-for-controller.sh"}, result.Command)
186-
assert.Equal(t, 2, len(result.Env))
187-
assert.Equal(t, "CONTROLLER_SERVICE", result.Env[0].Name)
188-
assert.Equal(t, "controller-0", result.Env[0].Value)
189-
assert.Equal(t, "CONTROLLER_PORT", result.Env[1].Name)
190-
assert.Equal(t, "6817", result.Env[1].Value)
184+
assert.Equal(t, 0, len(result.Env))
191185
assert.Equal(t, 2, len(result.VolumeMounts))
192186

193187
// Verify exact volume mount values and no unexpected mounts

0 commit comments

Comments
 (0)