Skip to content

Commit 4ab8e4d

Browse files
authored
Merge pull request #548 from nebius/fix-autohealing
fix autohealing
2 parents 1e847c4 + 4995fc2 commit 4ab8e4d

File tree

2 files changed

+20
-7
lines changed

2 files changed

+20
-7
lines changed

internal/soperatorchecks/k8s_nodes_controller.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,7 @@ func (c *K8SNodesController) Reconcile(ctx context.Context, req ctrl.Request) (c
111111
}
112112

113113
func (c *K8SNodesController) processDrainCondition(ctx context.Context, k8sNode *corev1.Node) error {
114-
logger := log.FromContext(ctx).WithName("processDrainCondition")
114+
logger := log.FromContext(ctx).WithName("K8SNodesController.processDrainCondition")
115115
logger.Info("processing drain condition")
116116

117117
var (
@@ -160,7 +160,7 @@ func (c *K8SNodesController) processDrainCondition(ctx context.Context, k8sNode
160160
}
161161

162162
func (c *K8SNodesController) processRebootCondition(ctx context.Context, k8sNode *corev1.Node) error {
163-
logger := log.FromContext(ctx).WithName("processRebootCondition")
163+
logger := log.FromContext(ctx).WithName("K8SNodesController.processRebootCondition")
164164
logger.Info("processing reboot condition")
165165

166166
var (

internal/soperatorchecks/slurm_nodes_controller.go

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"errors"
66
"fmt"
7+
"regexp"
78
"strings"
89
"time"
910

@@ -29,6 +30,8 @@ import (
2930

3031
var (
3132
SlurmNodesControllerName = "soperatorchecks.slurmnodes"
33+
34+
workerPodNameRegex = regexp.MustCompile(`^worker-\d+$`)
3235
)
3336

3437
type SlurmNodesController struct {
@@ -179,6 +182,7 @@ func (c *SlurmNodesController) processKillTaskFailed(
179182
slurmClusterName types.NamespacedName,
180183
slurmNode slurmapi.Node,
181184
) error {
185+
logger := log.FromContext(ctx).WithName("SlurmNodesController.processKillTaskFailed")
182186

183187
drainWithCondition := func() error {
184188
if err := c.drainSlurmNodesWithConditionUpdate(
@@ -208,17 +212,26 @@ func (c *SlurmNodesController) processKillTaskFailed(
208212

209213
if degradedCondition == (corev1.NodeCondition{}) {
210214
// No degraded condition found
215+
logger.V(1).Info("draining because no degraded condition found")
211216
return drainWithCondition()
212217
}
213218

214219
if degradedCondition.Status == corev1.ConditionTrue {
215220
// Node is still rebooting, skip
221+
logger.V(1).Info("skip, still rebooting")
216222
return nil
217223
}
224+
225+
logger = logger.WithValues(
226+
"reasonChangedAt", slurmNode.Reason.ChangedAt.String(),
227+
"conditionTransitionTime", degradedCondition.LastTransitionTime.Time.String(),
228+
)
218229
if slurmNode.Reason.ChangedAt.Before(degradedCondition.LastTransitionTime.Time) {
230+
logger.V(1).Info("undraining, slurm node drained before degraded condition changed")
219231
return c.undrainSlurmNode(ctx, slurmClusterName, slurmNode.Name)
220232
}
221233

234+
logger.V(1).Info("draining, slurm node drained after degraded condition changed")
222235
return drainWithCondition()
223236
}
224237

@@ -348,7 +361,7 @@ func (c *SlurmNodesController) drainSlurmNodes(
348361

349362
var errs []error
350363
for _, pod := range podList.Items {
351-
if _, err := fmt.Sscanf("worker-%d", pod.Name); err == nil {
364+
if workerPodNameRegex.MatchString(pod.Name) {
352365
slurmClusterName := types.NamespacedName{
353366
Namespace: pod.Namespace,
354367
Name: pod.Labels[consts.LabelInstanceKey],
@@ -369,7 +382,7 @@ func (c *SlurmNodesController) drainSlurmNode(
369382
slurmClusterName types.NamespacedName,
370383
slurmNodeName, reason string,
371384
) error {
372-
logger := log.FromContext(ctx).WithName("drainSlurmNode").
385+
logger := log.FromContext(ctx).WithName("SlurmNodesController.drainSlurmNode").
373386
WithValues(
374387
"slurmNodeName", slurmNodeName,
375388
"drainReason", reason,
@@ -403,7 +416,7 @@ func (c *SlurmNodesController) slurmNodesFullyDrained(
403416
ctx context.Context,
404417
k8sNodeName string,
405418
) (bool, error) {
406-
logger := log.FromContext(ctx).WithName("slurmNodesFullyDrained")
419+
logger := log.FromContext(ctx).WithName("SlurmNodesController.slurmNodesFullyDrained")
407420

408421
logger.Info("checking that slurm nodes are fully drained")
409422
podList := &corev1.PodList{}
@@ -412,7 +425,7 @@ func (c *SlurmNodesController) slurmNodesFullyDrained(
412425
}
413426

414427
for _, pod := range podList.Items {
415-
if _, err := fmt.Sscanf("worker-%d", pod.Name); err == nil {
428+
if workerPodNameRegex.MatchString(pod.Name) {
416429
logger = logger.WithValues("slurmNode", pod.Name, "instanceKey", pod.Labels[consts.LabelInstanceKey])
417430
logger.Info("found slurm node")
418431

@@ -443,7 +456,7 @@ func (c *SlurmNodesController) undrainSlurmNode(
443456
slurmClusterName types.NamespacedName,
444457
slurmNodeName string,
445458
) error {
446-
logger := log.FromContext(ctx).WithName("undrainSlurmNode").V(1).
459+
logger := log.FromContext(ctx).WithName("SlurmNodesController.undrainSlurmNode").V(1).
447460
WithValues(
448461
"slurmNodeName", slurmNodeName,
449462
"slurmCluster", slurmClusterName,

0 commit comments

Comments
 (0)