44 "context"
55 "errors"
66 "fmt"
7+ "regexp"
78 "strings"
89 "time"
910
@@ -29,6 +30,8 @@ import (
2930
// Package-level values used by the SlurmNodesController code in this file.
3031var (
// SlurmNodesControllerName holds the string identifier "soperatorchecks.slurmnodes".
// NOTE(review): presumably the name this controller is registered under; the
// usage site is not visible in this excerpt, confirm against the caller.
3132 SlurmNodesControllerName = "soperatorchecks.slurmnodes"
33+
// workerPodNameRegex matches a name that is exactly worker-<digits>: the literal
// prefix "worker-" followed by one or more digits, anchored at both ends.
// It is compiled once at package scope (MustCompile panics on an invalid pattern)
// and is applied to pod.Name in drainSlurmNodes and slurmNodesFullyDrained to
// select the pods treated as slurm worker nodes.
34+ workerPodNameRegex = regexp .MustCompile (`^worker-\d+$` )
3235)
3336
3437type SlurmNodesController struct {
@@ -179,6 +182,7 @@ func (c *SlurmNodesController) processKillTaskFailed(
179182 slurmClusterName types.NamespacedName ,
180183 slurmNode slurmapi.Node ,
181184) error {
185+ logger := log .FromContext (ctx ).WithName ("SlurmNodesController.processKillTaskFailed" )
182186
183187 drainWithCondition := func () error {
184188 if err := c .drainSlurmNodesWithConditionUpdate (
@@ -208,17 +212,26 @@ func (c *SlurmNodesController) processKillTaskFailed(
208212
209213 if degradedCondition == (corev1.NodeCondition {}) {
210214 // No degraded condition found
215+ logger .V (1 ).Info ("draining because no degraded condition found" )
211216 return drainWithCondition ()
212217 }
213218
214219 if degradedCondition .Status == corev1 .ConditionTrue {
215220 // Node is still rebooting, skip
221+ logger .V (1 ).Info ("skip, still rebooting" )
216222 return nil
217223 }
224+
225+ logger = logger .WithValues (
226+ "reasonChangedAt" , slurmNode .Reason .ChangedAt .String (),
227+ "conditionTransitionTime" , degradedCondition .LastTransitionTime .Time .String (),
228+ )
218229 if slurmNode .Reason .ChangedAt .Before (degradedCondition .LastTransitionTime .Time ) {
230+ logger .V (1 ).Info ("undraining, slurm node drained before degraded condition changed" )
219231 return c .undrainSlurmNode (ctx , slurmClusterName , slurmNode .Name )
220232 }
221233
234+ logger .V (1 ).Info ("draining, slurm node drained after degraded condition changed" )
222235 return drainWithCondition ()
223236}
224237
@@ -348,7 +361,7 @@ func (c *SlurmNodesController) drainSlurmNodes(
348361
349362 var errs []error
350363 for _ , pod := range podList .Items {
351- if _ , err := fmt . Sscanf ( "worker-%d" , pod .Name ); err == nil {
364+ if workerPodNameRegex . MatchString ( pod .Name ) {
352365 slurmClusterName := types.NamespacedName {
353366 Namespace : pod .Namespace ,
354367 Name : pod .Labels [consts .LabelInstanceKey ],
@@ -369,7 +382,7 @@ func (c *SlurmNodesController) drainSlurmNode(
369382 slurmClusterName types.NamespacedName ,
370383 slurmNodeName , reason string ,
371384) error {
372- logger := log .FromContext (ctx ).WithName ("drainSlurmNode" ).
385+ logger := log .FromContext (ctx ).WithName ("SlurmNodesController. drainSlurmNode" ).
373386 WithValues (
374387 "slurmNodeName" , slurmNodeName ,
375388 "drainReason" , reason ,
@@ -403,7 +416,7 @@ func (c *SlurmNodesController) slurmNodesFullyDrained(
403416 ctx context.Context ,
404417 k8sNodeName string ,
405418) (bool , error ) {
406- logger := log .FromContext (ctx ).WithName ("slurmNodesFullyDrained" )
419+ logger := log .FromContext (ctx ).WithName ("SlurmNodesController. slurmNodesFullyDrained" )
407420
408421 logger .Info ("checking that slurm nodes are fully drained" )
409422 podList := & corev1.PodList {}
@@ -412,7 +425,7 @@ func (c *SlurmNodesController) slurmNodesFullyDrained(
412425 }
413426
414427 for _ , pod := range podList .Items {
415- if _ , err := fmt . Sscanf ( "worker-%d" , pod .Name ); err == nil {
428+ if workerPodNameRegex . MatchString ( pod .Name ) {
416429 logger = logger .WithValues ("slurmNode" , pod .Name , "instanceKey" , pod .Labels [consts .LabelInstanceKey ])
417430 logger .Info ("found slurm node" )
418431
@@ -443,7 +456,7 @@ func (c *SlurmNodesController) undrainSlurmNode(
443456 slurmClusterName types.NamespacedName ,
444457 slurmNodeName string ,
445458) error {
446- logger := log .FromContext (ctx ).WithName ("undrainSlurmNode" ).V (1 ).
459+ logger := log .FromContext (ctx ).WithName ("SlurmNodesController. undrainSlurmNode" ).V (1 ).
447460 WithValues (
448461 "slurmNodeName" , slurmNodeName ,
449462 "slurmCluster" , slurmClusterName ,
0 commit comments