@@ -15,6 +15,7 @@ import (
1515	"time" 
1616
1717	"github.com/bombsimon/logrusr/v2" 
18+ 	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1" 
1819	"github.com/spf13/cobra" 
1920	corev1 "k8s.io/api/core/v1" 
2021	"k8s.io/apimachinery/pkg/api/errors" 
@@ -31,7 +32,9 @@ import (
3132	"sigs.k8s.io/controller-runtime/pkg/cache" 
3233	"sigs.k8s.io/controller-runtime/pkg/client" 
3334	"sigs.k8s.io/controller-runtime/pkg/controller" 
35+ 	"sigs.k8s.io/controller-runtime/pkg/event" 
3436	"sigs.k8s.io/controller-runtime/pkg/healthz" 
37+ 	"sigs.k8s.io/controller-runtime/pkg/manager" 
3538	"sigs.k8s.io/controller-runtime/pkg/metrics" 
3639	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" 
3740	"sigs.k8s.io/controller-runtime/pkg/predicate" 
@@ -78,16 +81,16 @@ var runCmd = &cobra.Command{
7881			LeaderElectionID : "node-labeler.gitpod.io" ,
7982		})
8083		if  err  !=  nil  {
81- 			log .WithError (err ).Fatal ("unable to start node-labeber " )
84+ 			log .WithError (err ).Fatal ("unable to start node-labeler " )
8285		}
8386
84- 		client , err  :=  client .New (ctrl .GetConfigOrDie (), client.Options {})
87+ 		kClient , err  :=  client .New (ctrl .GetConfigOrDie (), client.Options {})
8588		if  err  !=  nil  {
8689			log .WithError (err ).Fatal ("unable to create client" )
8790		}
8891
8992		r  :=  & PodReconciler {
90- 			client ,
93+ 			kClient ,
9194		}
9295
9396		componentPredicate , err  :=  predicate .LabelSelectorPredicate (metav1.LabelSelector {
@@ -110,6 +113,36 @@ var runCmd = &cobra.Command{
110113			log .WithError (err ).Fatal ("unable to bind controller watch event handler" )
111114		}
112115
116+ 		if  err  :=  mgr .GetFieldIndexer ().IndexField (context .Background (), & workspacev1.Workspace {}, "status.runtime.nodeName" , func (o  client.Object ) []string  {
117+ 			ws  :=  o .(* workspacev1.Workspace )
118+ 			if  ws .Status .Runtime  ==  nil  {
119+ 				return  nil 
120+ 			}
121+ 			return  []string {ws .Status .Runtime .NodeName }
122+ 		}); err  !=  nil  {
123+ 			log .WithError (err ).Fatal ("unable to create workspace indexer" )
124+ 			return 
125+ 		}
126+ 
127+ 		nsac , err  :=  NewNodeScaledownAnnotationController (mgr .GetClient ())
128+ 		if  err  !=  nil  {
129+ 			log .WithError (err ).Fatal ("unable to create node scaledown annotation controller" )
130+ 		}
131+ 		err  =  nsac .SetupWithManager (mgr )
132+ 		if  err  !=  nil  {
133+ 			log .WithError (err ).Fatal ("unable to bind node scaledown annotation controller" )
134+ 		}
135+ 
136+ 		err  =  mgr .Add (manager .RunnableFunc (func (ctx  context.Context ) error  {
137+ 			<- ctx .Done ()
138+ 			log .Info ("Received shutdown signal - stopping NodeScaledownAnnotationController" )
139+ 			nsac .Stop ()
140+ 			return  nil 
141+ 		}))
142+ 		if  err  !=  nil  {
143+ 			log .WithError (err ).Fatal ("couldn't properly clean up node scaledown annotation controller" )
144+ 		}
145+ 
113146		metrics .Registry .MustRegister (NodeLabelerCounterVec )
114147		metrics .Registry .MustRegister (NodeLabelerTimeHistVec )
115148
@@ -123,10 +156,10 @@ var runCmd = &cobra.Command{
123156			log .WithError (err ).Fatal ("unable to set up ready check" )
124157		}
125158
126- 		log .Info ("starting node-labeber " )
159+ 		log .Info ("starting node-labeler " )
127160		err  =  mgr .Start (ctrl .SetupSignalHandler ())
128161		if  err  !=  nil  {
129- 			log .WithError (err ).Fatal ("problem running node-labeber " )
162+ 			log .WithError (err ).Fatal ("problem running node-labeler " )
130163		}
131164
132165		log .Info ("Received SIGINT - shutting down" )
@@ -135,6 +168,8 @@ var runCmd = &cobra.Command{
135168
136169func  init () {
137170	utilruntime .Must (clientgoscheme .AddToScheme (scheme ))
171+ 	utilruntime .Must (workspacev1 .AddToScheme (scheme ))
172+ 
138173	rootCmd .AddCommand (runCmd )
139174}
140175
@@ -249,6 +284,207 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
249284	return  reconcile.Result {}, nil 
250285}
251286
287+ type  NodeScaledownAnnotationController  struct  {
288+ 	client.Client 
289+ 	nodesToReconcile  chan  string 
290+ 	stopChan          chan  struct {}
291+ }
292+ 
293+ func  NewNodeScaledownAnnotationController (client  client.Client ) (* NodeScaledownAnnotationController , error ) {
294+ 	controller  :=  & NodeScaledownAnnotationController {
295+ 		Client :           client ,
296+ 		nodesToReconcile : make (chan  string , 1000 ),
297+ 		stopChan :         make (chan  struct {}),
298+ 	}
299+ 
300+ 	return  controller , nil 
301+ }
302+ 
303+ func  (c  * NodeScaledownAnnotationController ) SetupWithManager (mgr  ctrl.Manager ) error  {
304+ 	go  c .reconciliationWorker ()
305+ 	go  c .periodicReconciliation ()
306+ 
307+ 	return  ctrl .NewControllerManagedBy (mgr ).
308+ 		Named ("node-scaledown-annotation-controller" ).
309+ 		For (& workspacev1.Workspace {}).
310+ 		WithEventFilter (c .workspaceFilter ()).
311+ 		Complete (c )
312+ }
313+ 
314+ // periodicReconciliation periodically reconciles all nodes in the cluster 
315+ func  (c  * NodeScaledownAnnotationController ) periodicReconciliation () {
316+ 	ticker  :=  time .NewTicker (5  *  time .Minute )
317+ 	defer  ticker .Stop ()
318+ 
319+ 	for  {
320+ 		select  {
321+ 		case  <- ticker .C :
322+ 			log .Info ("starting periodic full reconciliation" )
323+ 			ctx  :=  context .Background ()
324+ 			if  _ , err  :=  c .reconcileAllNodes (ctx ); err  !=  nil  {
325+ 				log .WithError (err ).Error ("periodic reconciliation failed" )
326+ 			}
327+ 		case  <- c .stopChan :
328+ 			log .Info ("stopping periodic full reconciliation" )
329+ 			return 
330+ 		}
331+ 	}
332+ }
333+ 
334+ // reconciliationWorker consumes nodesToReconcile and reconciles each node 
335+ func  (c  * NodeScaledownAnnotationController ) reconciliationWorker () {
336+ 	log .Info ("reconciliation worker started" )
337+ 	for  {
338+ 		select  {
339+ 		case  nodeName  :=  <- c .nodesToReconcile :
340+ 			ctx  :=  context .Background ()
341+ 			if  err  :=  c .reconcileNode (ctx , nodeName ); err  !=  nil  {
342+ 				log .WithError (err ).WithField ("node" , nodeName ).Error ("failed to reconcile node from queue" )
343+ 			}
344+ 		case  <- c .stopChan :
345+ 			log .Info ("reconciliation worker stopping" )
346+ 			return 
347+ 		}
348+ 	}
349+ }
350+ 
351+ func  (c  * NodeScaledownAnnotationController ) workspaceFilter () predicate.Predicate  {
352+ 	return  predicate.Funcs {
353+ 		CreateFunc : func (e  event.CreateEvent ) bool  {
354+ 			ws  :=  e .Object .(* workspacev1.Workspace )
355+ 			if  ws .Status .Runtime  ==  nil  {
356+ 				log .WithField ("workspace" , ws .Name ).Info ("workspace not ready yet" )
357+ 				return  false 
358+ 			}
359+ 
360+ 			return  ws .Status .Runtime  !=  nil  &&  ws .Status .Runtime .NodeName  !=  "" 
361+ 		},
362+ 		UpdateFunc : func (e  event.UpdateEvent ) bool  {
363+ 			wsOld  :=  e .ObjectOld .(* workspacev1.Workspace )
364+ 			ws  :=  e .ObjectNew .(* workspacev1.Workspace )
365+ 			// if we haven't seen runtime info before and now it's there, let's reconcile. 
366+ 			// similarly, if the node name changed, we need to reconcile the old node as well. 
367+ 			if  (wsOld .Status .Runtime  ==  nil  &&  ws .Status .Runtime  !=  nil  &&  ws .Status .Runtime .NodeName  !=  "" ) ||  // we just got runtime info 
368+ 				(wsOld .Status .Runtime  !=  nil  &&  ws .Status .Runtime  !=  nil  &&  wsOld .Status .Runtime .NodeName  !=  ws .Status .Runtime .NodeName ) { // node name changed 
369+ 				if  wsOld .Status .Runtime  !=  nil  &&  wsOld .Status .Runtime .NodeName  !=  ""  {
370+ 					c .queueNodeForReconciliation (wsOld .Status .Runtime .NodeName )
371+ 				}
372+ 				return  true 
373+ 			}
374+ 
375+ 			return  false 
376+ 		},
377+ 		DeleteFunc : func (e  event.DeleteEvent ) bool  {
378+ 			ws  :=  e .Object .(* workspacev1.Workspace )
379+ 			if  ws .Status .Runtime  !=  nil  &&  ws .Status .Runtime .NodeName  !=  ""  {
380+ 				c .queueNodeForReconciliation (ws .Status .Runtime .NodeName )
381+ 				return  true 
382+ 			}
383+ 			return  false 
384+ 		},
385+ 	}
386+ }
387+ 
388+ func  (c  * NodeScaledownAnnotationController ) queueNodeForReconciliation (nodeName  string ) {
389+ 	select  {
390+ 	case  c .nodesToReconcile  <-  nodeName :
391+ 		log .WithField ("node" , nodeName ).Info ("queued node for reconciliation" )
392+ 	default :
393+ 		log .WithField ("node" , nodeName ).Warn ("reconciliation queue full" )
394+ 	}
395+ }
396+ 
397+ func  (c  * NodeScaledownAnnotationController ) Reconcile (ctx  context.Context , req  ctrl.Request ) (ctrl.Result , error ) {
398+ 	log .WithField ("request" , req .NamespacedName .String ()).Info ("WorkspaceCountController reconciling" )
399+ 
400+ 	var  ws  workspacev1.Workspace 
401+ 	if  err  :=  c .Get (ctx , req .NamespacedName , & ws ); err  !=  nil  {
402+ 		if  ! errors .IsNotFound (err ) {
403+ 			log .WithError (err ).WithField ("workspace" , req .NamespacedName ).Error ("unable to fetch Workspace" )
404+ 			return  ctrl.Result {}, err 
405+ 		}
406+ 		return  ctrl.Result {}, nil 
407+ 	}
408+ 
409+ 	if  ws .Status .Runtime  !=  nil  &&  ws .Status .Runtime .NodeName  !=  ""  {
410+ 		c .queueNodeForReconciliation (ws .Status .Runtime .NodeName )
411+ 	}
412+ 
413+ 	log .WithField ("runtime" , ws .Status .Runtime ).Warn ("reconciling object with no Runtime/NodeName, which wasn't filtered out by workspaceFilter" )
414+ 	return  ctrl.Result {}, nil 
415+ }
416+ 
417+ // Cleanup method to be called when shutting down the controller 
418+ func  (wc  * NodeScaledownAnnotationController ) Stop () {
419+ 	close (wc .stopChan )
420+ }
421+ 
422+ func  (c  * NodeScaledownAnnotationController ) reconcileAllNodes (ctx  context.Context ) (ctrl.Result , error ) {
423+ 	var  nodes  corev1.NodeList 
424+ 	if  err  :=  c .List (ctx , & nodes ); err  !=  nil  {
425+ 		log .WithError (err ).Error ("failed to list nodes" )
426+ 		return  ctrl.Result {}, err 
427+ 	}
428+ 
429+ 	for  _ , node  :=  range  nodes .Items  {
430+ 		c .queueNodeForReconciliation (node .Name )
431+ 	}
432+ 
433+ 	return  ctrl.Result {}, nil 
434+ }
435+ 
436+ func  (c  * NodeScaledownAnnotationController ) reconcileNode (ctx  context.Context , nodeName  string ) error  {
437+ 	var  workspaceList  workspacev1.WorkspaceList 
438+ 	if  err  :=  c .List (ctx , & workspaceList , client.MatchingFields {
439+ 		"status.runtime.nodeName" : nodeName ,
440+ 	}); err  !=  nil  {
441+ 		return  fmt .Errorf ("failed to list workspaces: %w" , err )
442+ 	}
443+ 
444+ 	log .WithField ("node" , nodeName ).WithField ("count" , len (workspaceList .Items )).Info ("acting on workspaces" )
445+ 	count  :=  len (workspaceList .Items )
446+ 
447+ 	return  c .updateNodeAnnotation (ctx , nodeName , count )
448+ }
449+ 
450+ func  (c  * NodeScaledownAnnotationController ) updateNodeAnnotation (ctx  context.Context , nodeName  string , count  int ) error  {
451+ 	return  retry .RetryOnConflict (retry .DefaultBackoff , func () error  {
452+ 		ctx , cancel  :=  context .WithTimeout (ctx , 5 * time .Second )
453+ 		defer  cancel ()
454+ 
455+ 		var  node  corev1.Node 
456+ 		err  :=  c .Get (ctx , types.NamespacedName {Name : nodeName }, & node )
457+ 		if  err  !=  nil  {
458+ 			return  fmt .Errorf ("obtaining node %s: %w" , nodeName , err )
459+ 		}
460+ 
461+ 		shouldDisableScaleDown  :=  count  >  0 
462+ 		currentlyDisabled  :=  false 
463+ 		if  val , exists  :=  node .Annotations ["cluster-autoscaler.kubernetes.io/scale-down-disabled" ]; exists  {
464+ 			currentlyDisabled  =  val  ==  "true" 
465+ 		}
466+ 
467+ 		// Only update if the state needs to change 
468+ 		if  shouldDisableScaleDown  !=  currentlyDisabled  {
469+ 			if  node .Annotations  ==  nil  {
470+ 				node .Annotations  =  make (map [string ]string )
471+ 			}
472+ 
473+ 			if  shouldDisableScaleDown  {
474+ 				node .Annotations ["cluster-autoscaler.kubernetes.io/scale-down-disabled" ] =  "true" 
475+ 				log .WithField ("nodeName" , nodeName ).Info ("disabling scale-down for node" )
476+ 			} else  {
477+ 				delete (node .Annotations , "cluster-autoscaler.kubernetes.io/scale-down-disabled" )
478+ 				log .WithField ("nodeName" , nodeName ).Info ("enabling scale-down for node" )
479+ 			}
480+ 
481+ 			return  c .Update (ctx , & node )
482+ 		}
483+ 
484+ 		return  nil 
485+ 	})
486+ }
487+ 
252488func  updateLabel (label  string , add  bool , nodeName  string , client  client.Client ) error  {
253489	return  retry .RetryOnConflict (retry .DefaultBackoff , func () error  {
254490		ctx , cancel  :=  context .WithTimeout (context .Background (), 5 * time .Second )
0 commit comments