@@ -15,6 +15,7 @@ import (
1515 "time"
1616
1717 "github.com/bombsimon/logrusr/v2"
18+ workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
1819 "github.com/spf13/cobra"
1920 corev1 "k8s.io/api/core/v1"
2021 "k8s.io/apimachinery/pkg/api/errors"
@@ -31,7 +32,9 @@ import (
3132 "sigs.k8s.io/controller-runtime/pkg/cache"
3233 "sigs.k8s.io/controller-runtime/pkg/client"
3334 "sigs.k8s.io/controller-runtime/pkg/controller"
35+ "sigs.k8s.io/controller-runtime/pkg/event"
3436 "sigs.k8s.io/controller-runtime/pkg/healthz"
37+ "sigs.k8s.io/controller-runtime/pkg/manager"
3538 "sigs.k8s.io/controller-runtime/pkg/metrics"
3639 metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
3740 "sigs.k8s.io/controller-runtime/pkg/predicate"
@@ -78,16 +81,16 @@ var runCmd = &cobra.Command{
7881 LeaderElectionID : "node-labeler.gitpod.io" ,
7982 })
8083 if err != nil {
81- log .WithError (err ).Fatal ("unable to start node-labeber " )
84+ log .WithError (err ).Fatal ("unable to start node-labeler " )
8285 }
8386
84- client , err := client .New (ctrl .GetConfigOrDie (), client.Options {})
87+ kClient , err := client .New (ctrl .GetConfigOrDie (), client.Options {})
8588 if err != nil {
8689 log .WithError (err ).Fatal ("unable to create client" )
8790 }
8891
8992 r := & PodReconciler {
90- client ,
93+ kClient ,
9194 }
9295
9396 componentPredicate , err := predicate .LabelSelectorPredicate (metav1.LabelSelector {
@@ -110,6 +113,36 @@ var runCmd = &cobra.Command{
110113 log .WithError (err ).Fatal ("unable to bind controller watch event handler" )
111114 }
112115
116+ if err := mgr .GetFieldIndexer ().IndexField (context .Background (), & workspacev1.Workspace {}, "status.runtime.nodeName" , func (o client.Object ) []string {
117+ ws := o .(* workspacev1.Workspace )
118+ if ws .Status .Runtime == nil {
119+ return nil
120+ }
121+ return []string {ws .Status .Runtime .NodeName }
122+ }); err != nil {
123+ log .WithError (err ).Fatal ("unable to create workspace indexer" )
124+ return
125+ }
126+
127+ nsac , err := NewNodeScaledownAnnotationController (mgr .GetClient ())
128+ if err != nil {
129+ log .WithError (err ).Fatal ("unable to create node scaledown annotation controller" )
130+ }
131+ err = nsac .SetupWithManager (mgr )
132+ if err != nil {
133+ log .WithError (err ).Fatal ("unable to bind node scaledown annotation controller" )
134+ }
135+
136+ err = mgr .Add (manager .RunnableFunc (func (ctx context.Context ) error {
137+ <- ctx .Done ()
138+ log .Info ("Received shutdown signal - stopping NodeScaledownAnnotationController" )
139+ nsac .Stop ()
140+ return nil
141+ }))
142+ if err != nil {
143+ log .WithError (err ).Fatal ("couldn't properly clean up node scaledown annotation controller" )
144+ }
145+
113146 metrics .Registry .MustRegister (NodeLabelerCounterVec )
114147 metrics .Registry .MustRegister (NodeLabelerTimeHistVec )
115148
@@ -123,10 +156,10 @@ var runCmd = &cobra.Command{
123156 log .WithError (err ).Fatal ("unable to set up ready check" )
124157 }
125158
126- log .Info ("starting node-labeber " )
159+ log .Info ("starting node-labeler " )
127160 err = mgr .Start (ctrl .SetupSignalHandler ())
128161 if err != nil {
129- log .WithError (err ).Fatal ("problem running node-labeber " )
162+ log .WithError (err ).Fatal ("problem running node-labeler " )
130163 }
131164
132165 log .Info ("Received SIGINT - shutting down" )
@@ -135,6 +168,8 @@ var runCmd = &cobra.Command{
135168
136169func init () {
137170 utilruntime .Must (clientgoscheme .AddToScheme (scheme ))
171+ utilruntime .Must (workspacev1 .AddToScheme (scheme ))
172+
138173 rootCmd .AddCommand (runCmd )
139174}
140175
@@ -249,6 +284,207 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
249284 return reconcile.Result {}, nil
250285}
251286
287+ type NodeScaledownAnnotationController struct {
288+ client.Client
289+ nodesToReconcile chan string
290+ stopChan chan struct {}
291+ }
292+
293+ func NewNodeScaledownAnnotationController (client client.Client ) (* NodeScaledownAnnotationController , error ) {
294+ controller := & NodeScaledownAnnotationController {
295+ Client : client ,
296+ nodesToReconcile : make (chan string , 1000 ),
297+ stopChan : make (chan struct {}),
298+ }
299+
300+ return controller , nil
301+ }
302+
303+ func (c * NodeScaledownAnnotationController ) SetupWithManager (mgr ctrl.Manager ) error {
304+ go c .reconciliationWorker ()
305+ go c .periodicReconciliation ()
306+
307+ return ctrl .NewControllerManagedBy (mgr ).
308+ Named ("node-scaledown-annotation-controller" ).
309+ For (& workspacev1.Workspace {}).
310+ WithEventFilter (c .workspaceFilter ()).
311+ Complete (c )
312+ }
313+
314+ // periodicReconciliation periodically reconciles all nodes in the cluster
315+ func (c * NodeScaledownAnnotationController ) periodicReconciliation () {
316+ ticker := time .NewTicker (5 * time .Minute )
317+ defer ticker .Stop ()
318+
319+ for {
320+ select {
321+ case <- ticker .C :
322+ log .Info ("starting periodic full reconciliation" )
323+ ctx := context .Background ()
324+ if _ , err := c .reconcileAllNodes (ctx ); err != nil {
325+ log .WithError (err ).Error ("periodic reconciliation failed" )
326+ }
327+ case <- c .stopChan :
328+ log .Info ("stopping periodic full reconciliation" )
329+ return
330+ }
331+ }
332+ }
333+
334+ // reconciliationWorker consumes nodesToReconcile and reconciles each node
335+ func (c * NodeScaledownAnnotationController ) reconciliationWorker () {
336+ log .Info ("reconciliation worker started" )
337+ for {
338+ select {
339+ case nodeName := <- c .nodesToReconcile :
340+ ctx := context .Background ()
341+ if err := c .reconcileNode (ctx , nodeName ); err != nil {
342+ log .WithError (err ).WithField ("node" , nodeName ).Error ("failed to reconcile node from queue" )
343+ }
344+ case <- c .stopChan :
345+ log .Info ("reconciliation worker stopping" )
346+ return
347+ }
348+ }
349+ }
350+
351+ func (c * NodeScaledownAnnotationController ) workspaceFilter () predicate.Predicate {
352+ return predicate.Funcs {
353+ CreateFunc : func (e event.CreateEvent ) bool {
354+ ws := e .Object .(* workspacev1.Workspace )
355+ if ws .Status .Runtime == nil {
356+ log .WithField ("workspace" , ws .Name ).Info ("workspace not ready yet" )
357+ return false
358+ }
359+
360+ return ws .Status .Runtime != nil && ws .Status .Runtime .NodeName != ""
361+ },
362+ UpdateFunc : func (e event.UpdateEvent ) bool {
363+ wsOld := e .ObjectOld .(* workspacev1.Workspace )
364+ ws := e .ObjectNew .(* workspacev1.Workspace )
365+ // if we haven't seen runtime info before and now it's there, let's reconcile.
366+ // similarly, if the node name changed, we need to reconcile the old node as well.
367+ if (wsOld .Status .Runtime == nil && ws .Status .Runtime != nil && ws .Status .Runtime .NodeName != "" ) || // we just got runtime info
368+ (wsOld .Status .Runtime != nil && ws .Status .Runtime != nil && wsOld .Status .Runtime .NodeName != ws .Status .Runtime .NodeName ) { // node name changed
369+ if wsOld .Status .Runtime != nil && wsOld .Status .Runtime .NodeName != "" {
370+ c .queueNodeForReconciliation (wsOld .Status .Runtime .NodeName )
371+ }
372+ return true
373+ }
374+
375+ return false
376+ },
377+ DeleteFunc : func (e event.DeleteEvent ) bool {
378+ ws := e .Object .(* workspacev1.Workspace )
379+ if ws .Status .Runtime != nil && ws .Status .Runtime .NodeName != "" {
380+ c .queueNodeForReconciliation (ws .Status .Runtime .NodeName )
381+ return true
382+ }
383+ return false
384+ },
385+ }
386+ }
387+
388+ func (c * NodeScaledownAnnotationController ) queueNodeForReconciliation (nodeName string ) {
389+ select {
390+ case c .nodesToReconcile <- nodeName :
391+ log .WithField ("node" , nodeName ).Info ("queued node for reconciliation" )
392+ default :
393+ log .WithField ("node" , nodeName ).Warn ("reconciliation queue full" )
394+ }
395+ }
396+
397+ func (c * NodeScaledownAnnotationController ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
398+ log .WithField ("request" , req .NamespacedName .String ()).Info ("WorkspaceCountController reconciling" )
399+
400+ var ws workspacev1.Workspace
401+ if err := c .Get (ctx , req .NamespacedName , & ws ); err != nil {
402+ if ! errors .IsNotFound (err ) {
403+ log .WithError (err ).WithField ("workspace" , req .NamespacedName ).Error ("unable to fetch Workspace" )
404+ return ctrl.Result {}, err
405+ }
406+ return ctrl.Result {}, nil
407+ }
408+
409+ if ws .Status .Runtime != nil && ws .Status .Runtime .NodeName != "" {
410+ c .queueNodeForReconciliation (ws .Status .Runtime .NodeName )
411+ }
412+
413+ log .WithField ("runtime" , ws .Status .Runtime ).Warn ("reconciling object with no Runtime/NodeName, which wasn't filtered out by workspaceFilter" )
414+ return ctrl.Result {}, nil
415+ }
416+
417+ // Cleanup method to be called when shutting down the controller
418+ func (wc * NodeScaledownAnnotationController ) Stop () {
419+ close (wc .stopChan )
420+ }
421+
422+ func (c * NodeScaledownAnnotationController ) reconcileAllNodes (ctx context.Context ) (ctrl.Result , error ) {
423+ var nodes corev1.NodeList
424+ if err := c .List (ctx , & nodes ); err != nil {
425+ log .WithError (err ).Error ("failed to list nodes" )
426+ return ctrl.Result {}, err
427+ }
428+
429+ for _ , node := range nodes .Items {
430+ c .queueNodeForReconciliation (node .Name )
431+ }
432+
433+ return ctrl.Result {}, nil
434+ }
435+
436+ func (c * NodeScaledownAnnotationController ) reconcileNode (ctx context.Context , nodeName string ) error {
437+ var workspaceList workspacev1.WorkspaceList
438+ if err := c .List (ctx , & workspaceList , client.MatchingFields {
439+ "status.runtime.nodeName" : nodeName ,
440+ }); err != nil {
441+ return fmt .Errorf ("failed to list workspaces: %w" , err )
442+ }
443+
444+ log .WithField ("node" , nodeName ).WithField ("count" , len (workspaceList .Items )).Info ("acting on workspaces" )
445+ count := len (workspaceList .Items )
446+
447+ return c .updateNodeAnnotation (ctx , nodeName , count )
448+ }
449+
450+ func (c * NodeScaledownAnnotationController ) updateNodeAnnotation (ctx context.Context , nodeName string , count int ) error {
451+ return retry .RetryOnConflict (retry .DefaultBackoff , func () error {
452+ ctx , cancel := context .WithTimeout (ctx , 5 * time .Second )
453+ defer cancel ()
454+
455+ var node corev1.Node
456+ err := c .Get (ctx , types.NamespacedName {Name : nodeName }, & node )
457+ if err != nil {
458+ return fmt .Errorf ("obtaining node %s: %w" , nodeName , err )
459+ }
460+
461+ shouldDisableScaleDown := count > 0
462+ currentlyDisabled := false
463+ if val , exists := node .Annotations ["cluster-autoscaler.kubernetes.io/scale-down-disabled" ]; exists {
464+ currentlyDisabled = val == "true"
465+ }
466+
467+ // Only update if the state needs to change
468+ if shouldDisableScaleDown != currentlyDisabled {
469+ if node .Annotations == nil {
470+ node .Annotations = make (map [string ]string )
471+ }
472+
473+ if shouldDisableScaleDown {
474+ node .Annotations ["cluster-autoscaler.kubernetes.io/scale-down-disabled" ] = "true"
475+ log .WithField ("nodeName" , nodeName ).Info ("disabling scale-down for node" )
476+ } else {
477+ delete (node .Annotations , "cluster-autoscaler.kubernetes.io/scale-down-disabled" )
478+ log .WithField ("nodeName" , nodeName ).Info ("enabling scale-down for node" )
479+ }
480+
481+ return c .Update (ctx , & node )
482+ }
483+
484+ return nil
485+ })
486+ }
487+
252488func updateLabel (label string , add bool , nodeName string , client client.Client ) error {
253489 return retry .RetryOnConflict (retry .DefaultBackoff , func () error {
254490 ctx , cancel := context .WithTimeout (context .Background (), 5 * time .Second )
0 commit comments