@@ -50,6 +50,13 @@ const (
5050
5151 registryFacade = "registry-facade"
5252 wsDaemon = "ws-daemon"
53+
54+ // Taint keys for different components
55+ registryFacadeTaintKey = "gitpod.io/registry-facade-not-ready"
56+ wsDaemonTaintKey = "gitpod.io/ws-daemon-not-ready"
57+
58+ workspacesRegularLabel = "gitpod.io/workload_workspace_regular"
59+ workspacesHeadlessLabel = "gitpod.io/workload_workspace_headless"
5360)
5461
5562var defaultRequeueTime = time .Second * 10
@@ -61,6 +68,15 @@ var runCmd = &cobra.Command{
6168 Run : func (cmd * cobra.Command , args []string ) {
6269 ctrl .SetLogger (logrusr .New (log .Log ))
6370
71+ kClient , err := client .New (ctrl .GetConfigOrDie (), client.Options {})
72+ if err != nil {
73+ log .WithError (err ).Fatal ("unable to create client" )
74+ }
75+
76+ if err := initializeLabels (context .Background (), kClient ); err != nil {
77+ log .WithError (err ).Fatal ("failed to initialize labels" )
78+ }
79+
6480 mgr , err := ctrl .NewManager (ctrl .GetConfigOrDie (), ctrl.Options {
6581 Scheme : scheme ,
6682 HealthProbeBindAddress : ":8086" ,
@@ -84,11 +100,6 @@ var runCmd = &cobra.Command{
84100 log .WithError (err ).Fatal ("unable to start node-labeler" )
85101 }
86102
87- kClient , err := client .New (ctrl .GetConfigOrDie (), client.Options {})
88- if err != nil {
89- log .WithError (err ).Fatal ("unable to create client" )
90- }
91-
92103 r := & PodReconciler {
93104 kClient ,
94105 }
@@ -198,21 +209,18 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
198209 }
199210
200211 var (
201- ipAddress string
202- port string
203- component string
204- labelToUpdate string
212+ ipAddress string
213+ port string
214+ taintKey string
205215 )
206216
207217 switch {
208218 case strings .HasPrefix (pod .Name , registryFacade ):
209- component = registryFacade
210- labelToUpdate = fmt .Sprintf (registryFacadeLabel , namespace )
219+ taintKey = registryFacadeTaintKey
211220 ipAddress = pod .Status .HostIP
212221 port = strconv .Itoa (registryFacadePort )
213222 case strings .HasPrefix (pod .Name , wsDaemon ):
214- component = wsDaemon
215- labelToUpdate = fmt .Sprintf (wsdaemonLabel , namespace )
223+ taintKey = wsDaemonTaintKey
216224 ipAddress = pod .Status .PodIP
217225 port = strconv .Itoa (wsdaemonPort )
218226 default :
@@ -222,17 +230,17 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
222230
223231 if ! pod .ObjectMeta .DeletionTimestamp .IsZero () {
224232 // the pod is being removed.
225- // remove the component label from the node
233+ // add the taint to the node
226234 time .Sleep (1 * time .Second )
227- err := updateLabel ( labelToUpdate , false , nodeName , r )
235+ err := updateNodeTaint ( taintKey , true , nodeName , r )
228236 if err != nil {
229237 // this is an edge case when cluster-autoscaler removes a node
230238 // (all the running pods will be removed after that)
231239 if errors .IsNotFound (err ) {
232240 return reconcile.Result {}, nil
233241 }
234242
235- log .WithError (err ).Error ("removing node label " )
243+ log .WithError (err ).Error ("adding node taint " )
236244 return reconcile.Result {RequeueAfter : defaultRequeueTime }, err
237245 }
238246
@@ -250,8 +258,17 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
250258 return reconcile.Result {}, fmt .Errorf ("obtaining node %s: %w" , nodeName , err )
251259 }
252260
253- if labelValue , exists := node .Labels [labelToUpdate ]; exists && labelValue == "true" {
254- // nothing to do, the label already exists.
261+ // Check if taint exists
262+ taintExists := false
263+ for _ , taint := range node .Spec .Taints {
264+ if taint .Key == taintKey {
265+ taintExists = true
266+ break
267+ }
268+ }
269+
270+ if ! taintExists {
271+ // nothing to do, the taint doesn't exist.
255272 return reconcile.Result {}, nil
256273 }
257274
@@ -261,7 +278,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
261278 return reconcile.Result {RequeueAfter : defaultRequeueTime }, nil
262279 }
263280
264- if component == registryFacade {
281+ if strings . HasPrefix ( pod . Name , registryFacade ) {
265282 err = checkRegistryFacade (ipAddress , port )
266283 if err != nil {
267284 log .WithError (err ).Error ("checking registry-facade" )
@@ -271,15 +288,15 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req reconcile.Request) (r
271288 time .Sleep (1 * time .Second )
272289 }
273290
274- err = updateLabel ( labelToUpdate , true , nodeName , r )
291+ err = updateNodeTaint ( taintKey , false , nodeName , r )
275292 if err != nil {
276- log .WithError (err ).Error ("updating node label " )
277- return reconcile.Result {}, fmt .Errorf ("trying to add the label : %v" , err )
293+ log .WithError (err ).Error ("removing node taint " )
294+ return reconcile.Result {}, fmt .Errorf ("trying to remove the taint : %v" , err )
278295 }
279296
280297 readyIn := time .Since (pod .Status .StartTime .Time )
281- NodeLabelerTimeHistVec .WithLabelValues (component ).Observe (readyIn .Seconds ())
282- NodeLabelerCounterVec .WithLabelValues (component ).Inc ()
298+ NodeLabelerTimeHistVec .WithLabelValues (strings . Split ( pod . Name , "-" )[ 0 ] ).Observe (readyIn .Seconds ())
299+ NodeLabelerCounterVec .WithLabelValues (strings . Split ( pod . Name , "-" )[ 0 ] ).Inc ()
283300
284301 return reconcile.Result {}, nil
285302}
@@ -485,7 +502,7 @@ func (c *NodeScaledownAnnotationController) updateNodeAnnotation(ctx context.Con
485502 })
486503}
487504
488- func updateLabel ( label string , add bool , nodeName string , client client.Client ) error {
505+ func updateNodeTaint ( taintKey string , add bool , nodeName string , client client.Client ) error {
489506 return retry .RetryOnConflict (retry .DefaultBackoff , func () error {
490507 ctx , cancel := context .WithTimeout (context .Background (), 5 * time .Second )
491508 defer cancel ()
@@ -496,12 +513,36 @@ func updateLabel(label string, add bool, nodeName string, client client.Client)
496513 return err
497514 }
498515
516+ // Create or remove taint
499517 if add {
500- node .Labels [label ] = "true"
501- log .WithField ("label" , label ).WithField ("node" , nodeName ).Info ("adding label to node" )
518+ // Add taint if it doesn't exist
519+ taintExists := false
520+ for _ , taint := range node .Spec .Taints {
521+ if taint .Key == taintKey {
522+ taintExists = true
523+ break
524+ }
525+ }
526+ if ! taintExists {
527+ node .Spec .Taints = append (node .Spec .Taints , corev1.Taint {
528+ Key : taintKey ,
529+ Value : "true" ,
530+ Effect : corev1 .TaintEffectNoSchedule ,
531+ })
532+ log .WithField ("taint" , taintKey ).WithField ("node" , nodeName ).Info ("adding taint to node" )
533+ }
502534 } else {
503- delete (node .Labels , label )
504- log .WithField ("label" , label ).WithField ("node" , nodeName ).Info ("removing label from node" )
535+ // Remove taint if it exists
536+ newTaints := make ([]corev1.Taint , 0 )
537+ for _ , taint := range node .Spec .Taints {
538+ if taint .Key != taintKey {
539+ newTaints = append (newTaints , taint )
540+ }
541+ }
542+ if len (newTaints ) != len (node .Spec .Taints ) {
543+ node .Spec .Taints = newTaints
544+ log .WithField ("taint" , taintKey ).WithField ("node" , nodeName ).Info ("removing taint from node" )
545+ }
505546 }
506547
507548 err = client .Update (ctx , & node )
@@ -569,3 +610,72 @@ func newDefaultTransport() *http.Transport {
569610 DisableKeepAlives : true ,
570611 }
571612}
613+
614+ func initializeLabels (ctx context.Context , kClient client.Client ) error {
615+ log .Info ("initializing labels on nodes" )
616+
617+ var nodes corev1.NodeList
618+ if err := kClient .List (ctx , & nodes ); err != nil {
619+ return fmt .Errorf ("failed to list nodes: %w" , err )
620+ }
621+
622+ for _ , node := range nodes .Items {
623+ if node .Labels == nil {
624+ continue
625+ }
626+ _ , isRegularWorkspaceNode := node .Labels [workspacesRegularLabel ]
627+ _ , isHeadlessWorkspaceNode := node .Labels [workspacesHeadlessLabel ]
628+
629+ if isRegularWorkspaceNode || isHeadlessWorkspaceNode {
630+ err := updateNodeLabel (node .Name , kClient )
631+ if err != nil {
632+ log .WithError (err ).WithField ("node" , node .Name ).Error ("failed to initialize labels on node" )
633+ }
634+ }
635+ }
636+
637+ log .Info ("finished initializing labels on nodes" )
638+ return nil
639+ }
640+
641+ func updateNodeLabel (nodeName string , client client.Client ) error {
642+ return retry .RetryOnConflict (retry .DefaultBackoff , func () error {
643+ ctx , cancel := context .WithTimeout (context .Background (), 5 * time .Second )
644+ defer cancel ()
645+
646+ var node corev1.Node
647+ err := client .Get (ctx , types.NamespacedName {Name : nodeName }, & node )
648+ if err != nil {
649+ return err
650+ }
651+
652+ registryFacadeLabelForNamespace := fmt .Sprintf (registryFacadeLabel , namespace )
653+ wsDaemonLabelForNamespace := fmt .Sprintf (wsdaemonLabel , namespace )
654+
655+ needUpdate := false
656+
657+ if node .Labels == nil {
658+ node .Labels = make (map [string ]string )
659+ }
660+
661+ if v := node .Labels [registryFacadeLabelForNamespace ]; v != "true" {
662+ needUpdate = true
663+ }
664+ if v := node .Labels [wsDaemonLabelForNamespace ]; v != "true" {
665+ needUpdate = true
666+ }
667+
668+ if ! needUpdate {
669+ return nil
670+ }
671+ node .Labels [registryFacadeLabelForNamespace ] = "true"
672+ node .Labels [wsDaemonLabelForNamespace ] = "true"
673+
674+ err = client .Update (ctx , & node )
675+ if err != nil {
676+ return err
677+ }
678+
679+ return nil
680+ })
681+ }
0 commit comments