Skip to content

Commit 18ec42f

Browse files
authored
Merge pull request #161 from run-ai/erez/selective-node-labeling
feat: add flag to disable node labeling
2 parents 093ecf2 + b232a77 commit 18ec42f

File tree

6 files changed

+23
-4
lines changed

6 files changed

+23
-4
lines changed

deploy/fake-gpu-operator/templates/status-updater/deployment.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,8 @@ spec:
3838
value: "{{ .Values.environment.resourceReservationNamespace }}"
3939
- name: PROMETHEUS_URL
4040
value: "{{ .Values.prometheus.url }}"
41+
- name: DISABLE_NODE_LABELING
42+
value: "{{ .Values.statusUpdater.disableNodeLabeling }}"
4143
restartPolicy: Always
4244
serviceAccountName: status-updater
4345
imagePullSecrets:

deploy/fake-gpu-operator/values.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ devicePlugin:
2222

2323
statusUpdater:
2424
enabled: true
25+
disableNodeLabeling: false
2526
image:
2627
pullPolicy: Always
2728
repository: ghcr.io/run-ai/fake-gpu-operator/status-updater

internal/common/constants/constants.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,4 +26,5 @@ const (
2626
EnvFakeGpuOperatorNs = "FAKE_GPU_OPERATOR_NAMESPACE"
2727
EnvResourceReservationNamespace = "RESOURCE_RESERVATION_NAMESPACE"
2828
EnvPrometheusURL = "PROMETHEUS_URL"
29+
EnvDisableNodeLabeling = "DISABLE_NODE_LABELING"
2930
)

internal/status-updater/app.go

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@ type StatusUpdaterAppConfiguration struct {
3030
TopologyCmName string `mapstructure:"TOPOLOGY_CM_NAME" validate:"required"`
3131
TopologyCmNamespace string `mapstructure:"TOPOLOGY_CM_NAMESPACE" validate:"required"`
3232
PrometheusURL string `mapstructure:"PROMETHEUS_URL"`
33+
DisableNodeLabeling bool `mapstructure:"DISABLE_NODE_LABELING"`
3334
}
3435

3536
type StatusUpdaterApp struct {
@@ -67,8 +68,10 @@ func (app *StatusUpdaterApp) Init(stopCh chan struct{}) {
6768
app.kubeClient = KubeClientFn(clusterConfig)
6869
dynamicClient := DynamicClientFn(clusterConfig)
6970

71+
disableNodeLabeling := viper.GetBool(constants.EnvDisableNodeLabeling)
72+
7073
app.Controllers = append(app.Controllers, podcontroller.NewPodController(app.kubeClient, dynamicClient, app.wg))
71-
app.Controllers = append(app.Controllers, nodecontroller.NewNodeController(app.kubeClient, app.wg))
74+
app.Controllers = append(app.Controllers, nodecontroller.NewNodeController(app.kubeClient, app.wg, disableNodeLabeling))
7275
}
7376

7477
func (app *StatusUpdaterApp) Name() string {

internal/status-updater/controllers/node/controller.go

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ type NodeController struct {
3636

3737
var _ controllers.Interface = &NodeController{}
3838

39-
func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *NodeController {
39+
func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup, disableNodeLabeling bool) *NodeController {
4040
clusterTopology, err := topology.GetClusterTopologyFromCM(kubeClient)
4141
if err != nil {
4242
log.Fatalf("Failed to get cluster topology: %v", err)
@@ -45,7 +45,7 @@ func NewNodeController(kubeClient kubernetes.Interface, wg *sync.WaitGroup) *Nod
4545
c := &NodeController{
4646
kubeClient: kubeClient,
4747
informer: informers.NewSharedInformerFactory(kubeClient, 0).Core().V1().Nodes().Informer(),
48-
handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology),
48+
handler: nodehandler.NewNodeHandler(kubeClient, clusterTopology, disableNodeLabeling),
4949
clusterTopology: clusterTopology,
5050
}
5151

internal/status-updater/handlers/node/handler.go

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,14 +19,16 @@ type NodeHandler struct {
1919
kubeClient kubernetes.Interface
2020

2121
clusterTopology *topology.ClusterTopology
22+
disableLabeling bool
2223
}
2324

2425
var _ Interface = &NodeHandler{}
2526

26-
func NewNodeHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology) *NodeHandler {
27+
func NewNodeHandler(kubeClient kubernetes.Interface, clusterTopology *topology.ClusterTopology, disableLabeling bool) *NodeHandler {
2728
return &NodeHandler{
2829
kubeClient: kubeClient,
2930
clusterTopology: clusterTopology,
31+
disableLabeling: disableLabeling,
3032
}
3133
}
3234

@@ -38,6 +40,11 @@ func (p *NodeHandler) HandleAdd(node *v1.Node) error {
3840
return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
3941
}
4042

43+
if p.disableLabeling {
44+
log.Printf("Skipping node labeling for %s (disabled via config)\n", node.Name)
45+
return nil
46+
}
47+
4148
err = p.labelNode(node)
4249
if err != nil {
4350
return fmt.Errorf("failed to label node: %w", err)
@@ -54,6 +61,11 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error {
5461
return fmt.Errorf("failed to delete node topology: %w", err)
5562
}
5663

64+
if p.disableLabeling {
65+
log.Printf("Skipping node unlabeling for %s (disabled via config)\n", node.Name)
66+
return nil
67+
}
68+
5769
err = p.unlabelNode(node)
5870
if err != nil {
5971
return fmt.Errorf("failed to unlabel node: %w", err)

0 commit comments

Comments
 (0)