|
| 1 | +package labels |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "fmt" |
| 6 | + "log" |
| 7 | + |
| 8 | + "k8s.io/apimachinery/pkg/api/errors" |
| 9 | + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" |
| 10 | + "k8s.io/client-go/kubernetes" |
| 11 | + "k8s.io/client-go/util/retry" |
| 12 | + |
| 13 | + "github.com/run-ai/fake-gpu-operator/internal/common/topology" |
| 14 | + "github.com/run-ai/fake-gpu-operator/internal/status-exporter/watch" |
| 15 | +) |
| 16 | + |
| 17 | +// MultiNodeLabelsExporter exports labels for multiple KWOK nodes |
| 18 | +type MultiNodeLabelsExporter struct { |
| 19 | + kubeClient kubernetes.Interface |
| 20 | +} |
| 21 | + |
| 22 | +var _ watch.LabelsExporter = &MultiNodeLabelsExporter{} |
| 23 | + |
| 24 | +// NewMultiNodeLabelsExporter creates a new multi-node labels exporter |
| 25 | +func NewMultiNodeLabelsExporter(kubeClient kubernetes.Interface) *MultiNodeLabelsExporter { |
| 26 | + return &MultiNodeLabelsExporter{ |
| 27 | + kubeClient: kubeClient, |
| 28 | + } |
| 29 | +} |
| 30 | + |
| 31 | +// SetLabelsForNode exports labels for a specific node |
| 32 | +func (e *MultiNodeLabelsExporter) SetLabelsForNode(nodeName string, nodeTopology *topology.NodeTopology) error { |
| 33 | + labels := BuildNodeLabels(nodeTopology) |
| 34 | + |
| 35 | + if err := e.setNodeLabels(nodeName, labels); err != nil { |
| 36 | + return fmt.Errorf("failed to set node labels for %s: %w", nodeName, err) |
| 37 | + } |
| 38 | + |
| 39 | + log.Printf("Exported labels for KWOK node: %s\n", nodeName) |
| 40 | + return nil |
| 41 | +} |
| 42 | + |
| 43 | +// setNodeLabels sets labels on a specific node with retry logic to handle conflicts |
| 44 | +func (e *MultiNodeLabelsExporter) setNodeLabels(nodeName string, labels map[string]string) error { |
| 45 | + log.Printf("Setting labels on KWOK node %s: %v\n", nodeName, labels) |
| 46 | + |
| 47 | + // Retry on conflict errors (when node is being modified by KWOK stages) |
| 48 | + return retry.RetryOnConflict(retry.DefaultRetry, func() error { |
| 49 | + node, err := e.kubeClient.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) |
| 50 | + if err != nil { |
| 51 | + if errors.IsNotFound(err) { |
| 52 | + log.Printf("Node %s not found (may have been deleted)\n", nodeName) |
| 53 | + return nil // Node deleted, don't retry |
| 54 | + } |
| 55 | + return err |
| 56 | + } |
| 57 | + |
| 58 | + // Update labels |
| 59 | + for k, v := range labels { |
| 60 | + node.Labels[k] = v |
| 61 | + } |
| 62 | + |
| 63 | + _, err = e.kubeClient.CoreV1().Nodes().Update(context.TODO(), node, metav1.UpdateOptions{}) |
| 64 | + return err |
| 65 | + }) |
| 66 | +} |
0 commit comments