@@ -18,15 +18,18 @@ package controller
1818
1919import (
2020 "context"
21+ "fmt"
22+ "strings"
2123
24+ tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
25+ "github.com/NexusGPU/tensor-fusion-operator/internal/constants"
26+ scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
27+ "github.com/samber/lo"
28+ "k8s.io/apimachinery/pkg/api/errors"
29+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2230 "k8s.io/apimachinery/pkg/runtime"
2331 ctrl "sigs.k8s.io/controller-runtime"
2432 "sigs.k8s.io/controller-runtime/pkg/client"
25- "sigs.k8s.io/controller-runtime/pkg/event"
26- "sigs.k8s.io/controller-runtime/pkg/predicate"
27-
28- tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
29- scheduler "github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
3033)
3134
3235// GPUReconciler reconciles a GPU object
@@ -43,6 +46,59 @@ type GPUReconciler struct {
4346// Reconcile is part of the main kubernetes reconciliation loop which aims to
4447// move the current state of the cluster closer to the desired state.
4548func (r * GPUReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
49+ gpu := & tfv1.GPU {}
50+ if err := r .Get (ctx , req .NamespacedName , gpu ); err != nil {
51+ if errors .IsNotFound (err ) {
52+ return ctrl.Result {}, nil
53+ }
54+ return ctrl.Result {}, err
55+ }
56+
57+ kgvs , _ , err := r .Scheme .ObjectKinds (& tfv1.GPUNode {})
58+ if err != nil {
59+ return ctrl.Result {}, fmt .Errorf ("get object kinds for GPUNode: %w" , err )
60+ }
61+
62+ owner , ok := lo .Find (gpu .OwnerReferences , func (or metav1.OwnerReference ) bool {
63+ for _ , kvg := range kgvs {
64+ if kvg .Kind == or .Kind && fmt .Sprintf ("%s/%s" , kvg .Group , kvg .Version ) == or .APIVersion {
65+ return true
66+ }
67+ }
68+ return false
69+ })
70+
71+ if ! ok {
72+ return ctrl.Result {}, fmt .Errorf ("owner node %s not found" , gpu .Name )
73+ }
74+
75+ gpunode := & tfv1.GPUNode {}
76+ if err := r .Get (ctx , client.ObjectKey {Name : owner .Name }, gpunode ); err != nil {
77+ return ctrl.Result {}, fmt .Errorf ("get node %s: %w" , owner .Name , err )
78+ }
79+
80+ var poolName string
81+ for labelKey := range gpunode .Labels {
82+ after , ok := strings .CutPrefix (labelKey , constants .GPUNodePoolIdentifierLabelPrefix )
83+ if ok {
84+ poolName = after
85+ break
86+ }
87+ }
88+
89+ if poolName == "" {
90+ return ctrl.Result {}, fmt .Errorf ("node %s is not assigned to any pool" , gpunode .Name )
91+ }
92+
93+ if gpu .Labels == nil {
94+ gpu .Labels = make (map [string ]string )
95+ }
96+ gpu .Labels [constants .GpuPoolKey ] = poolName
97+
98+ // update gpu
99+ if err := r .Update (ctx , gpu ); err != nil {
100+ return ctrl.Result {}, fmt .Errorf ("update gpu %s: %w" , gpu .Name , err )
101+ }
46102 return ctrl.Result {}, nil
47103}
48104
@@ -51,21 +107,5 @@ func (r *GPUReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager)
51107 return ctrl .NewControllerManagedBy (mgr ).
52108 For (& tfv1.GPU {}).
53109 Named ("gpu" ).
54- WithEventFilter (
55- predicate.Funcs {
56- CreateFunc : func (e event.CreateEvent ) bool {
57- r .Scheduler .OnAdd (e .Object .(* tfv1.GPU ))
58- return true
59- },
60- UpdateFunc : func (e event.UpdateEvent ) bool {
61- r .Scheduler .OnUpdate (e .ObjectOld .(* tfv1.GPU ), e .ObjectNew .(* tfv1.GPU ))
62- return true
63- },
64- DeleteFunc : func (e event.DeleteEvent ) bool {
65- r .Scheduler .OnDelete (e .Object .(* tfv1.GPU ))
66- return true
67- },
68- },
69- ).
70110 Complete (r )
71111}
0 commit comments