@@ -21,6 +21,7 @@ import (
2121 "sync"
2222 "time"
2323
24+ corev1 "k8s.io/api/core/v1"
2425 "k8s.io/apimachinery/pkg/runtime"
2526 "k8s.io/apimachinery/pkg/types"
2627 ctrl "sigs.k8s.io/controller-runtime"
@@ -30,7 +31,6 @@ import (
3031
3132 llmdOptv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
3233 appsv1 "k8s.io/api/apps/v1"
33- corev1 "k8s.io/api/core/v1"
3434 apierrors "k8s.io/apimachinery/pkg/api/errors"
3535)
3636
@@ -44,11 +44,6 @@ type OptimizerReconciler struct {
4444 stopTicker chan struct {}
4545}
4646
47- type AcceleratorModelInfo struct {
48- Count int
49- Memory string
50- }
51-
5247// +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers,verbs=get;list;watch;create;update;patch;delete
5348// +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers/status,verbs=get;update;patch
5449// +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers/finalizers,verbs=update
@@ -72,6 +67,8 @@ func (r *OptimizerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
7267 return ctrl.Result {}, err
7368 }
7469
70+ logger .Info ("reconciling" )
71+
7572 groupedOptimizerObjsWithDeployment := make (map [string ][]llmdOptv1alpha1.Optimizer )
7673 groupOptimizerObjsWithoutDeployment := make (map [string ][]llmdOptv1alpha1.Optimizer )
7774
@@ -109,48 +106,18 @@ func (r *OptimizerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
109106 }
110107 }
111108 }
112- var nodeList corev1.NodeList
113109
114- if err := r .Client .List (ctx , & nodeList ); err != nil {
115- logger .Error (err , "unable to list nodes" )
116- return ctrl.Result {}, err
117- }
110+ coll := NewCollector (r .Client )
118111
119- newInventory := make (map [string ]map [string ]AcceleratorModelInfo )
120-
121- for _ , node := range nodeList .Items {
122- nodeName := node .Name
123- labels := node .Labels
124- model , ok := labels ["nvidia.com/gpu.product" ]
125- if ! ok {
126- continue
127- }
128- memory := labels ["nvidia.com/gpu.memory" ]
129- count := 0
130- if cap , ok := node .Status .Capacity ["nvidia.com/gpu" ]; ok {
131- count = int (cap .Value ())
132- }
133- newInventory [nodeName ] = make (map [string ]AcceleratorModelInfo )
134- newInventory [nodeName ][model ] = AcceleratorModelInfo {
135- Count : count ,
136- Memory : memory ,
137- }
112+ newInventory , err := coll .CollectInventoryK8S (ctx )
138113
114+ if err == nil {
115+ logger .Info ("current inventory in the cluster" , "capacity" , newInventory )
116+ } else {
117+ logger .Error (err , "failed to get cluster inventory" )
139118 }
140119
141- logger .Info ("current inventory in the cluster" , "capacity" , newInventory )
142-
143- // call collector to path each optimizer object with accelarator, maxBatch and numReplicas
144- // acceleraotor and maxBatch are obtained from deployment labels, numReplicas is available from spec.
145-
146- // Output of the collector is passed to Model Analyzer
147-
148- // The result of Model Analyzer is then passed to the Optimizer
149-
150- // Output of the Optimizer is then consumed by actuator to emit prometheus metrics or change replicas directly
151-
152120 return ctrl.Result {}, nil
153-
154121}
155122
156123// SetupWithManager sets up the controller with the Manager.