Move collection to collector.go; also collect AMD and Intel GPU info

haroldship · haroldship · commit 752c17377e4f · 2025-07-03T12:53:38.000-04:00
Signed-off-by: Harold Ship <harold@il.ibm.com>
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,8 @@
 *.dll
 *.so
 *.dylib
+bin/
+.vscode/
 
 # Test binary, built with `go test -c`
 *.test
@@ -14,6 +16,9 @@
 # Dependency directories (remove the comment below to include it)
 vendor/
 
+# Kind config file
+kind-config.yaml
+
 # Miscellaneous
 .DS_Store
 ~*
diff --git a/internal/controller/collector.go b/internal/controller/collector.go
@@ -0,0 +1,87 @@
+/*
+Copyright 2025.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+
+	corev1 "k8s.io/api/core/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+)
+
+type AcceleratorModelInfo struct {
+	Count  int    // GPU count; CollectInventoryK8S takes it from the node's <vendor>/gpu capacity (0 if absent)
+	Memory string // GPU memory; CollectInventoryK8S copies the node's <vendor>/gpu.memory label verbatim ("" if absent)
+}
+
+// Collector holds the k8s client and uses it to discover per-node GPU inventory.
+type Collector struct {
+	Client client.Client // used by CollectInventoryK8S to list Nodes
+}
+
+// NewCollector returns a pointer to a new Collector whose Client is c.
+func NewCollector(c client.Client) *Collector {
+	return &Collector{Client: c}
+}
+
+var vendors = []string{ // vendor prefixes probed on each node's labels and capacity
+	"nvidia.com",
+	"amd.com",
+	"intel.com",
+}
+
+// CollectInventoryK8S lists all Nodes and builds a map[nodeName][model]→info from
+// labels <vendor>/gpu.product, <vendor>/gpu.memory and capacity <vendor>/gpu.
+// Nodes without a product label are omitted; a List failure returns (nil, err).
+func (c *Collector) CollectInventoryK8S(ctx context.Context) (map[string]map[string]AcceleratorModelInfo, error) {
+	logger := logf.FromContext(ctx)
+
+	logger.Info("collecting inventory")
+
+	var nodeList corev1.NodeList
+	if err := c.Client.List(ctx, &nodeList); err != nil {
+		logger.Error(err, "unable to list nodes")
+		return nil, err
+	}
+
+	inv := make(map[string]map[string]AcceleratorModelInfo)
+	for _, node := range nodeList.Items {
+		nodeName := node.Name
+		for _, vendor := range vendors {
+			prodKey := vendor + "/gpu.product"
+			memKey := vendor + "/gpu.memory"
+			if model, ok := node.Labels[prodKey]; ok {
+				// the product label is what marks this node as carrying this vendor's GPU
+				mem := node.Labels[memKey] // "" when the memory label is missing
+				count := 0 // stays 0 when the node reports no <vendor>/gpu capacity
+				if cap, ok := node.Status.Capacity[corev1.ResourceName(vendor+"/gpu")]; ok {
+					count = int(cap.Value())
+				}
+				if inv[nodeName] == nil { // first model recorded for this node
+					inv[nodeName] = make(map[string]AcceleratorModelInfo)
+				}
+				inv[nodeName][model] = AcceleratorModelInfo{
+					Count:  count,
+					Memory: mem,
+				}
+				logger.Info("found inventory", "nodeName", nodeName, "model", model, "count", count, "mem", mem)
+			}
+		}
+	}
+	return inv, nil
+}
diff --git a/internal/controller/optimizer_controller.go b/internal/controller/optimizer_controller.go
@@ -21,6 +21,7 @@ import (
 	"sync"
 	"time"
 
+	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/types"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -30,7 +31,6 @@ import (
 
 	llmdOptv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
 	appsv1 "k8s.io/api/apps/v1"
-	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 )
 
@@ -44,11 +44,6 @@ type OptimizerReconciler struct {
 	stopTicker chan struct{}
 }
 
-type AcceleratorModelInfo struct {
-	Count  int
-	Memory string
-}
-
 // +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers,verbs=get;list;watch;create;update;patch;delete
 // +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers/status,verbs=get;update;patch
 // +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers/finalizers,verbs=update
@@ -72,6 +67,8 @@ func (r *OptimizerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		return ctrl.Result{}, err
 	}
 
+	logger.Info("reconciling")
+
 	groupedOptimizerObjsWithDeployment := make(map[string][]llmdOptv1alpha1.Optimizer)
 	groupOptimizerObjsWithoutDeployment := make(map[string][]llmdOptv1alpha1.Optimizer)
 
@@ -109,48 +106,18 @@ func (r *OptimizerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 			}
 		}
 	}
-	var nodeList corev1.NodeList
 
-	if err := r.Client.List(ctx, &nodeList); err != nil {
-		logger.Error(err, "unable to list nodes")
-		return ctrl.Result{}, err
-	}
+	coll := NewCollector(r.Client)
 
-	newInventory := make(map[string]map[string]AcceleratorModelInfo)
-
-	for _, node := range nodeList.Items {
-		nodeName := node.Name
-		labels := node.Labels
-		model, ok := labels["nvidia.com/gpu.product"]
-		if !ok {
-			continue
-		}
-		memory := labels["nvidia.com/gpu.memory"]
-		count := 0
-		if cap, ok := node.Status.Capacity["nvidia.com/gpu"]; ok {
-			count = int(cap.Value())
-		}
-		newInventory[nodeName] = make(map[string]AcceleratorModelInfo)
-		newInventory[nodeName][model] = AcceleratorModelInfo{
-			Count:  count,
-			Memory: memory,
-		}
+	newInventory, err := coll.CollectInventoryK8S(ctx)
 
+	if err == nil {
+		logger.Info("current inventory in the cluster", "capacity", newInventory)
+	} else {
+		logger.Error(err, "failed to get cluster inventory")
 	}
 
-	logger.Info("current inventory in the cluster", "capacity", newInventory)
-
-	// call collector to path each optimizer object with accelarator, maxBatch and numReplicas
-	// acceleraotor and maxBatch are obtained from deployment labels, numReplicas is available from spec.
-
-	// Output of the collector is passed to Model Analyzer
-
-	// The result of Model Analyzer is then passed to the Optimizer
-
-	// Output of the Optimizer is then consumed by actuator to emit prometheus metrics or change replicas directly
-
 	return ctrl.Result{}, nil
-
 }
 
 // SetupWithManager sets up the controller with the Manager.