Skip to content

Commit 752c173

Browse files
committed
move collection to collector.go, collect also amd and intel gpu info
Signed-off-by: Harold Ship <harold@il.ibm.com>
1 parent 6701c45 commit 752c173

File tree

3 files changed

+101
-42
lines changed

3 files changed

+101
-42
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
*.dll
55
*.so
66
*.dylib
7+
bin/
8+
.vscode/
79

810
# Test binary, built with `go test -c`
911
*.test
@@ -14,6 +16,9 @@
1416
# Dependency directories (remove the comment below to include it)
1517
vendor/
1618

19+
# Kind config file
20+
kind-config.yaml
21+
1722
# Miscellaneous
1823
.DS_Store
1924
~*

internal/controller/collector.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
Copyright 2025.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controller
18+
19+
import (
20+
"context"
21+
22+
corev1 "k8s.io/api/core/v1"
23+
"sigs.k8s.io/controller-runtime/pkg/client"
24+
logf "sigs.k8s.io/controller-runtime/pkg/log"
25+
)
26+
// AcceleratorModelInfo describes the accelerators of a single model found on
// one node.
type AcceleratorModelInfo struct {
	// Count is the number of accelerators, read from the node's capacity for
	// the resource <vendor>/gpu; it is 0 when the node reports no such resource.
	Count int
	// Memory is the raw value of the node label <vendor>/gpu.memory; it is
	// stored unparsed and is empty when the label is absent.
	Memory string
}
31+
32+
// Collector holds the k8s client and discovers GPU inventory
33+
type Collector struct {
34+
Client client.Client
35+
}
36+
37+
// NewCollector returns an initialized Collector
38+
func NewCollector(c client.Client) *Collector {
39+
return &Collector{Client: c}
40+
}
41+
// vendors lists the domain prefixes probed on each node. Every entry is
// combined with "/gpu.product", "/gpu.memory" and "/gpu" to form the label
// and resource names looked up during inventory collection.
var vendors = []string{
	"nvidia.com",
	"amd.com",
	"intel.com",
}
47+
48+
// CollectInventory lists all Nodes and builds a map[nodeName][model]→info.
49+
// It checks labels <vendor>/gpu.product, <vendor>/gpu.memory
50+
// and capacity <vendor>/gpu.
51+
func (c *Collector) CollectInventoryK8S(ctx context.Context) (map[string]map[string]AcceleratorModelInfo, error) {
52+
logger := logf.FromContext(ctx)
53+
54+
logger.Info("collecting inventory")
55+
56+
var nodeList corev1.NodeList
57+
if err := c.Client.List(ctx, &nodeList); err != nil {
58+
logger.Error(err, "unable to list nodes")
59+
return nil, err
60+
}
61+
62+
inv := make(map[string]map[string]AcceleratorModelInfo)
63+
for _, node := range nodeList.Items {
64+
nodeName := node.Name
65+
for _, vendor := range vendors {
66+
prodKey := vendor + "/gpu.product"
67+
memKey := vendor + "/gpu.memory"
68+
if model, ok := node.Labels[prodKey]; ok {
69+
// found a GPU of this vendor
70+
mem := node.Labels[memKey]
71+
count := 0
72+
if cap, ok := node.Status.Capacity[corev1.ResourceName(vendor+"/gpu")]; ok {
73+
count = int(cap.Value())
74+
}
75+
if inv[nodeName] == nil {
76+
inv[nodeName] = make(map[string]AcceleratorModelInfo)
77+
}
78+
inv[nodeName][model] = AcceleratorModelInfo{
79+
Count: count,
80+
Memory: mem,
81+
}
82+
logger.Info("found inventory", "nodeName", nodeName, "model", model, "count", count, "mem", mem)
83+
}
84+
}
85+
}
86+
return inv, nil
87+
}

internal/controller/optimizer_controller.go

Lines changed: 9 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"sync"
2222
"time"
2323

24+
corev1 "k8s.io/api/core/v1"
2425
"k8s.io/apimachinery/pkg/runtime"
2526
"k8s.io/apimachinery/pkg/types"
2627
ctrl "sigs.k8s.io/controller-runtime"
@@ -30,7 +31,6 @@ import (
3031

3132
llmdOptv1alpha1 "github.com/llm-d-incubation/inferno-autoscaler/api/v1alpha1"
3233
appsv1 "k8s.io/api/apps/v1"
33-
corev1 "k8s.io/api/core/v1"
3434
apierrors "k8s.io/apimachinery/pkg/api/errors"
3535
)
3636

@@ -44,11 +44,6 @@ type OptimizerReconciler struct {
4444
stopTicker chan struct{}
4545
}
4646

47-
type AcceleratorModelInfo struct {
48-
Count int
49-
Memory string
50-
}
51-
5247
// +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers,verbs=get;list;watch;create;update;patch;delete
5348
// +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers/status,verbs=get;update;patch
5449
// +kubebuilder:rbac:groups=llmd.llm-d.ai,resources=optimizers/finalizers,verbs=update
@@ -72,6 +67,8 @@ func (r *OptimizerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
7267
return ctrl.Result{}, err
7368
}
7469

70+
logger.Info("reconciling")
71+
7572
groupedOptimizerObjsWithDeployment := make(map[string][]llmdOptv1alpha1.Optimizer)
7673
groupOptimizerObjsWithoutDeployment := make(map[string][]llmdOptv1alpha1.Optimizer)
7774

@@ -109,48 +106,18 @@ func (r *OptimizerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
109106
}
110107
}
111108
}
112-
var nodeList corev1.NodeList
113109

114-
if err := r.Client.List(ctx, &nodeList); err != nil {
115-
logger.Error(err, "unable to list nodes")
116-
return ctrl.Result{}, err
117-
}
110+
coll := NewCollector(r.Client)
118111

119-
newInventory := make(map[string]map[string]AcceleratorModelInfo)
120-
121-
for _, node := range nodeList.Items {
122-
nodeName := node.Name
123-
labels := node.Labels
124-
model, ok := labels["nvidia.com/gpu.product"]
125-
if !ok {
126-
continue
127-
}
128-
memory := labels["nvidia.com/gpu.memory"]
129-
count := 0
130-
if cap, ok := node.Status.Capacity["nvidia.com/gpu"]; ok {
131-
count = int(cap.Value())
132-
}
133-
newInventory[nodeName] = make(map[string]AcceleratorModelInfo)
134-
newInventory[nodeName][model] = AcceleratorModelInfo{
135-
Count: count,
136-
Memory: memory,
137-
}
112+
newInventory, err := coll.CollectInventoryK8S(ctx)
138113

114+
if err == nil {
115+
logger.Info("current inventory in the cluster", "capacity", newInventory)
116+
} else {
117+
logger.Error(err, "failed to get cluster inventory")
139118
}
140119

141-
logger.Info("current inventory in the cluster", "capacity", newInventory)
142-
143-
// call collector to path each optimizer object with accelerator, maxBatch and numReplicas
144-
// accelerator and maxBatch are obtained from deployment labels, numReplicas is available from spec.
145-
146-
// Output of the collector is passed to Model Analyzer
147-
148-
// The result of Model Analyzer is then passed to the Optimizer
149-
150-
// Output of the Optimizer is then consumed by actuator to emit prometheus metrics or change replicas directly
151-
152120
return ctrl.Result{}, nil
153-
154121
}
155122

156123
// SetupWithManager sets up the controller with the Manager.

0 commit comments

Comments
 (0)