Skip to content

Commit 8464876

Browse files
authored
Merge pull request #157 from run-ai/erez/compute-domain-node-tracking
add ComputeDomain status tracking with allocated nodes
2 parents a50d925 + 2a159d4 commit 8464876

File tree

7 files changed

+385
-11
lines changed

7 files changed

+385
-11
lines changed

deploy/fake-gpu-operator/templates/compute-domain-controller/clusterrole.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ metadata:
77
rules:
88
- apiGroups: ["resource.k8s.io"]
99
resources: ["resourceclaims"]
10-
verbs: ["get"]
10+
verbs: ["get", "list", "watch"]
1111
- apiGroups: [""]
1212
resources: ["nodes"]
1313
verbs: ["get"]

internal/compute-domain-controller/app.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,19 @@ import (
66

77
"go.uber.org/zap/zapcore"
88
resourceapi "k8s.io/api/resource/v1"
9+
"k8s.io/apimachinery/pkg/labels"
910
"k8s.io/apimachinery/pkg/runtime"
1011
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
1112
ctrl "sigs.k8s.io/controller-runtime"
13+
"sigs.k8s.io/controller-runtime/pkg/cache"
14+
"sigs.k8s.io/controller-runtime/pkg/client"
1215
"sigs.k8s.io/controller-runtime/pkg/healthz"
1316
"sigs.k8s.io/controller-runtime/pkg/log/zap"
1417
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
1518

1619
computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
1720
"github.com/run-ai/fake-gpu-operator/internal/common/app"
21+
"github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts"
1822
)
1923

2024
var (
@@ -85,6 +89,11 @@ func (app *ComputeDomainApp) Run() {
8589
func (app *ComputeDomainApp) runController(ctx context.Context) error {
8690
cfg := ctrl.GetConfigOrDie()
8791

92+
computeDomainLabelSelector, err := labels.Parse(consts.ComputeDomainClaimLabel)
93+
if err != nil {
94+
return fmt.Errorf("failed to parse label selector: %w", err)
95+
}
96+
8897
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
8998
Scheme: scheme,
9099
Metrics: metricsserver.Options{
@@ -93,6 +102,13 @@ func (app *ComputeDomainApp) runController(ctx context.Context) error {
93102
HealthProbeBindAddress: app.config.HealthProbeAddress,
94103
LeaderElection: app.config.LeaderElection,
95104
LeaderElectionID: "fake-compute-domain-controller",
105+
Cache: cache.Options{
106+
ByObject: map[client.Object]cache.ByObject{
107+
&resourceapi.ResourceClaim{}: {
108+
Label: computeDomainLabelSelector,
109+
},
110+
},
111+
},
96112
})
97113
if err != nil {
98114
return fmt.Errorf("failed to create controller manager: %w", err)

internal/compute-domain-controller/computedomain_controller.go

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,17 @@ package computedomaincontroller
1919
import (
2020
"context"
2121
"fmt"
22+
"sort"
2223

2324
resourceapi "k8s.io/api/resource/v1"
2425
apierrors "k8s.io/apimachinery/pkg/api/errors"
2526
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2627
"k8s.io/apimachinery/pkg/runtime"
28+
"k8s.io/apimachinery/pkg/types"
2729
ctrl "sigs.k8s.io/controller-runtime"
2830
"sigs.k8s.io/controller-runtime/pkg/client"
2931
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
32+
"sigs.k8s.io/controller-runtime/pkg/handler"
3033
"sigs.k8s.io/controller-runtime/pkg/log"
3134

3235
computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
@@ -59,6 +62,7 @@ type ComputeDomainReconciler struct {
5962
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/status,verbs=get;update;patch
6063
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/finalizers,verbs=update
6164
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;patch;delete
65+
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaims,verbs=get;list;watch
6266

6367
func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
6468
logger := log.FromContext(ctx)
@@ -81,6 +85,9 @@ func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Reques
8185
if err := r.ensureResourceClaimTemplates(ctx, domain); err != nil {
8286
return ctrl.Result{}, err
8387
}
88+
if err := r.updateStatus(ctx, domain); err != nil {
89+
return ctrl.Result{}, err
90+
}
8491

8592
logger.V(4).Info("reconciled ComputeDomain", "namespace", domain.Namespace, "name", domain.Name)
8693
return ctrl.Result{}, nil
@@ -141,8 +148,8 @@ func (r *ComputeDomainReconciler) ensureTemplate(
141148
Name: name,
142149
Namespace: domain.Namespace,
143150
Labels: map[string]string{
144-
"resource.nvidia.com/computeDomain": domain.Name,
145-
"resource.nvidia.com/computeDomainTarget": templateType,
151+
consts.ComputeDomainTemplateLabel: domain.Name,
152+
consts.ComputeDomainTemplateTargetLabel: templateType,
146153
},
147154
Finalizers: []string{
148155
consts.ComputeDomainFinalizer,
@@ -151,7 +158,7 @@ func (r *ComputeDomainReconciler) ensureTemplate(
151158
Spec: resourceapi.ResourceClaimTemplateSpec{
152159
ObjectMeta: metav1.ObjectMeta{
153160
Labels: map[string]string{
154-
"nvidia.com/computeDomain": domain.Name,
161+
consts.ComputeDomainClaimLabel: domain.Name,
155162
},
156163
},
157164
Spec: resourceapi.ResourceClaimSpec{
@@ -229,5 +236,89 @@ func (r *ComputeDomainReconciler) SetupWithManager(mgr ctrl.Manager) error {
229236
return ctrl.NewControllerManagedBy(mgr).
230237
For(&computedomainv1beta1.ComputeDomain{}).
231238
Owns(&resourceapi.ResourceClaimTemplate{}).
239+
Watches(
240+
&resourceapi.ResourceClaim{},
241+
handler.EnqueueRequestsFromMapFunc(r.mapResourceClaimToComputeDomain),
242+
).
232243
Complete(r)
233244
}
245+
246+
func (r *ComputeDomainReconciler) mapResourceClaimToComputeDomain(ctx context.Context, obj client.Object) []ctrl.Request {
247+
claim, ok := obj.(*resourceapi.ResourceClaim)
248+
if !ok {
249+
return nil
250+
}
251+
252+
domainName, exists := claim.Labels[consts.ComputeDomainClaimLabel]
253+
if !exists {
254+
return nil
255+
}
256+
257+
return []ctrl.Request{{
258+
NamespacedName: types.NamespacedName{
259+
Name: domainName,
260+
Namespace: claim.Namespace,
261+
},
262+
}}
263+
}
264+
265+
func (r *ComputeDomainReconciler) updateStatus(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
266+
claimList := &resourceapi.ResourceClaimList{}
267+
if err := r.List(ctx, claimList,
268+
client.InNamespace(domain.Namespace),
269+
client.MatchingLabels{consts.ComputeDomainClaimLabel: domain.Name},
270+
); err != nil {
271+
return err
272+
}
273+
274+
nodeSet := make(map[string]struct{})
275+
for _, claim := range claimList.Items {
276+
if claim.Status.Allocation == nil {
277+
continue
278+
}
279+
for _, result := range claim.Status.Allocation.Devices.Results {
280+
if result.Pool != "" {
281+
nodeSet[result.Pool] = struct{}{}
282+
}
283+
}
284+
}
285+
286+
nodes := make([]*computedomainv1beta1.ComputeDomainNode, 0, len(nodeSet))
287+
for nodeName := range nodeSet {
288+
nodes = append(nodes, &computedomainv1beta1.ComputeDomainNode{
289+
Name: nodeName,
290+
Status: computedomainv1beta1.ComputeDomainStatusReady,
291+
})
292+
}
293+
sort.Slice(nodes, func(i, j int) bool {
294+
return nodes[i].Name < nodes[j].Name
295+
})
296+
297+
status := computedomainv1beta1.ComputeDomainStatusNotReady
298+
if domain.Spec.NumNodes == 0 || len(nodes) >= domain.Spec.NumNodes {
299+
status = computedomainv1beta1.ComputeDomainStatusReady
300+
}
301+
302+
if !r.statusEqual(domain.Status, nodes, status) {
303+
domain.Status.Nodes = nodes
304+
domain.Status.Status = status
305+
return r.Status().Update(ctx, domain)
306+
}
307+
308+
return nil
309+
}
310+
311+
func (r *ComputeDomainReconciler) statusEqual(current computedomainv1beta1.ComputeDomainStatus, newNodes []*computedomainv1beta1.ComputeDomainNode, newStatus string) bool {
312+
if current.Status != newStatus {
313+
return false
314+
}
315+
if len(current.Nodes) != len(newNodes) {
316+
return false
317+
}
318+
for i, node := range current.Nodes {
319+
if node.Name != newNodes[i].Name {
320+
return false
321+
}
322+
}
323+
return true
324+
}

0 commit comments

Comments
 (0)