Skip to content

Commit 6c151de

Browse files
committed
add ComputeDomain status tracking with allocated nodes
Update the compute domain controller to track which nodes have allocated ResourceClaims and to update the ComputeDomain status accordingly. The status includes a list of nodes and an overall Ready/NotReady state derived from the numNodes spec field.

Key changes:
- Watch ResourceClaims with server-side label filtering for efficiency
- Update ComputeDomain status.nodes with the names of allocated nodes
- Set status.status to Ready when numNodes is 0 or when the number of allocated nodes reaches numNodes
- Add label constants to centralize label string usage
- Update ClusterRole RBAC to allow list/watch on resourceclaims
1 parent 384cf66 commit 6c151de

File tree

7 files changed

+385
-11
lines changed

7 files changed

+385
-11
lines changed

deploy/fake-gpu-operator/templates/compute-domain-controller/clusterrole.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ metadata:
77
rules:
88
- apiGroups: ["resource.k8s.io"]
99
resources: ["resourceclaims"]
10-
verbs: ["get"]
10+
verbs: ["get", "list", "watch"]
1111
- apiGroups: [""]
1212
resources: ["nodes"]
1313
verbs: ["get"]

internal/compute-domain-controller/app.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,19 @@ import (
66

77
"go.uber.org/zap/zapcore"
88
resourceapi "k8s.io/api/resource/v1"
9+
"k8s.io/apimachinery/pkg/labels"
910
"k8s.io/apimachinery/pkg/runtime"
1011
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
1112
ctrl "sigs.k8s.io/controller-runtime"
13+
"sigs.k8s.io/controller-runtime/pkg/cache"
14+
"sigs.k8s.io/controller-runtime/pkg/client"
1215
"sigs.k8s.io/controller-runtime/pkg/healthz"
1316
"sigs.k8s.io/controller-runtime/pkg/log/zap"
1417
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
1518

1619
computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
1720
"github.com/run-ai/fake-gpu-operator/internal/common/app"
21+
"github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts"
1822
)
1923

2024
var (
@@ -85,6 +89,11 @@ func (app *ComputeDomainApp) Run() {
8589
func (app *ComputeDomainApp) runController(ctx context.Context) error {
8690
cfg := ctrl.GetConfigOrDie()
8791

92+
computeDomainLabelSelector, err := labels.Parse(consts.ComputeDomainClaimLabel)
93+
if err != nil {
94+
return fmt.Errorf("failed to parse label selector: %w", err)
95+
}
96+
8897
mgr, err := ctrl.NewManager(cfg, ctrl.Options{
8998
Scheme: scheme,
9099
Metrics: metricsserver.Options{
@@ -93,6 +102,13 @@ func (app *ComputeDomainApp) runController(ctx context.Context) error {
93102
HealthProbeBindAddress: app.config.HealthProbeAddress,
94103
LeaderElection: app.config.LeaderElection,
95104
LeaderElectionID: "fake-compute-domain-controller",
105+
Cache: cache.Options{
106+
ByObject: map[client.Object]cache.ByObject{
107+
&resourceapi.ResourceClaim{}: {
108+
Label: computeDomainLabelSelector,
109+
},
110+
},
111+
},
96112
})
97113
if err != nil {
98114
return fmt.Errorf("failed to create controller manager: %w", err)

internal/compute-domain-controller/computedomain_controller.go

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,17 @@ package computedomaincontroller
1919
import (
2020
"context"
2121
"fmt"
22+
"sort"
2223

2324
resourceapi "k8s.io/api/resource/v1"
2425
apierrors "k8s.io/apimachinery/pkg/api/errors"
2526
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2627
"k8s.io/apimachinery/pkg/runtime"
28+
"k8s.io/apimachinery/pkg/types"
2729
ctrl "sigs.k8s.io/controller-runtime"
2830
"sigs.k8s.io/controller-runtime/pkg/client"
2931
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
32+
"sigs.k8s.io/controller-runtime/pkg/handler"
3033
"sigs.k8s.io/controller-runtime/pkg/log"
3134

3235
computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
@@ -59,6 +62,7 @@ type ComputeDomainReconciler struct {
5962
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/status,verbs=get;update;patch
6063
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/finalizers,verbs=update
6164
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;patch;delete
65+
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaims,verbs=get;list;watch
6266

6367
func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
6468
logger := log.FromContext(ctx)
@@ -81,6 +85,9 @@ func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Reques
8185
if err := r.ensureResourceClaimTemplates(ctx, domain); err != nil {
8286
return ctrl.Result{}, err
8387
}
88+
if err := r.updateStatus(ctx, domain); err != nil {
89+
return ctrl.Result{}, err
90+
}
8491

8592
logger.V(4).Info("reconciled ComputeDomain", "namespace", domain.Namespace, "name", domain.Name)
8693
return ctrl.Result{}, nil
@@ -141,8 +148,8 @@ func (r *ComputeDomainReconciler) ensureTemplate(
141148
Name: name,
142149
Namespace: domain.Namespace,
143150
Labels: map[string]string{
144-
"resource.nvidia.com/computeDomain": domain.Name,
145-
"resource.nvidia.com/computeDomainTarget": templateType,
151+
consts.ComputeDomainTemplateLabel: domain.Name,
152+
consts.ComputeDomainTemplateTargetLabel: templateType,
146153
},
147154
Finalizers: []string{
148155
consts.ComputeDomainFinalizer,
@@ -151,7 +158,7 @@ func (r *ComputeDomainReconciler) ensureTemplate(
151158
Spec: resourceapi.ResourceClaimTemplateSpec{
152159
ObjectMeta: metav1.ObjectMeta{
153160
Labels: map[string]string{
154-
"nvidia.com/computeDomain": domain.Name,
161+
consts.ComputeDomainClaimLabel: domain.Name,
155162
},
156163
},
157164
Spec: resourceapi.ResourceClaimSpec{
@@ -229,5 +236,89 @@ func (r *ComputeDomainReconciler) SetupWithManager(mgr ctrl.Manager) error {
229236
return ctrl.NewControllerManagedBy(mgr).
230237
For(&computedomainv1beta1.ComputeDomain{}).
231238
Owns(&resourceapi.ResourceClaimTemplate{}).
239+
Watches(
240+
&resourceapi.ResourceClaim{},
241+
handler.EnqueueRequestsFromMapFunc(r.mapResourceClaimToComputeDomain),
242+
).
232243
Complete(r)
233244
}
245+
246+
func (r *ComputeDomainReconciler) mapResourceClaimToComputeDomain(ctx context.Context, obj client.Object) []ctrl.Request {
247+
claim, ok := obj.(*resourceapi.ResourceClaim)
248+
if !ok {
249+
return nil
250+
}
251+
252+
domainName, exists := claim.Labels[consts.ComputeDomainClaimLabel]
253+
if !exists {
254+
return nil
255+
}
256+
257+
return []ctrl.Request{{
258+
NamespacedName: types.NamespacedName{
259+
Name: domainName,
260+
Namespace: claim.Namespace,
261+
},
262+
}}
263+
}
264+
265+
func (r *ComputeDomainReconciler) updateStatus(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
266+
claimList := &resourceapi.ResourceClaimList{}
267+
if err := r.List(ctx, claimList,
268+
client.InNamespace(domain.Namespace),
269+
client.MatchingLabels{consts.ComputeDomainClaimLabel: domain.Name},
270+
); err != nil {
271+
return err
272+
}
273+
274+
nodeSet := make(map[string]struct{})
275+
for _, claim := range claimList.Items {
276+
if claim.Status.Allocation == nil {
277+
continue
278+
}
279+
for _, result := range claim.Status.Allocation.Devices.Results {
280+
if result.Pool != "" {
281+
nodeSet[result.Pool] = struct{}{}
282+
}
283+
}
284+
}
285+
286+
nodes := make([]*computedomainv1beta1.ComputeDomainNode, 0, len(nodeSet))
287+
for nodeName := range nodeSet {
288+
nodes = append(nodes, &computedomainv1beta1.ComputeDomainNode{
289+
Name: nodeName,
290+
Status: computedomainv1beta1.ComputeDomainStatusReady,
291+
})
292+
}
293+
sort.Slice(nodes, func(i, j int) bool {
294+
return nodes[i].Name < nodes[j].Name
295+
})
296+
297+
status := computedomainv1beta1.ComputeDomainStatusNotReady
298+
if domain.Spec.NumNodes == 0 || len(nodes) >= domain.Spec.NumNodes {
299+
status = computedomainv1beta1.ComputeDomainStatusReady
300+
}
301+
302+
if !r.statusEqual(domain.Status, nodes, status) {
303+
domain.Status.Nodes = nodes
304+
domain.Status.Status = status
305+
return r.Status().Update(ctx, domain)
306+
}
307+
308+
return nil
309+
}
310+
311+
func (r *ComputeDomainReconciler) statusEqual(current computedomainv1beta1.ComputeDomainStatus, newNodes []*computedomainv1beta1.ComputeDomainNode, newStatus string) bool {
312+
if current.Status != newStatus {
313+
return false
314+
}
315+
if len(current.Nodes) != len(newNodes) {
316+
return false
317+
}
318+
for i, node := range current.Nodes {
319+
if node.Name != newNodes[i].Name {
320+
return false
321+
}
322+
}
323+
return true
324+
}

0 commit comments

Comments
 (0)