Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ metadata:
rules:
- apiGroups: ["resource.k8s.io"]
resources: ["resourceclaims"]
verbs: ["get"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get"]
Expand Down
16 changes: 16 additions & 0 deletions internal/compute-domain-controller/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,19 @@ import (

"go.uber.org/zap/zapcore"
resourceapi "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"

computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
"github.com/run-ai/fake-gpu-operator/internal/common/app"
"github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts"
)

var (
Expand Down Expand Up @@ -85,6 +89,11 @@ func (app *ComputeDomainApp) Run() {
func (app *ComputeDomainApp) runController(ctx context.Context) error {
cfg := ctrl.GetConfigOrDie()

computeDomainLabelSelector, err := labels.Parse(consts.ComputeDomainClaimLabel)
if err != nil {
return fmt.Errorf("failed to parse label selector: %w", err)
}

mgr, err := ctrl.NewManager(cfg, ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{
Expand All @@ -93,6 +102,13 @@ func (app *ComputeDomainApp) runController(ctx context.Context) error {
HealthProbeBindAddress: app.config.HealthProbeAddress,
LeaderElection: app.config.LeaderElection,
LeaderElectionID: "fake-compute-domain-controller",
Cache: cache.Options{
ByObject: map[client.Object]cache.ByObject{
&resourceapi.ResourceClaim{}: {
Label: computeDomainLabelSelector,
},
},
},
})
if err != nil {
return fmt.Errorf("failed to create controller manager: %w", err)
Expand Down
97 changes: 94 additions & 3 deletions internal/compute-domain-controller/computedomain_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,17 @@ package computedomaincontroller
import (
"context"
"fmt"
"sort"

resourceapi "k8s.io/api/resource/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"

computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
Expand Down Expand Up @@ -59,6 +62,7 @@ type ComputeDomainReconciler struct {
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/finalizers,verbs=update
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaims,verbs=get;list;watch

func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
logger := log.FromContext(ctx)
Expand All @@ -81,6 +85,9 @@ func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Reques
if err := r.ensureResourceClaimTemplates(ctx, domain); err != nil {
return ctrl.Result{}, err
}
if err := r.updateStatus(ctx, domain); err != nil {
return ctrl.Result{}, err
}

logger.V(4).Info("reconciled ComputeDomain", "namespace", domain.Namespace, "name", domain.Name)
return ctrl.Result{}, nil
Expand Down Expand Up @@ -141,8 +148,8 @@ func (r *ComputeDomainReconciler) ensureTemplate(
Name: name,
Namespace: domain.Namespace,
Labels: map[string]string{
"resource.nvidia.com/computeDomain": domain.Name,
"resource.nvidia.com/computeDomainTarget": templateType,
consts.ComputeDomainTemplateLabel: domain.Name,
consts.ComputeDomainTemplateTargetLabel: templateType,
},
Finalizers: []string{
consts.ComputeDomainFinalizer,
Expand All @@ -151,7 +158,7 @@ func (r *ComputeDomainReconciler) ensureTemplate(
Spec: resourceapi.ResourceClaimTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"nvidia.com/computeDomain": domain.Name,
consts.ComputeDomainClaimLabel: domain.Name,
},
},
Spec: resourceapi.ResourceClaimSpec{
Expand Down Expand Up @@ -229,5 +236,89 @@ func (r *ComputeDomainReconciler) SetupWithManager(mgr ctrl.Manager) error {
return ctrl.NewControllerManagedBy(mgr).
For(&computedomainv1beta1.ComputeDomain{}).
Owns(&resourceapi.ResourceClaimTemplate{}).
Watches(
&resourceapi.ResourceClaim{},
handler.EnqueueRequestsFromMapFunc(r.mapResourceClaimToComputeDomain),
).
Complete(r)
}

func (r *ComputeDomainReconciler) mapResourceClaimToComputeDomain(ctx context.Context, obj client.Object) []ctrl.Request {
claim, ok := obj.(*resourceapi.ResourceClaim)
if !ok {
return nil
}

domainName, exists := claim.Labels[consts.ComputeDomainClaimLabel]
if !exists {
return nil
}

return []ctrl.Request{{
NamespacedName: types.NamespacedName{
Name: domainName,
Namespace: claim.Namespace,
},
}}
}

func (r *ComputeDomainReconciler) updateStatus(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
claimList := &resourceapi.ResourceClaimList{}
if err := r.List(ctx, claimList,
client.InNamespace(domain.Namespace),
client.MatchingLabels{consts.ComputeDomainClaimLabel: domain.Name},
); err != nil {
return err
}

nodeSet := make(map[string]struct{})
for _, claim := range claimList.Items {
if claim.Status.Allocation == nil {
continue
}
for _, result := range claim.Status.Allocation.Devices.Results {
if result.Pool != "" {
nodeSet[result.Pool] = struct{}{}
}
}
}

nodes := make([]*computedomainv1beta1.ComputeDomainNode, 0, len(nodeSet))
for nodeName := range nodeSet {
nodes = append(nodes, &computedomainv1beta1.ComputeDomainNode{
Name: nodeName,
Status: computedomainv1beta1.ComputeDomainStatusReady,
})
}
sort.Slice(nodes, func(i, j int) bool {
return nodes[i].Name < nodes[j].Name
})

status := computedomainv1beta1.ComputeDomainStatusNotReady
if domain.Spec.NumNodes == 0 || len(nodes) >= domain.Spec.NumNodes {
status = computedomainv1beta1.ComputeDomainStatusReady
}

if !r.statusEqual(domain.Status, nodes, status) {
domain.Status.Nodes = nodes
domain.Status.Status = status
return r.Status().Update(ctx, domain)
}

return nil
}

func (r *ComputeDomainReconciler) statusEqual(current computedomainv1beta1.ComputeDomainStatus, newNodes []*computedomainv1beta1.ComputeDomainNode, newStatus string) bool {
if current.Status != newStatus {
return false
}
if len(current.Nodes) != len(newNodes) {
return false
}
for i, node := range current.Nodes {
if node.Name != newNodes[i].Name {
return false
}
}
return true
}
Loading
Loading