|
| 1 | +/* |
| 2 | + * Copyright 2025 The Kubernetes Authors. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +package main |
| 18 | + |
import (
	"context"
	"encoding/json"
	"fmt"

	resourceapi "k8s.io/api/resource/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/log"

	computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
	"github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts"
)
| 35 | + |
// DefaultComputeDomainAllocationMode is the allocation mode written into a
// channel configuration when the ComputeDomain does not specify one.
const DefaultComputeDomainAllocationMode = "Single"
| 40 | + |
| 41 | +// ComputeDomainReconciler watches ComputeDomain resources and keeps the |
| 42 | +// associated ResourceClaimTemplates in sync. |
| 43 | +type ComputeDomainReconciler struct { |
| 44 | + client.Client |
| 45 | + Scheme *runtime.Scheme |
| 46 | +} |
| 47 | + |
| 48 | +//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains,verbs=get;list;watch;create;update;patch;delete |
| 49 | +//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/status,verbs=get;update;patch |
| 50 | +//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/finalizers,verbs=update |
| 51 | +//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;patch;delete |
| 52 | + |
| 53 | +func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { |
| 54 | + logger := log.FromContext(ctx) |
| 55 | + |
| 56 | + domain := &computedomainv1beta1.ComputeDomain{} |
| 57 | + if err := r.Get(ctx, req.NamespacedName, domain); err != nil { |
| 58 | + return ctrl.Result{}, client.IgnoreNotFound(err) |
| 59 | + } |
| 60 | + |
| 61 | + if domain.DeletionTimestamp.IsZero() { |
| 62 | + if err := r.ensureFinalizer(ctx, domain); err != nil { |
| 63 | + return ctrl.Result{}, err |
| 64 | + } |
| 65 | + if err := r.ensureResourceClaimTemplates(ctx, domain); err != nil { |
| 66 | + return ctrl.Result{}, err |
| 67 | + } |
| 68 | + } else { |
| 69 | + if err := r.handleDeletion(ctx, domain); err != nil { |
| 70 | + return ctrl.Result{}, err |
| 71 | + } |
| 72 | + return ctrl.Result{}, nil |
| 73 | + } |
| 74 | + |
| 75 | + logger.V(4).Info("reconciled ComputeDomain", "namespace", domain.Namespace, "name", domain.Name) |
| 76 | + return ctrl.Result{}, nil |
| 77 | +} |
| 78 | + |
| 79 | +func (r *ComputeDomainReconciler) ensureFinalizer(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error { |
| 80 | + if controllerutil.ContainsFinalizer(domain, consts.ComputeDomainFinalizer) { |
| 81 | + return nil |
| 82 | + } |
| 83 | + |
| 84 | + controllerutil.AddFinalizer(domain, consts.ComputeDomainFinalizer) |
| 85 | + return r.Update(ctx, domain) |
| 86 | +} |
| 87 | + |
| 88 | +func (r *ComputeDomainReconciler) handleDeletion(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error { |
| 89 | + if !controllerutil.ContainsFinalizer(domain, consts.ComputeDomainFinalizer) { |
| 90 | + return nil |
| 91 | + } |
| 92 | + |
| 93 | + if err := r.deleteResourceClaimTemplates(ctx, domain); err != nil { |
| 94 | + return err |
| 95 | + } |
| 96 | + |
| 97 | + controllerutil.RemoveFinalizer(domain, consts.ComputeDomainFinalizer) |
| 98 | + return r.Update(ctx, domain) |
| 99 | +} |
| 100 | + |
| 101 | +func (r *ComputeDomainReconciler) ensureResourceClaimTemplates(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error { |
| 102 | + return r.ensureTemplate(ctx, domain, domain.Name, consts.ComputeDomainWorkloadDeviceClass, "workload") |
| 103 | +} |
| 104 | + |
| 105 | +func (r *ComputeDomainReconciler) getAllocationMode(domain *computedomainv1beta1.ComputeDomain) string { |
| 106 | + if domain.Spec.Channel != nil && domain.Spec.Channel.AllocationMode != "" { |
| 107 | + return domain.Spec.Channel.AllocationMode |
| 108 | + } |
| 109 | + return DefaultComputeDomainAllocationMode |
| 110 | +} |
| 111 | + |
| 112 | +func (r *ComputeDomainReconciler) ensureTemplate( |
| 113 | + ctx context.Context, |
| 114 | + domain *computedomainv1beta1.ComputeDomain, |
| 115 | + name string, |
| 116 | + deviceClass string, |
| 117 | + templateType string, |
| 118 | +) error { |
| 119 | + key := client.ObjectKey{Namespace: domain.Namespace, Name: name} |
| 120 | + existing := &resourceapi.ResourceClaimTemplate{} |
| 121 | + err := r.Get(ctx, key, existing) |
| 122 | + if err == nil { |
| 123 | + return nil |
| 124 | + } |
| 125 | + if !apierrors.IsNotFound(err) { |
| 126 | + return err |
| 127 | + } |
| 128 | + |
| 129 | + template := &resourceapi.ResourceClaimTemplate{ |
| 130 | + ObjectMeta: metav1.ObjectMeta{ |
| 131 | + Name: name, |
| 132 | + Namespace: domain.Namespace, |
| 133 | + Labels: map[string]string{ |
| 134 | + "resource.nvidia.com/computeDomain": domain.Name, |
| 135 | + "resource.nvidia.com/computeDomainTarget": templateType, |
| 136 | + }, |
| 137 | + Finalizers: []string{ |
| 138 | + "resource.nvidia.com/computeDomain", |
| 139 | + }, |
| 140 | + }, |
| 141 | + Spec: resourceapi.ResourceClaimTemplateSpec{ |
| 142 | + ObjectMeta: metav1.ObjectMeta{ |
| 143 | + Labels: map[string]string{ |
| 144 | + "nvidia.com/computeDomain": domain.Name, |
| 145 | + }, |
| 146 | + }, |
| 147 | + Spec: resourceapi.ResourceClaimSpec{ |
| 148 | + Devices: resourceapi.DeviceClaim{ |
| 149 | + Config: []resourceapi.DeviceClaimConfiguration{ |
| 150 | + { |
| 151 | + DeviceConfiguration: resourceapi.DeviceConfiguration{ |
| 152 | + Opaque: &resourceapi.OpaqueDeviceConfiguration{ |
| 153 | + Driver: consts.ComputeDomainDriverName, |
| 154 | + Parameters: runtime.RawExtension{ |
| 155 | + Raw: []byte(fmt.Sprintf(`{ |
| 156 | + "allocationMode": "%s", |
| 157 | + "apiVersion": "resource.nvidia.com/v1beta1", |
| 158 | + "domainID": "%s", |
| 159 | + "kind": "ComputeDomainChannelConfig" |
| 160 | + }`, r.getAllocationMode(domain), domain.UID)), |
| 161 | + }, |
| 162 | + }, |
| 163 | + }, |
| 164 | + }, |
| 165 | + }, |
| 166 | + Requests: []resourceapi.DeviceRequest{ |
| 167 | + { |
| 168 | + Name: "channel", |
| 169 | + Exactly: &resourceapi.ExactDeviceRequest{ |
| 170 | + AllocationMode: resourceapi.DeviceAllocationModeExactCount, |
| 171 | + Count: 1, |
| 172 | + DeviceClassName: deviceClass, |
| 173 | + }, |
| 174 | + }, |
| 175 | + }, |
| 176 | + }, |
| 177 | + }, |
| 178 | + }, |
| 179 | + } |
| 180 | + |
| 181 | + if err := controllerutil.SetControllerReference(domain, template, r.Scheme); err != nil { |
| 182 | + return err |
| 183 | + } |
| 184 | + |
| 185 | + return client.IgnoreAlreadyExists(r.Create(ctx, template)) |
| 186 | +} |
| 187 | + |
| 188 | +func (r *ComputeDomainReconciler) deleteResourceClaimTemplates(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error { |
| 189 | + template := &resourceapi.ResourceClaimTemplate{ |
| 190 | + ObjectMeta: metav1.ObjectMeta{ |
| 191 | + Name: domain.Name, |
| 192 | + Namespace: domain.Namespace, |
| 193 | + }, |
| 194 | + } |
| 195 | + if err := r.Delete(ctx, template); err != nil && !apierrors.IsNotFound(err) { |
| 196 | + return err |
| 197 | + } |
| 198 | + return nil |
| 199 | +} |
| 200 | + |
| 201 | +// SetupWithManager wires the reconciler into the controller-runtime manager. |
| 202 | +func (r *ComputeDomainReconciler) SetupWithManager(mgr ctrl.Manager) error { |
| 203 | + return ctrl.NewControllerManagedBy(mgr). |
| 204 | + For(&computedomainv1beta1.ComputeDomain{}). |
| 205 | + Owns(&resourceapi.ResourceClaimTemplate{}). |
| 206 | + Complete(r) |
| 207 | +} |
0 commit comments