Skip to content

Commit 0453e8f

Browse files
committed
Add fake compute domain controller
1 parent f48e96c commit 0453e8f

File tree

17 files changed: 902 additions and 76 deletions.

Dockerfile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ COPY ./cmd/mig-faker/ ./cmd/mig-faker/
4242
COPY ./internal/ ./internal/
4343
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=mig-faker
4444

45+
FROM common-builder AS compute-domain-controller-builder
46+
COPY ./cmd/compute-domain-controller/ ./cmd/compute-domain-controller/
47+
COPY ./pkg/compute-domain/ ./pkg/compute-domain/
48+
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=compute-domain-controller
49+
4550
FROM common-builder AS dra-plugin-gpu-builder
4651
COPY ./cmd/dra-plugin-gpu/ ./cmd/dra-plugin-gpu/
4752
COPY ./internal/dra-plugin-gpu/ ./internal/dra-plugin-gpu/
@@ -101,3 +106,7 @@ ENTRYPOINT ["/bin/dra-plugin-gpu"]
101106
FROM ubuntu AS kwok-dra-plugin
102107
COPY --from=kwok-dra-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-dra-plugin /bin/
103108
ENTRYPOINT ["/bin/kwok-dra-plugin"]
109+
110+
FROM ubuntu AS compute-domain-controller
111+
COPY --from=compute-domain-controller-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/compute-domain-controller /bin/
112+
ENTRYPOINT ["/bin/compute-domain-controller"]

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
BUILD_DIR=$(shell pwd)/bin
2-
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker
2+
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker compute-domain-controller
33

44
DOCKER_REPO_BASE=ghcr.io/run-ai/fake-gpu-operator
55
DOCKER_TAG?=0.0.0-dev
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/*
2+
* Copyright 2025 The Kubernetes Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
	"context"
	"encoding/json"
	"fmt"

	resourceapi "k8s.io/api/resource/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
	"sigs.k8s.io/controller-runtime/pkg/log"

	computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
	"github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts"
)
35+
36+
// DefaultComputeDomainAllocationMode is the allocation mode written into a
// generated channel configuration when the ComputeDomain does not set one.
const DefaultComputeDomainAllocationMode = "Single"
40+
41+
// ComputeDomainReconciler watches ComputeDomain resources and keeps the
42+
// associated ResourceClaimTemplates in sync.
43+
type ComputeDomainReconciler struct {
44+
client.Client
45+
Scheme *runtime.Scheme
46+
}
47+
48+
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains,verbs=get;list;watch;create;update;patch;delete
49+
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/status,verbs=get;update;patch
50+
//+kubebuilder:rbac:groups=resource.nvidia.com,resources=computedomains/finalizers,verbs=update
51+
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch;create;update;patch;delete
52+
53+
func (r *ComputeDomainReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
54+
logger := log.FromContext(ctx)
55+
56+
domain := &computedomainv1beta1.ComputeDomain{}
57+
if err := r.Get(ctx, req.NamespacedName, domain); err != nil {
58+
return ctrl.Result{}, client.IgnoreNotFound(err)
59+
}
60+
61+
if domain.DeletionTimestamp.IsZero() {
62+
if err := r.ensureFinalizer(ctx, domain); err != nil {
63+
return ctrl.Result{}, err
64+
}
65+
if err := r.ensureResourceClaimTemplates(ctx, domain); err != nil {
66+
return ctrl.Result{}, err
67+
}
68+
} else {
69+
if err := r.handleDeletion(ctx, domain); err != nil {
70+
return ctrl.Result{}, err
71+
}
72+
return ctrl.Result{}, nil
73+
}
74+
75+
logger.V(4).Info("reconciled ComputeDomain", "namespace", domain.Namespace, "name", domain.Name)
76+
return ctrl.Result{}, nil
77+
}
78+
79+
func (r *ComputeDomainReconciler) ensureFinalizer(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
80+
if controllerutil.ContainsFinalizer(domain, consts.ComputeDomainFinalizer) {
81+
return nil
82+
}
83+
84+
controllerutil.AddFinalizer(domain, consts.ComputeDomainFinalizer)
85+
return r.Update(ctx, domain)
86+
}
87+
88+
func (r *ComputeDomainReconciler) handleDeletion(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
89+
if !controllerutil.ContainsFinalizer(domain, consts.ComputeDomainFinalizer) {
90+
return nil
91+
}
92+
93+
if err := r.deleteResourceClaimTemplates(ctx, domain); err != nil {
94+
return err
95+
}
96+
97+
controllerutil.RemoveFinalizer(domain, consts.ComputeDomainFinalizer)
98+
return r.Update(ctx, domain)
99+
}
100+
101+
func (r *ComputeDomainReconciler) ensureResourceClaimTemplates(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
102+
return r.ensureTemplate(ctx, domain, domain.Name, consts.ComputeDomainWorkloadDeviceClass, "workload")
103+
}
104+
105+
func (r *ComputeDomainReconciler) getAllocationMode(domain *computedomainv1beta1.ComputeDomain) string {
106+
if domain.Spec.Channel != nil && domain.Spec.Channel.AllocationMode != "" {
107+
return domain.Spec.Channel.AllocationMode
108+
}
109+
return DefaultComputeDomainAllocationMode
110+
}
111+
112+
func (r *ComputeDomainReconciler) ensureTemplate(
113+
ctx context.Context,
114+
domain *computedomainv1beta1.ComputeDomain,
115+
name string,
116+
deviceClass string,
117+
templateType string,
118+
) error {
119+
key := client.ObjectKey{Namespace: domain.Namespace, Name: name}
120+
existing := &resourceapi.ResourceClaimTemplate{}
121+
err := r.Get(ctx, key, existing)
122+
if err == nil {
123+
return nil
124+
}
125+
if !apierrors.IsNotFound(err) {
126+
return err
127+
}
128+
129+
template := &resourceapi.ResourceClaimTemplate{
130+
ObjectMeta: metav1.ObjectMeta{
131+
Name: name,
132+
Namespace: domain.Namespace,
133+
Labels: map[string]string{
134+
"resource.nvidia.com/computeDomain": domain.Name,
135+
"resource.nvidia.com/computeDomainTarget": templateType,
136+
},
137+
Finalizers: []string{
138+
"resource.nvidia.com/computeDomain",
139+
},
140+
},
141+
Spec: resourceapi.ResourceClaimTemplateSpec{
142+
ObjectMeta: metav1.ObjectMeta{
143+
Labels: map[string]string{
144+
"nvidia.com/computeDomain": domain.Name,
145+
},
146+
},
147+
Spec: resourceapi.ResourceClaimSpec{
148+
Devices: resourceapi.DeviceClaim{
149+
Config: []resourceapi.DeviceClaimConfiguration{
150+
{
151+
DeviceConfiguration: resourceapi.DeviceConfiguration{
152+
Opaque: &resourceapi.OpaqueDeviceConfiguration{
153+
Driver: consts.ComputeDomainDriverName,
154+
Parameters: runtime.RawExtension{
155+
Raw: []byte(fmt.Sprintf(`{
156+
"allocationMode": "%s",
157+
"apiVersion": "resource.nvidia.com/v1beta1",
158+
"domainID": "%s",
159+
"kind": "ComputeDomainChannelConfig"
160+
}`, r.getAllocationMode(domain), domain.UID)),
161+
},
162+
},
163+
},
164+
},
165+
},
166+
Requests: []resourceapi.DeviceRequest{
167+
{
168+
Name: "channel",
169+
Exactly: &resourceapi.ExactDeviceRequest{
170+
AllocationMode: resourceapi.DeviceAllocationModeExactCount,
171+
Count: 1,
172+
DeviceClassName: deviceClass,
173+
},
174+
},
175+
},
176+
},
177+
},
178+
},
179+
}
180+
181+
if err := controllerutil.SetControllerReference(domain, template, r.Scheme); err != nil {
182+
return err
183+
}
184+
185+
return client.IgnoreAlreadyExists(r.Create(ctx, template))
186+
}
187+
188+
func (r *ComputeDomainReconciler) deleteResourceClaimTemplates(ctx context.Context, domain *computedomainv1beta1.ComputeDomain) error {
189+
template := &resourceapi.ResourceClaimTemplate{
190+
ObjectMeta: metav1.ObjectMeta{
191+
Name: domain.Name,
192+
Namespace: domain.Namespace,
193+
},
194+
}
195+
if err := r.Delete(ctx, template); err != nil && !apierrors.IsNotFound(err) {
196+
return err
197+
}
198+
return nil
199+
}
200+
201+
// SetupWithManager wires the reconciler into the controller-runtime manager.
202+
func (r *ComputeDomainReconciler) SetupWithManager(mgr ctrl.Manager) error {
203+
return ctrl.NewControllerManagedBy(mgr).
204+
For(&computedomainv1beta1.ComputeDomain{}).
205+
Owns(&resourceapi.ResourceClaimTemplate{}).
206+
Complete(r)
207+
}
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
/*
2+
* Copyright 2025 The Kubernetes Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"testing"
22+
23+
"github.com/stretchr/testify/assert"
24+
"github.com/stretchr/testify/require"
25+
26+
resourceapi "k8s.io/api/resource/v1"
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
"k8s.io/apimachinery/pkg/runtime"
29+
"k8s.io/apimachinery/pkg/types"
30+
ctrl "sigs.k8s.io/controller-runtime"
31+
"sigs.k8s.io/controller-runtime/pkg/client"
32+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
33+
34+
computedomainv1beta1 "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
35+
"github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts"
36+
)
37+
38+
func TestComputeDomainReconciler_Reconcile(t *testing.T) {
39+
scheme := runtime.NewScheme()
40+
_ = resourceapi.AddToScheme(scheme)
41+
_ = computedomainv1beta1.AddToScheme(scheme)
42+
43+
tests := map[string]struct {
44+
computeDomain *computedomainv1beta1.ComputeDomain
45+
expectedWorkloadTemplate bool
46+
expectedFinalizer bool
47+
}{
48+
"new ComputeDomain creates templates": {
49+
computeDomain: &computedomainv1beta1.ComputeDomain{
50+
ObjectMeta: metav1.ObjectMeta{
51+
Name: "test-domain",
52+
Namespace: "default",
53+
UID: "test-uid",
54+
},
55+
},
56+
expectedWorkloadTemplate: true,
57+
expectedFinalizer: true,
58+
},
59+
}
60+
61+
for name, test := range tests {
62+
t.Run(name, func(t *testing.T) {
63+
// Setup
64+
objs := []client.Object{test.computeDomain}
65+
66+
fakeClient := fake.NewClientBuilder().
67+
WithScheme(scheme).
68+
WithObjects(objs...).
69+
Build()
70+
71+
reconciler := &ComputeDomainReconciler{
72+
Client: fakeClient,
73+
Scheme: scheme,
74+
}
75+
76+
// Execute
77+
req := ctrl.Request{
78+
NamespacedName: types.NamespacedName{
79+
Name: test.computeDomain.GetName(),
80+
Namespace: test.computeDomain.GetNamespace(),
81+
},
82+
}
83+
84+
result, err := reconciler.Reconcile(context.Background(), req)
85+
86+
// Verify
87+
require.NoError(t, err)
88+
assert.Equal(t, ctrl.Result{}, result)
89+
90+
// Check ResourceClaimTemplates
91+
if test.expectedWorkloadTemplate {
92+
workloadTemplate := &resourceapi.ResourceClaimTemplate{}
93+
err := fakeClient.Get(context.Background(), types.NamespacedName{
94+
Name: "test-domain",
95+
Namespace: "default",
96+
}, workloadTemplate)
97+
assert.NoError(t, err)
98+
// Check config section
99+
assert.Len(t, workloadTemplate.Spec.Spec.Devices.Config, 1)
100+
assert.Equal(t, consts.ComputeDomainDriverName, workloadTemplate.Spec.Spec.Devices.Config[0].Opaque.Driver)
101+
// Check requests - only channel request expected
102+
assert.Len(t, workloadTemplate.Spec.Spec.Devices.Requests, 1)
103+
assert.Equal(t, "channel", workloadTemplate.Spec.Spec.Devices.Requests[0].Name)
104+
assert.Equal(t, resourceapi.DeviceAllocationModeExactCount, workloadTemplate.Spec.Spec.Devices.Requests[0].Exactly.AllocationMode)
105+
assert.Equal(t, int64(1), workloadTemplate.Spec.Spec.Devices.Requests[0].Exactly.Count)
106+
assert.Equal(t, consts.ComputeDomainWorkloadDeviceClass, workloadTemplate.Spec.Spec.Devices.Requests[0].Exactly.DeviceClassName)
107+
// Check labels
108+
assert.Equal(t, test.computeDomain.GetName(), workloadTemplate.Labels["resource.nvidia.com/computeDomain"])
109+
assert.Equal(t, "workload", workloadTemplate.Labels["resource.nvidia.com/computeDomainTarget"])
110+
// Check labels copied into generated claims
111+
assert.Equal(t, test.computeDomain.GetName(), workloadTemplate.Spec.Labels["nvidia.com/computeDomain"])
112+
// Check finalizers
113+
assert.Contains(t, workloadTemplate.Finalizers, "resource.nvidia.com/computeDomain")
114+
}
115+
116+
// Check finalizer
117+
updatedDomain := &computedomainv1beta1.ComputeDomain{}
118+
err = fakeClient.Get(context.Background(), req.NamespacedName, updatedDomain)
119+
require.NoError(t, err)
120+
121+
finalizers := updatedDomain.GetFinalizers()
122+
hasFinalizer := false
123+
for _, f := range finalizers {
124+
if f == consts.ComputeDomainFinalizer {
125+
hasFinalizer = true
126+
break
127+
}
128+
}
129+
assert.Equal(t, test.expectedFinalizer, hasFinalizer)
130+
})
131+
}
132+
}
133+
134+
func TestComputeDomainReconciler_Reconcile_NotFound(t *testing.T) {
135+
scheme := runtime.NewScheme()
136+
_ = resourceapi.AddToScheme(scheme)
137+
_ = computedomainv1beta1.AddToScheme(scheme)
138+
139+
fakeClient := fake.NewClientBuilder().
140+
WithScheme(scheme).
141+
Build()
142+
143+
reconciler := &ComputeDomainReconciler{
144+
Client: fakeClient,
145+
Scheme: scheme,
146+
}
147+
148+
req := ctrl.Request{
149+
NamespacedName: types.NamespacedName{
150+
Name: "non-existent",
151+
Namespace: "default",
152+
},
153+
}
154+
155+
result, err := reconciler.Reconcile(context.Background(), req)
156+
assert.NoError(t, err)
157+
assert.Equal(t, ctrl.Result{}, result)
158+
}

0 commit comments

Comments
 (0)