Skip to content

Commit 3f563a8

Browse files
committed
add compute domain DRA plugin skeleton
Add the infrastructure for a compute domain DRA plugin that registers with kubelet and publishes resource slices for channel devices, but does not yet implement actual device preparation (Prepare/Unprepare are stubs). This includes:
- cmd/compute-domain-dra-plugin entry point
- internal/compute-domain-dra-plugin with app, driver, and discovery logic
- Helm templates for DaemonSet, RBAC, and DeviceClass
- Dockerfile and Makefile updates
1 parent b36cefe commit 3f563a8

File tree

16 files changed

+657
-2
lines changed

16 files changed

+657
-2
lines changed

Dockerfile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ COPY ./pkg/compute-domain/ ./pkg/compute-domain/
4848
COPY ./internal/compute-domain-controller/ ./internal/compute-domain-controller/
4949
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=compute-domain-controller
5050

51+
FROM common-builder AS compute-domain-dra-plugin-builder
52+
COPY ./cmd/compute-domain-dra-plugin/ ./cmd/compute-domain-dra-plugin/
53+
COPY ./pkg/compute-domain/ ./pkg/compute-domain/
54+
COPY ./internal/compute-domain-dra-plugin/ ./internal/compute-domain-dra-plugin/
55+
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=compute-domain-dra-plugin
56+
5157
FROM common-builder AS dra-plugin-gpu-builder
5258
COPY ./cmd/dra-plugin-gpu/ ./cmd/dra-plugin-gpu/
5359
COPY ./internal/dra-plugin-gpu/ ./internal/dra-plugin-gpu/
@@ -111,3 +117,7 @@ ENTRYPOINT ["/bin/kwok-dra-plugin"]
111117
FROM ubuntu AS compute-domain-controller
112118
COPY --from=compute-domain-controller-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/compute-domain-controller /bin/
113119
ENTRYPOINT ["/bin/compute-domain-controller"]
120+
121+
FROM ubuntu AS compute-domain-dra-plugin
122+
COPY --from=compute-domain-dra-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/compute-domain-dra-plugin /bin/
123+
ENTRYPOINT ["/bin/compute-domain-dra-plugin"]

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
BUILD_DIR=$(shell pwd)/bin
2-
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker compute-domain-controller
2+
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker compute-domain-controller compute-domain-dra-plugin
33

44
DOCKER_REPO_BASE=ghcr.io/run-ai/fake-gpu-operator
55
DOCKER_TAG?=0.0.0-dev
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Copyright 2025 The Kubernetes Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"github.com/run-ai/fake-gpu-operator/internal/common/app"
21+
plugin "github.com/run-ai/fake-gpu-operator/internal/compute-domain-dra-plugin"
22+
)
23+
24+
// main is the entry point of the compute-domain DRA plugin binary. It wraps the
// app returned by plugin.NewComputeDomainDRAPluginApp in an app runner created
// with app.NewAppRunner and then calls Run on that runner.
func main() {
25+
appRunner := app.NewAppRunner(plugin.NewComputeDomainDRAPluginApp())
26+
appRunner.Run()
27+
}
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
apiVersion: {{ include "dra-example-driver.resourceApiVersion" . }}
3+
kind: DeviceClass
4+
metadata:
5+
name: compute-domain-default-channel.nvidia.com
6+
spec:
7+
selectors:
8+
- cel:
9+
expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'channel' && device.attributes['compute-domain.nvidia.com'].id == 0"
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
{{- define "fake-gpu-operator.compute-domain-dra-plugin.common.metadata.labels" -}}
2+
app: compute-domain-dra-plugin
3+
{{- end -}}
4+
5+
{{- define "fake-gpu-operator.compute-domain-dra-plugin.common.metadata.name" -}}
6+
compute-domain-dra-plugin
7+
{{- end -}}
8+
9+
{{- define "fake-gpu-operator.compute-domain-dra-plugin.common.podSelector" }}
10+
matchLabels:
11+
app: compute-domain-dra-plugin
12+
component: compute-domain-dra-plugin
13+
{{- end }}
14+
15+
{{- define "fake-gpu-operator.compute-domain-dra-plugin.common.podTemplate.metadata" }}
16+
annotations:
17+
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
18+
labels:
19+
app: compute-domain-dra-plugin
20+
component: compute-domain-dra-plugin
21+
{{- end }}
22+
23+
{{- define "fake-gpu-operator.compute-domain-dra-plugin.common.podTemplate.spec" }}
24+
containers:
25+
- image: "{{ .Values.computeDomainDraPlugin.image.repository }}:{{ .Values.computeDomainDraPlugin.image.tag | default .Chart.AppVersion }}"
26+
imagePullPolicy: "{{ .Values.computeDomainDraPlugin.image.pullPolicy }}"
27+
resources:
28+
{{- toYaml .Values.computeDomainDraPlugin.resources | nindent 12 }}
29+
env:
30+
- name: NODE_NAME
31+
valueFrom:
32+
fieldRef:
33+
fieldPath: spec.nodeName
34+
- name: CDI_ROOT
35+
value: "/etc/cdi"
36+
- name: KUBELET_REGISTRAR_DIRECTORY_PATH
37+
value: "/var/lib/kubelet/plugins_registry"
38+
- name: KUBELET_PLUGINS_DIRECTORY_PATH
39+
value: "/var/lib/kubelet/plugins"
40+
name: compute-domain-dra-plugin-ctr
41+
securityContext:
42+
privileged: true
43+
terminationMessagePath: /dev/termination-log
44+
terminationMessagePolicy: File
45+
volumeMounts:
46+
- mountPath: /var/lib/kubelet/plugins_registry
47+
name: plugins-registry
48+
- mountPath: /var/lib/kubelet/plugins
49+
name: plugins
50+
- mountPath: /etc/cdi
51+
name: cdi
52+
dnsPolicy: ClusterFirst
53+
restartPolicy: Always
54+
serviceAccountName: compute-domain-dra-plugin
55+
terminationGracePeriodSeconds: 30
56+
tolerations:
57+
- effect: NoSchedule
58+
key: nvidia.com/gpu
59+
operator: Exists
60+
imagePullSecrets:
61+
- name: gcr-secret
62+
volumes:
63+
- hostPath:
64+
path: /var/lib/kubelet/plugins_registry
65+
type: DirectoryOrCreate
66+
name: plugins-registry
67+
- hostPath:
68+
path: /var/lib/kubelet/plugins
69+
type: DirectoryOrCreate
70+
name: plugins
71+
- hostPath:
72+
path: /etc/cdi
73+
type: DirectoryOrCreate
74+
name: cdi
75+
{{- end }}
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{{- if .Values.computeDomainDraPlugin.enabled -}}
2+
apiVersion: rbac.authorization.k8s.io/v1
3+
kind: ClusterRole
4+
metadata:
5+
name: fake-compute-domain-dra-plugin
6+
rules:
7+
- apiGroups:
8+
- ""
9+
resources:
10+
- nodes
11+
verbs:
12+
- get
13+
- list
14+
- watch
15+
- apiGroups:
16+
- resource.k8s.io
17+
resources:
18+
- resourceclaims
19+
verbs:
20+
- get
21+
- apiGroups:
22+
- resource.k8s.io
23+
resources:
24+
- resourceslices
25+
verbs:
26+
- get
27+
- list
28+
- watch
29+
- create
30+
- update
31+
- delete
32+
{{- end -}}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{{- if .Values.computeDomainDraPlugin.enabled -}}
2+
apiVersion: rbac.authorization.k8s.io/v1
3+
kind: ClusterRoleBinding
4+
metadata:
5+
name: fake-compute-domain-dra-plugin
6+
roleRef:
7+
kind: ClusterRole
8+
apiGroup: rbac.authorization.k8s.io
9+
name: fake-compute-domain-dra-plugin
10+
subjects:
11+
- kind: ServiceAccount
12+
name: compute-domain-dra-plugin
13+
namespace: "{{ .Release.Namespace }}"
14+
{{- end -}}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{{- if .Values.computeDomainDraPlugin.enabled -}}
2+
apiVersion: apps/v1
3+
kind: DaemonSet
4+
metadata:
5+
name: {{ include "fake-gpu-operator.compute-domain-dra-plugin.common.metadata.name" . }}
6+
labels:
7+
{{- include "fake-gpu-operator.compute-domain-dra-plugin.common.metadata.labels" . | nindent 4 }}
8+
spec:
9+
selector:
10+
{{- include "fake-gpu-operator.compute-domain-dra-plugin.common.podSelector" . | nindent 4 }}
11+
template:
12+
metadata:
13+
{{- include "fake-gpu-operator.compute-domain-dra-plugin.common.podTemplate.metadata" . | nindent 6 }}
14+
spec:
15+
{{- include "fake-gpu-operator.compute-domain-dra-plugin.common.podTemplate.spec" . | nindent 6 }}
16+
nodeSelector:
17+
nvidia.com/gpu.deploy.compute-domain-dra-plugin: "true"
18+
{{- end -}}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
{{- if .Values.computeDomainDraPlugin.enabled -}}
2+
apiVersion: v1
3+
kind: ServiceAccount
4+
metadata:
5+
name: compute-domain-dra-plugin
6+
{{- end -}}

deploy/fake-gpu-operator/values.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,3 +191,17 @@ computeDomainController:
191191
limits:
192192
cpu: "200m"
193193
memory: "400Mi"
194+
195+
computeDomainDraPlugin:
196+
enabled: false
197+
image:
198+
pullPolicy: Always
199+
repository: ghcr.io/run-ai/fake-gpu-operator/compute-domain-dra-plugin
200+
tag: ""
201+
resources:
202+
requests:
203+
cpu: "100m"
204+
memory: "200Mi"
205+
limits:
206+
cpu: "200m"
207+
memory: "400Mi"

0 commit comments

Comments
 (0)