Skip to content

Commit 9476038

Browse files
authored
Merge pull request #153 from run-ai/erez/compute-domain-dra-plugin-implementation
Implement compute domain DRA plugin with state management
2 parents b943629 + fe0b112 commit 9476038

File tree

18 files changed

+1746
-36
lines changed

18 files changed

+1746
-36
lines changed

deploy/fake-gpu-operator/templates/compute-domain-dra-plugin/_helpers.tpl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ matchLabels:
1414

1515
{{- define "fake-gpu-operator.compute-domain-dra-plugin.common.podTemplate.metadata" }}
1616
annotations:
17-
checksum/topology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
17+
openshift.io/scc: hostmount-anyuid
1818
labels:
1919
app: compute-domain-dra-plugin
2020
component: compute-domain-dra-plugin

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ require (
2424
k8s.io/client-go v0.34.2
2525
k8s.io/dynamic-resource-allocation v0.34.2
2626
k8s.io/kubelet v0.34.2
27+
k8s.io/kubernetes v1.34.0
2728
sigs.k8s.io/controller-runtime v0.22.4
2829
sigs.k8s.io/dra-example-driver v0.2.0
2930
tags.cncf.io/container-device-interface v1.0.1

go.sum

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ cyphar.com/go-pathrs v0.2.1/go.mod h1:y8f1EMG7r+hCuFf/rXsKqMJrJAUoADZGNh5/vZPKcG
4040
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
4141
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
4242
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
43+
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
44+
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
4345
github.com/NVIDIA/k8s-dra-driver-gpu v0.0.0-20251205171057-ccbb55fda6ef h1:OWrjqrsBzJ2j0mPdIRBIXkcsEqYRMLKHvxt9kByDIfA=
4446
github.com/NVIDIA/k8s-dra-driver-gpu v0.0.0-20251205171057-ccbb55fda6ef/go.mod h1:L4pYXkm4uAAdbfCcD9s8BUimSLk6QpYRNMXIYw3js0Y=
4547
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
@@ -722,6 +724,8 @@ k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOP
722724
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
723725
k8s.io/kubelet v0.34.2 h1:Dl+1uh7xwJr70r+SHKyIpvu6XvzuoPu0uDIC4cqgJUs=
724726
k8s.io/kubelet v0.34.2/go.mod h1:RfwR03iuKeVV7Z1qD9XKH98c3tlPImJpQ3qHIW40htM=
727+
k8s.io/kubernetes v1.34.0 h1:NvUrwPAVB4W3mSOpJ/RtNGHWWYyUP/xPaX5rUSpzA0w=
728+
k8s.io/kubernetes v1.34.0/go.mod h1:iu+FhII+Oc/1gGWLJcer6wpyih441aNFHl7Pvm8yPto=
725729
k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y=
726730
k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
727731
rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8=

internal/compute-domain-dra-plugin/app.go

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,3 @@
1-
/*
2-
* Copyright 2025 The Kubernetes Authors.
3-
*
4-
* Licensed under the Apache License, Version 2.0 (the "License");
5-
* you may not use this file except in compliance with the License.
6-
* You may obtain a copy of the License at
7-
*
8-
* http://www.apache.org/licenses/LICENSE-2.0
9-
*
10-
* Unless required by applicable law or agreed to in writing, software
11-
* distributed under the License is distributed on an "AS IS" BASIS,
12-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
* See the License for the specific language governing permissions and
14-
* limitations under the License.
15-
*/
16-
171
package computedomaindraplugin
182

193
import (
@@ -35,6 +19,10 @@ import (
3519
"sigs.k8s.io/dra-example-driver/pkg/flags"
3620
)
3721

22+
const (
23+
// DriverPluginCheckpointFile is a JSON file name; presumably the file in which
// the plugin persists its prepared-claim state (its use is not visible in this
// hunk — confirm against the state/checkpoint code).
DriverPluginCheckpointFile = "computedomain-checkpoint.json"
24+
)
25+
3826
type Flags struct {
3927
kubeClientConfig flags.KubeClientConfig
4028
loggingConfig *flags.LoggingConfig
@@ -43,6 +31,7 @@ type Flags struct {
4331
cdiRoot string
4432
kubeletRegistrarDirectoryPath string
4533
kubeletPluginsDirectoryPath string
34+
healthcheckPort int
4635
}
4736

4837
type Config struct {
@@ -60,6 +49,7 @@ type AppConfig struct {
6049
CDIRoot string `mapstructure:"CDI_ROOT"`
6150
KubeletRegistrarDirectoryPath string `mapstructure:"KUBELET_REGISTRAR_DIRECTORY_PATH"`
6251
KubeletPluginsDirectoryPath string `mapstructure:"KUBELET_PLUGINS_DIRECTORY_PATH"`
52+
HealthcheckPort int `mapstructure:"HEALTHCHECK_PORT"`
6353
}
6454

6555
type ComputeDomainDRAPluginApp struct {
@@ -81,6 +71,7 @@ func (app *ComputeDomainDRAPluginApp) GetConfig() interface{} {
8171
CDIRoot: "/etc/cdi",
8272
KubeletRegistrarDirectoryPath: kubeletplugin.KubeletRegistryDir,
8373
KubeletPluginsDirectoryPath: kubeletplugin.KubeletPluginsDir,
74+
HealthcheckPort: -1,
8475
}
8576
}
8677
return app.config
@@ -131,6 +122,7 @@ func (app *ComputeDomainDRAPluginApp) runPlugin(ctx context.Context) error {
131122
cdiRoot: app.config.CDIRoot,
132123
kubeletRegistrarDirectoryPath: app.config.KubeletRegistrarDirectoryPath,
133124
kubeletPluginsDirectoryPath: app.config.KubeletPluginsDirectoryPath,
125+
healthcheckPort: app.config.HealthcheckPort,
134126
},
135127
coreclient: clientSets.Core,
136128
}
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
/*
2+
* Copyright 2025 The Kubernetes Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package computedomaindraplugin
18+
19+
import (
20+
"fmt"
21+
"os"
22+
"path/filepath"
23+
24+
"github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts"
25+
26+
cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
27+
cdiparser "tags.cncf.io/container-device-interface/pkg/parser"
28+
cdispec "tags.cncf.io/container-device-interface/specs-go"
29+
)
30+
31+
// Identifiers used when naming CDI specs and devices for the compute domain
// driver.
const (
32+
// computeDomainCDIVendor is the CDI vendor: the driver name prefixed with "k8s.".
computeDomainCDIVendor = "k8s." + consts.ComputeDomainDriverName
33+
// computeDomainCDIClass is the CDI class component.
computeDomainCDIClass = "computedomain"
34+
// computeDomainCDIKind is the CDI kind, composed as "<vendor>/<class>".
computeDomainCDIKind = computeDomainCDIVendor + "/" + computeDomainCDIClass
35+
36+
// computeDomainCDICommonDeviceName names the single device written by
// CreateCommonSpecFile.
computeDomainCDICommonDeviceName = "common"
37+
)
38+
39+
// ComputeDomainCDIHandler writes and removes CDI spec files for the compute
// domain driver and builds the CDI device names handed back for a claim.
type ComputeDomainCDIHandler struct {
40+
// cache is the CDI cache through which specs are written and removed.
cache *cdiapi.Cache
41+
// nvcdiDevice produces the container edits for a domain (see CreateDomainCDIDevice).
nvcdiDevice *computeDomainNvcdiDevice
42+
// deviceRoot is the "nvcdi" directory under the driver plugin path.
deviceRoot string
43+
// claimDevName is set to "channel" by the constructor.
// NOTE(review): not read anywhere in the visible code — confirm it is still needed.
claimDevName string
44+
}
45+
46+
func NewComputeDomainCDIHandler(config *Config) (*ComputeDomainCDIHandler, error) {
47+
cache, err := cdiapi.NewCache(
48+
cdiapi.WithSpecDirs(config.flags.cdiRoot),
49+
)
50+
if err != nil {
51+
return nil, fmt.Errorf("unable to create a new CDI cache: %w", err)
52+
}
53+
deviceRoot := filepath.Join(config.DriverPluginPath(), "nvcdi")
54+
nvcdiDevice, err := newComputeDomainNvcdiDevice(deviceRoot)
55+
if err != nil {
56+
return nil, fmt.Errorf("unable to initialize nvcdi device helper: %w", err)
57+
}
58+
59+
handler := &ComputeDomainCDIHandler{
60+
cache: cache,
61+
nvcdiDevice: nvcdiDevice,
62+
deviceRoot: deviceRoot,
63+
claimDevName: "channel",
64+
}
65+
66+
return handler, nil
67+
}
68+
69+
func (cdi *ComputeDomainCDIHandler) CreateCommonSpecFile() error {
70+
spec := &cdispec.Spec{
71+
Kind: computeDomainCDIKind,
72+
Devices: []cdispec.Device{
73+
{
74+
Name: computeDomainCDICommonDeviceName,
75+
ContainerEdits: cdispec.ContainerEdits{
76+
Env: []string{
77+
fmt.Sprintf("KUBERNETES_NODE_NAME=%s", os.Getenv("NODE_NAME")),
78+
fmt.Sprintf("DRA_RESOURCE_DRIVER_NAME=%s", consts.ComputeDomainDriverName),
79+
},
80+
},
81+
},
82+
},
83+
}
84+
85+
minVersion, err := cdispec.MinimumRequiredVersion(spec)
86+
if err != nil {
87+
return fmt.Errorf("failed to get minimum required CDI spec version: %v", err)
88+
}
89+
spec.Version = minVersion
90+
91+
specName, err := cdiapi.GenerateNameForTransientSpec(spec, computeDomainCDICommonDeviceName)
92+
if err != nil {
93+
return fmt.Errorf("failed to generate Spec name: %w", err)
94+
}
95+
96+
return cdi.cache.WriteSpec(spec, specName)
97+
}
98+
99+
func (cdi *ComputeDomainCDIHandler) CreateClaimSpecFile(claimUID string, devices ComputeDomainPreparedDevices) error {
100+
specName := cdiapi.GenerateTransientSpecName(computeDomainCDIVendor, computeDomainCDIClass, claimUID)
101+
102+
spec := &cdispec.Spec{
103+
Kind: computeDomainCDIKind,
104+
Devices: []cdispec.Device{},
105+
}
106+
107+
for _, device := range devices {
108+
claimEdits := cdiapi.ContainerEdits{
109+
ContainerEdits: &cdispec.ContainerEdits{
110+
Env: []string{
111+
fmt.Sprintf("COMPUTE_DOMAIN_DEVICE_%s_RESOURCE_CLAIM=%s", device.DeviceName, claimUID),
112+
},
113+
},
114+
}
115+
if device.ContainerEdits != nil {
116+
claimEdits.Append(device.ContainerEdits)
117+
}
118+
119+
cdiDevice := cdispec.Device{
120+
Name: fmt.Sprintf("%s-%s", claimUID, device.DeviceName),
121+
ContainerEdits: *claimEdits.ContainerEdits,
122+
}
123+
124+
spec.Devices = append(spec.Devices, cdiDevice)
125+
}
126+
127+
minVersion, err := cdiapi.MinimumRequiredVersion(spec)
128+
if err != nil {
129+
return fmt.Errorf("failed to get minimum required CDI spec version: %v", err)
130+
}
131+
spec.Version = minVersion
132+
133+
return cdi.cache.WriteSpec(spec, specName)
134+
}
135+
136+
func (cdi *ComputeDomainCDIHandler) DeleteClaimSpecFile(claimUID string) error {
137+
specName := cdiapi.GenerateTransientSpecName(computeDomainCDIVendor, computeDomainCDIClass, claimUID)
138+
return cdi.cache.RemoveSpec(specName)
139+
}
140+
141+
func (cdi *ComputeDomainCDIHandler) GetClaimDevices(claimUID string, devices []string) []string {
142+
cdiDevices := make([]string, 0, len(devices))
143+
for _, device := range devices {
144+
cdiDevice := cdiparser.QualifiedName(computeDomainCDIVendor, computeDomainCDIClass, fmt.Sprintf("%s-%s", claimUID, device))
145+
cdiDevices = append(cdiDevices, cdiDevice)
146+
}
147+
return cdiDevices
148+
}
149+
150+
func (cdi *ComputeDomainCDIHandler) CreateDomainCDIDevice(domainInfo *DomainInfo) (*cdiapi.ContainerEdits, error) {
151+
if domainInfo == nil {
152+
return nil, fmt.Errorf("domain info is required to create CDI device")
153+
}
154+
return cdi.nvcdiDevice.ContainerEdits(domainInfo)
155+
}

0 commit comments

Comments
 (0)