Skip to content

Commit 454f059

Browse files
authored
Merge pull request #162 from run-ai/erez/compute-domain-kwok
compute domain plugin for KWOK nodes
2 parents e902f4d + 2017adf commit 454f059

File tree

14 files changed

+545
-3
lines changed

14 files changed

+545
-3
lines changed

Dockerfile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@ COPY ./internal/status-updater/ ./internal/status-updater/
6565
COPY ./internal/kwok-dra-plugin/ ./internal/kwok-dra-plugin/
6666
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=kwok-dra-plugin
6767

68+
FROM common-builder AS kwok-compute-domain-dra-plugin-builder
69+
COPY ./cmd/kwok-compute-domain-dra-plugin/ ./cmd/kwok-compute-domain-dra-plugin/
70+
COPY ./pkg/compute-domain/ ./pkg/compute-domain/
71+
COPY ./internal/kwok-compute-domain-dra-plugin/ ./internal/kwok-compute-domain-dra-plugin/
72+
RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=kwok-compute-domain-dra-plugin
73+
6874
FROM common-builder AS preloader-builder
6975
COPY ./cmd/preloader/ ./cmd/preloader/
7076
RUN make build-preloader
@@ -114,6 +120,10 @@ FROM ubuntu AS kwok-dra-plugin
114120
COPY --from=kwok-dra-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-dra-plugin /bin/
115121
ENTRYPOINT ["/bin/kwok-dra-plugin"]
116122

123+
FROM ubuntu AS kwok-compute-domain-dra-plugin
124+
COPY --from=kwok-compute-domain-dra-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-compute-domain-dra-plugin /bin/
125+
ENTRYPOINT ["/bin/kwok-compute-domain-dra-plugin"]
126+
117127
FROM ubuntu AS compute-domain-controller
118128
COPY --from=compute-domain-controller-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/compute-domain-controller /bin/
119129
ENTRYPOINT ["/bin/compute-domain-controller"]

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
BUILD_DIR=$(shell pwd)/bin
2-
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker compute-domain-controller compute-domain-dra-plugin
2+
COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin kwok-compute-domain-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker compute-domain-controller compute-domain-dra-plugin
33

44
DOCKER_REPO_BASE=ghcr.io/run-ai/fake-gpu-operator
55
DOCKER_TAG?=0.0.0-dev
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
package main
2+
3+
import (
4+
"github.com/run-ai/fake-gpu-operator/internal/common/app"
5+
"github.com/run-ai/fake-gpu-operator/internal/common/config"
6+
kwokcomputedomaindraplugin "github.com/run-ai/fake-gpu-operator/internal/kwok-compute-domain-dra-plugin"
7+
)
8+
9+
func main() {
10+
requiredEnvVars := []string{kwokcomputedomaindraplugin.EnvFakeGpuOperatorNamespace}
11+
config.ValidateConfig(requiredEnvVars)
12+
13+
appRunner := app.NewAppRunner(&kwokcomputedomaindraplugin.KWOKComputeDomainDraPluginApp{})
14+
appRunner.Run()
15+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
{{- if .Values.kwokComputeDomainDraPlugin.enabled }}
2+
apiVersion: rbac.authorization.k8s.io/v1
3+
kind: ClusterRole
4+
metadata:
5+
name: kwok-compute-domain-dra-plugin
6+
rules:
7+
- apiGroups:
8+
- ""
9+
resources:
10+
- nodes
11+
verbs:
12+
- get
13+
- list
14+
- watch
15+
- apiGroups:
16+
- resource.k8s.io
17+
resources:
18+
- resourceslices
19+
verbs:
20+
- get
21+
- list
22+
- watch
23+
- create
24+
- update
25+
- delete
26+
{{- end }}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{{- if .Values.kwokComputeDomainDraPlugin.enabled }}
2+
apiVersion: rbac.authorization.k8s.io/v1
3+
kind: ClusterRoleBinding
4+
metadata:
5+
name: kwok-compute-domain-dra-plugin
6+
roleRef:
7+
apiGroup: rbac.authorization.k8s.io
8+
kind: ClusterRole
9+
name: kwok-compute-domain-dra-plugin
10+
subjects:
11+
- kind: ServiceAccount
12+
name: kwok-compute-domain-dra-plugin
13+
namespace: "{{ .Release.Namespace }}"
14+
{{- end }}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{{- if .Values.kwokComputeDomainDraPlugin.enabled }}
2+
apiVersion: apps/v1
3+
kind: Deployment
4+
metadata:
5+
name: kwok-compute-domain-dra-plugin
6+
labels:
7+
app: kwok-compute-domain-dra-plugin
8+
spec:
9+
selector:
10+
matchLabels:
11+
app: kwok-compute-domain-dra-plugin
12+
component: kwok-compute-domain-dra-plugin
13+
replicas: 1
14+
template:
15+
metadata:
16+
labels:
17+
app: kwok-compute-domain-dra-plugin
18+
component: kwok-compute-domain-dra-plugin
19+
spec:
20+
containers:
21+
- name: kwok-compute-domain-dra-plugin
22+
image: "{{ .Values.kwokComputeDomainDraPlugin.image.repository }}:{{ .Values.kwokComputeDomainDraPlugin.image.tag | default .Chart.AppVersion }}"
23+
imagePullPolicy: "{{ .Values.kwokComputeDomainDraPlugin.image.pullPolicy }}"
24+
resources:
25+
{{- toYaml .Values.kwokComputeDomainDraPlugin.resources | nindent 12 }}
26+
env:
27+
- name: FAKE_GPU_OPERATOR_NAMESPACE
28+
value: "{{ .Release.Namespace }}"
29+
restartPolicy: Always
30+
serviceAccountName: kwok-compute-domain-dra-plugin
31+
imagePullSecrets:
32+
- name: gcr-secret
33+
{{- end }}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{{- if .Values.kwokComputeDomainDraPlugin.enabled }}
2+
apiVersion: v1
3+
kind: ServiceAccount
4+
metadata:
5+
name: kwok-compute-domain-dra-plugin
6+
labels:
7+
app: kwok-compute-domain-dra-plugin
8+
{{- end }}

deploy/fake-gpu-operator/values.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,3 +207,17 @@ computeDomainDraPlugin:
207207
limits:
208208
cpu: "200m"
209209
memory: "400Mi"
210+
211+
kwokComputeDomainDraPlugin:
212+
enabled: false
213+
image:
214+
pullPolicy: Always
215+
repository: ghcr.io/run-ai/fake-gpu-operator/kwok-compute-domain-dra-plugin
216+
tag: ""
217+
resources:
218+
requests:
219+
cpu: "100m"
220+
memory: "200Mi"
221+
limits:
222+
cpu: "200m"
223+
memory: "400Mi"
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
package kwokcomputedomaindraplugin
2+
3+
import (
4+
"context"
5+
"log"
6+
7+
"github.com/spf13/viper"
8+
corev1 "k8s.io/api/core/v1"
9+
resourceapi "k8s.io/api/resource/v1"
10+
"k8s.io/apimachinery/pkg/runtime"
11+
"k8s.io/client-go/kubernetes"
12+
"k8s.io/klog/v2"
13+
ctrl "sigs.k8s.io/controller-runtime"
14+
15+
nodecontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-compute-domain-dra-plugin/controllers/node"
16+
)
17+
18+
// EnvFakeGpuOperatorNamespace is the configuration key (and environment
// variable name) under which the operator's namespace is looked up.
const EnvFakeGpuOperatorNamespace = "FAKE_GPU_OPERATOR_NAMESPACE"
21+
22+
type KWOKComputeDomainDraPluginAppConfiguration struct {
23+
FakeGpuOperatorNamespace string `mapstructure:"FAKE_GPU_OPERATOR_NAMESPACE" validate:"required"`
24+
}
25+
26+
type KWOKComputeDomainDraPluginApp struct {
27+
mgr ctrl.Manager
28+
stopCh chan struct{}
29+
}
30+
31+
func (app *KWOKComputeDomainDraPluginApp) Run() {
32+
ctx, cancel := context.WithCancel(context.Background())
33+
defer cancel()
34+
35+
go func() {
36+
<-app.stopCh
37+
cancel()
38+
}()
39+
40+
if err := app.mgr.Start(ctx); err != nil {
41+
log.Fatalf("Failed to start manager: %v", err)
42+
}
43+
}
44+
45+
func (app *KWOKComputeDomainDraPluginApp) Init(stopCh chan struct{}) {
46+
app.stopCh = stopCh
47+
48+
ctrl.SetLogger(klog.NewKlogr())
49+
50+
cfg, err := ctrl.GetConfig()
51+
if err != nil {
52+
log.Fatalf("Failed to get config: %v", err)
53+
}
54+
cfg.QPS = 100
55+
cfg.Burst = 200
56+
57+
scheme := runtime.NewScheme()
58+
if err := corev1.AddToScheme(scheme); err != nil {
59+
log.Fatalf("Failed to add corev1 to scheme: %v", err)
60+
}
61+
if err := resourceapi.AddToScheme(scheme); err != nil {
62+
log.Fatalf("Failed to add resource.k8s.io to scheme: %v", err)
63+
}
64+
65+
namespace := viper.GetString(EnvFakeGpuOperatorNamespace)
66+
app.mgr, err = ctrl.NewManager(cfg, ctrl.Options{
67+
Scheme: scheme,
68+
})
69+
if err != nil {
70+
log.Fatalf("Failed to create manager: %v", err)
71+
}
72+
73+
kubeClient, err := kubernetes.NewForConfig(cfg)
74+
if err != nil {
75+
log.Fatalf("Failed to create kubernetes client: %v", err)
76+
}
77+
78+
if err := nodecontroller.SetupWithManager(app.mgr, kubeClient, namespace); err != nil {
79+
log.Fatalf("Failed to setup Node controller: %v", err)
80+
}
81+
}
82+
83+
func (app *KWOKComputeDomainDraPluginApp) Name() string {
84+
return "KWOKComputeDomainDraPlugin"
85+
}
86+
87+
func (app *KWOKComputeDomainDraPluginApp) GetConfig() interface{} {
88+
var config KWOKComputeDomainDraPluginAppConfiguration
89+
return config
90+
}

0 commit comments

Comments
 (0)