Skip to content

Commit 6f455a8

Browse files
authored
feat: Implement GPU pool and query component config from GPU pool (#28)
* feat: Implement GPU pool and query component config from GPU pool
* fix: remove `enable` in annotations, add tensor-fusion.ai/gpupool to specify gpupool
1 parent 86c149f commit 6f455a8

35 files changed

+409
-366
lines changed

.mirrord/mirrord.json

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,6 @@
88
"env": true
99
},
1010
"target": {
11-
"namespace": "tensor-fusion",
12-
"path": {
13-
"deployment": "tensor-fusion-operator-controller-manager",
14-
"container": "manager"
15-
}
11+
"namespace": "tensor-fusion"
1612
}
1713
}

api/v1/gpupool_types.go

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -157,27 +157,19 @@ type ComponentConfig struct {
157157
Client ClientConfig `json:"client,omitempty"`
158158
}
159159

160-
type WorkerConfig struct {
161-
Image string `json:"image,omitempty"` // "stable" | "latest" | "nightly"
162-
Port int `json:"port,omitempty"`
163-
HostNetwork *bool `json:"hostNetwork,omitempty"`
164-
WorkerPodTemplate runtime.RawExtension `json:"workerPodTemplate,omitempty"` // Mixin extra spec.
165-
}
166-
167160
type HypervisorConfig struct {
168161
Image string `json:"image,omitempty"`
169162
HypervisorDaemonSetTemplate runtime.RawExtension `json:"hypervisorDaemonSetTemplate,omitempty"` // Mixin extra spec.
170163
}
171164

172-
// TODO: client mutation webhook need TLS cert, need check using cert-manager or other ways
173-
type ClientConfig struct {
174-
Image string `json:"image,omitempty"`
175-
Protocol string `json:"protocol,omitempty"`
176-
Port int `json:"port,omitempty"`
165+
type WorkerConfig struct {
166+
PodTemplate runtime.RawExtension `json:"podTemplate"`
167+
}
177168

178-
// +optional
179-
// define how to inject the client pod
180-
PodTemplateMergePatch runtime.RawExtension `json:"podTemplateMergePatch,omitempty"` // Add other things to the original pod.
169+
type ClientConfig struct {
170+
OperatorEndpoint string `json:"operatorEndpoint"`
171+
PatchToPod runtime.RawExtension `json:"patchToPod"`
172+
PatchToContainer runtime.RawExtension `json:"patchToContainer"`
181173
}
182174

183175
// GPUPoolStatus defines the observed state of GPUPool.

api/v1/tensorfusionconnection_types.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ type Resources struct {
3333

3434
// TensorFusionConnectionSpec defines the desired state of TensorFusionConnection.
3535
type TensorFusionConnectionSpec struct {
36+
PoolName string `json:"poolName"`
3637
Resources Resources `json:"resources"`
3738
}
3839

api/v1/zz_generated.deepcopy.go

Lines changed: 3 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
apiVersion: tensor-fusion.ai/v1
2+
kind: GPUPool
3+
metadata:
4+
name: {{ include "tensor-fusion.fullname" . }}-gpupool-sample
5+
spec:
6+
componentConfig:
7+
worker:
8+
podTemplate:
9+
template:
10+
spec:
11+
terminationGracePeriodSeconds: 0
12+
runtimeClassName: nvidia
13+
volumes:
14+
- name: worker-sock
15+
hostPath:
16+
path: /tensor-fusion/worker/sock
17+
type: DirectoryOrCreate
18+
hostNetwork: true
19+
hostPID: true
20+
containers:
21+
- name: tensor-fusion-worker
22+
image: tensorfusion/tensor-fusion-worker:latest
23+
env:
24+
- name: POD_NAME
25+
valueFrom:
26+
fieldRef:
27+
apiVersion: v1
28+
fieldPath: metadata.name
29+
- name: TF_ENABLE_LOG
30+
value: '1'
31+
volumeMounts:
32+
- name: worker-sock
33+
mountPath: /tensor-fusion/worker/sock
34+
command:
35+
- /home/app/tensor-fusion-worker
36+
- -n
37+
- native
38+
- -p
39+
- '$(TENSOR_FUSION_WORKER_PORT)'
40+
- -a
41+
- '0x1129'
42+
- -l
43+
- /tensor-fusion/worker/sock/$(POD_NAME).sock
44+
client:
45+
operatorEndpoint: http://{{ include "tensor-fusion.fullname" . }}.{{ include "tensor-fusion.namespace" . }}:8080
46+
patchToContainer:
47+
volumeMounts:
48+
- mountPath: /tensor-fusion
49+
name: tf-libs
50+
env:
51+
- name: LD_PRELOAD
52+
value: /tensor-fusion/libcuda.so
53+
- name: TF_ENABLE_LOG
54+
value: '1'
55+
patchToPod:
56+
spec:
57+
volumes:
58+
- name: tf-libs
59+
emptyDir: {}
60+
initContainers:
61+
- name: inject-lib
62+
image: tensorfusion/tensor-fusion-client:latest
63+
command:
64+
- sh
65+
- -c
66+
- cp /home/app/*.so /tensor-fusion/ && cp nvidia-smi-linux-amd64-550.54.15 /tensor-fusion/nvidia-smi
67+
volumeMounts:
68+
- mountPath: /tensor-fusion
69+
name: tf-libs

charts/tensor-fusion/templates/tensor-fusion-config.yaml

Lines changed: 0 additions & 71 deletions
This file was deleted.

charts/tensor-fusion/templates/vector.yaml

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,23 @@ data:
3939
- prepare_metrics
4040
all_metrics: true
4141
metrics: []
42+
43+
prepare_controller_metrics:
44+
type: remap
45+
inputs: controller_metrics
46+
source: |
47+
.namespace = "tf"
48+
4249
sinks:
4350
sink_greptimedb_hypervisor_metrics:
4451
type: greptimedb_metrics
4552
inputs:
4653
- log_to_metric
4754
new_naming: false
48-
endpoint: {{ .Values.hypervisor.greptimedbEendpoint }}
55+
endpoint: {{ .Values.hypervisor.greptimedbEndpoint }}
4956
5057
sink_greptimedb_controller_metrics:
5158
type: prometheus_remote_write
5259
inputs:
53-
- controller_metrics
54-
endpoint: {{ .Values.controller.greptimedbEendpoint }}
60+
- prepare_controller_metrics
61+
endpoint: {{ .Values.controller.greptimedbEndpoint }}

charts/tensor-fusion/values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ serviceAccount:
2020
annotations: {}
2121

2222
hypervisor:
23-
greptimedbEendpoint: greptimedb-standalone.greptimedb.svc.cluster.local:4001
23+
greptimedbEndpoint: greptimedb-standalone.greptimedb.svc.cluster.local:4001
2424
image:
2525
repository: tensorfusion/tensor-fusion-hypervisor
2626
# Overrides the image tag whose default is the chart appVersion.
@@ -51,7 +51,7 @@ controller:
5151
tolerations: []
5252
affinity: {}
5353

54-
greptimedbEendpoint: http://greptimedb-standalone.greptimedb.svc.cluster.local:4000/v1/prometheus/write?db=public
54+
greptimedbEndpoint: http://greptimedb-standalone.greptimedb.svc.cluster.local:4000/v1/prometheus/write?db=public
5555
admissionWebhooks:
5656
failurePolicy: Fail
5757
secretName: tensor-fusion-webhook-secret

cmd/main.go

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,13 @@ import (
2626
// to ensure that exec-entrypoint and run can make use of them.
2727
_ "k8s.io/client-go/plugin/pkg/client/auth"
2828

29-
tensorfusionaiv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
29+
tfv1 "github.com/NexusGPU/tensor-fusion-operator/api/v1"
3030
"github.com/NexusGPU/tensor-fusion-operator/internal/config"
3131
"github.com/NexusGPU/tensor-fusion-operator/internal/controller"
3232
"github.com/NexusGPU/tensor-fusion-operator/internal/scheduler"
3333
"github.com/NexusGPU/tensor-fusion-operator/internal/server"
3434
"github.com/NexusGPU/tensor-fusion-operator/internal/server/router"
3535
webhookcorev1 "github.com/NexusGPU/tensor-fusion-operator/internal/webhook/v1"
36-
"github.com/NexusGPU/tensor-fusion-operator/internal/worker"
3736
"k8s.io/apimachinery/pkg/runtime"
3837
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3938
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
@@ -55,7 +54,7 @@ var (
5554
func init() {
5655
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
5756

58-
utilruntime.Must(tensorfusionaiv1.AddToScheme(scheme))
57+
utilruntime.Must(tfv1.AddToScheme(scheme))
5958
// +kubebuilder:scaffold:scheme
6059
}
6160

@@ -65,10 +64,8 @@ func main() {
6564
var probeAddr string
6665
var secureMetrics bool
6766
var enableHTTP2 bool
68-
var configFile string
6967
var tlsOpts []func(*tls.Config)
7068

71-
flag.StringVar(&configFile, "config", "/etc/tensor-fusion/config.yaml", "Config file of tensor-fusion-operator")
7269
flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
7370
"Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
7471
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
@@ -154,21 +151,14 @@ func main() {
154151
}
155152

156153
ctx := context.Background()
157-
config, err := config.LoadConfig(configFile)
158-
if os.IsNotExist(err) {
159-
setupLog.Info("config file is not exists, use default config", "configFile", configFile)
160-
} else if err != nil {
161-
setupLog.Error(err, "unable to load config", "configFile", configFile, "err", err)
162-
os.Exit(1)
163-
}
154+
gpuPoolState := config.NewGpuPoolStateImpl()
155+
164156
scheduler := scheduler.NewNaiveScheduler()
165157
if err = (&controller.TensorFusionConnectionReconciler{
166-
Client: mgr.GetClient(),
167-
Scheme: mgr.GetScheme(),
168-
Scheduler: scheduler,
169-
WorkerGenerator: &worker.WorkerGenerator{
170-
WorkerConfig: &config.Worker,
171-
},
158+
Client: mgr.GetClient(),
159+
Scheme: mgr.GetScheme(),
160+
Scheduler: scheduler,
161+
GpuPoolState: gpuPoolState,
172162
}).SetupWithManager(mgr); err != nil {
173163
setupLog.Error(err, "unable to create controller", "controller", "TensorFusionConnection")
174164
os.Exit(1)
@@ -185,7 +175,7 @@ func main() {
185175

186176
// nolint:goconst
187177
if os.Getenv("ENABLE_WEBHOOKS") != "false" {
188-
if err = webhookcorev1.SetupPodWebhookWithManager(mgr, &config.PodMutation); err != nil {
178+
if err = webhookcorev1.SetupPodWebhookWithManager(mgr, gpuPoolState); err != nil {
189179
setupLog.Error(err, "unable to create webhook", "webhook", "Pod")
190180
os.Exit(1)
191181
}
@@ -199,8 +189,9 @@ func main() {
199189
os.Exit(1)
200190
}
201191
if err = (&controller.GPUPoolReconciler{
202-
Client: mgr.GetClient(),
203-
Scheme: mgr.GetScheme(),
192+
Client: mgr.GetClient(),
193+
Scheme: mgr.GetScheme(),
194+
GpuPoolState: gpuPoolState,
204195
}).SetupWithManager(mgr); err != nil {
205196
setupLog.Error(err, "unable to create controller", "controller", "GPUPool")
206197
os.Exit(1)

config/crd/bases/tensor-fusion.ai_gpupools.yaml

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -89,16 +89,18 @@ spec:
8989
properties:
9090
client:
9191
properties:
92-
image:
92+
operatorEndpoint:
9393
type: string
94-
podTemplateMergePatch:
95-
description: define how to inject the client pod
94+
patchToContainer:
9695
type: object
9796
x-kubernetes-preserve-unknown-fields: true
98-
port:
99-
type: integer
100-
protocol:
101-
type: string
97+
patchToPod:
98+
type: object
99+
x-kubernetes-preserve-unknown-fields: true
100+
required:
101+
- operatorEndpoint
102+
- patchToContainer
103+
- patchToPod
102104
type: object
103105
hypervisor:
104106
properties:
@@ -110,15 +112,11 @@ spec:
110112
type: object
111113
worker:
112114
properties:
113-
hostNetwork:
114-
type: boolean
115-
image:
116-
type: string
117-
port:
118-
type: integer
119-
workerPodTemplate:
115+
podTemplate:
120116
type: object
121117
x-kubernetes-preserve-unknown-fields: true
118+
required:
119+
- podTemplate
122120
type: object
123121
type: object
124122
nodeManagerConfig:

0 commit comments

Comments
 (0)