Skip to content

Commit 5405a3c

Browse files
committed
feat: node provisioner mode, basic compaction, basic aliyun/aws cloud vendor implementation
1 parent 07c7850 commit 5405a3c

24 files changed

+1105
-45
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,6 @@ go.work
2828
.DS_Store
2929

3030
cmd/__debug*
31-
cmd/*/__debug*
31+
cmd/*/__debug*
32+
33+
prompts/*

.vscode/settings.json

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,68 @@
11
{
22
"cSpell.words": [
3+
"Aliyun",
4+
"AMDCDNA",
5+
"AMDRDNA",
36
"apimachinery",
7+
"AWSGPU",
8+
"batchv",
9+
"CDNA",
410
"certificaterequests",
511
"certmanager",
612
"clientgoscheme",
13+
"cloudnative",
14+
"cloudprovider",
715
"clusterissuers",
816
"controllerutil",
17+
"corev",
918
"crds",
19+
"CUDA",
1020
"cycjimmy",
21+
"dylib",
22+
"Eventf",
1123
"finalizer",
1224
"Finalizers",
1325
"goconst",
1426
"golint",
1527
"gosec",
28+
"gpunode",
29+
"gpunodeclasses",
30+
"gpunodes",
31+
"gpupool",
32+
"gpupools",
1633
"healthz",
1734
"karpenter",
1835
"kubebuilder",
36+
"KUBECONFIG",
1937
"Kubelet",
2038
"kustomization",
2139
"metav",
2240
"metricsserver",
41+
"nindent",
2342
"nolint",
43+
"NVML",
44+
"omitempty",
2445
"onsi",
46+
"printcolumn",
2547
"prometheusagents",
2648
"prometheuses",
2749
"prometheusrules",
50+
"RDNA",
2851
"readyz",
52+
"runpod",
2953
"schedulingconfigtemplate",
3054
"schedulingconfigtemplates",
55+
"schedulingcorev",
3156
"subresource",
3257
"tensorfusion",
3358
"tensorfusionaiv",
3459
"tensorfusioncluster",
3560
"tensorfusionclusters",
61+
"Tera",
62+
"tflops",
3663
"Tmpl",
3764
"utilruntime",
38-
"webhookcorev"
65+
"webhookcorev",
66+
"Xlarge"
3967
]
4068
}

api/v1/gpunode_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,10 @@ const (
106106
)
107107

108108
type GPUNodeInfo struct {
109+
// +optional
110+
// only set when node is managed by TensorFusion
111+
InstanceID string `json:"instanceID,omitempty"`
112+
109113
Hostname string `json:"hostname,omitempty"`
110114
IP string `json:"ip,omitempty"`
111115
KernelVersion string `json:"kernelVersion,omitempty"`

api/v1/gpupool_types.go

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,21 +69,21 @@ type Oversubscription struct {
6969
// +kubebuilder:default=50
7070
// +kubebuilder:validation:Minimum=0
7171
// +kubebuilder:validation:Maximum=100
72-
VramExpandToHostMem int32 `json:"vramExpandToHostMem,omitempty"`
72+
VRAMExpandToHostMem int32 `json:"vramExpandToHostMem,omitempty"`
7373

7474
// the percentage of Host Disk appending to GPU VRAM, default to 70%
7575
// +optional
7676
// +kubebuilder:default=70
7777
// +kubebuilder:validation:Minimum=0
7878
// +kubebuilder:validation:Maximum=100
79-
VramExpandToHostDisk int32 `json:"vramExpandToHostDisk,omitempty"`
79+
VRAMExpandToHostDisk int32 `json:"vramExpandToHostDisk,omitempty"`
8080

81-
// The multipler of TFlops to oversell, default to 500%, indicates 5 times oversell
81+
// The multiplier of TFlops to oversell, default to 500%, indicates 5 times oversell
8282
// +optional
8383
// +kubebuilder:default=500
8484
// +kubebuilder:validation:Minimum=100
8585
// +kubebuilder:validation:Maximum=100000
86-
TflopsOversellRatio int32 `json:"tflopsOversellRatio,omitempty"`
86+
TFlopsOversellRatio int32 `json:"tflopsOversellRatio,omitempty"`
8787
}
8888

8989
type NodeManagerConfig struct {
@@ -171,14 +171,18 @@ type Requirement struct {
171171
Values []string `json:"values,omitempty"`
172172
}
173173

174-
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/zone;karpenter.sh/capacity-type
174+
// +kubebuilder:validation:Enum=node.kubernetes.io/instance-type;kubernetes.io/arch;kubernetes.io/os;topology.kubernetes.io/zone;karpenter.sh/capacity-type;tensor-fusion.ai/gpu-arch
175175
type NodeRequirementKey string
176176

177177
const (
178-
NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
179-
NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
180-
NodeRequirementKeyOS NodeRequirementKey = "kubernetes.io/os"
181-
NodeRequirementKeyZone NodeRequirementKey = "topology.kubernetes.io/zone"
178+
NodeRequirementKeyInstanceType NodeRequirementKey = "node.kubernetes.io/instance-type"
179+
NodeRequirementKeyArchitecture NodeRequirementKey = "kubernetes.io/arch"
180+
NodeRequirementKeyGPUArchitecture NodeRequirementKey = "tensor-fusion.ai/gpu-arch"
181+
182+
NodeRequirementKeyOS NodeRequirementKey = "kubernetes.io/os"
183+
NodeRequirementKeyZone NodeRequirementKey = "topology.kubernetes.io/zone"
184+
185+
// capacity-type is the charging method; it can be spot/preemptive or on-demand
182186
NodeRequirementKeyCapacityType NodeRequirementKey = "karpenter.sh/capacity-type"
183187
)
184188

@@ -363,7 +367,7 @@ type GPUPoolStatus struct {
363367
// ProvisioningStatus is to track the status of those outside GPU nodes.
364368
ProvisioningStatus PoolProvisioningStatus `json:"provisioningStatus"`
365369

366-
// when updating any component version or config, poolcontroller will perform rolling update.
370+
// when updating any component version or config, pool controller will perform rolling update.
367371
// the status will be updated periodically, default to 5s, progress will be 0-100.
368372
// when the progress is 100, the component version or config is fully updated.
369373
ComponentStatus PoolComponentStatus `json:"componentStatus"`

api/v1/tensorfusioncluster_types.go

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -101,22 +101,58 @@ type GPUPoolDefinition struct {
101101

102102
// ComputingVendorConfig defines the Cloud vendor connection such as AWS, GCP, Azure etc.
103103
type ComputingVendorConfig struct {
104-
Name string `json:"name,omitempty"` // Name of the computing vendor.
105-
Type string `json:"type,omitempty"` // Type of the computing vendor (e.g., aws, lambdalabs, gcp, azure, together.ai).
106-
AuthType string `json:"authType,omitempty"` // Authentication type (e.g., accessKey, serviceAccount).
104+
Name string `json:"name,omitempty"`
105+
106+
// support popular cloud providers
107+
Type ComputingVendorName `json:"type,omitempty"`
108+
109+
AuthType AuthTypeEnum `json:"authType,omitempty"` // Authentication type (e.g., accessKey, serviceAccount).
107110

108111
// +optional
112+
// +kubebuilder:default=true
109113
Enable *bool `json:"enable,omitempty"` // Enable or disable the computing vendor.
110114

111-
GPUNodeControllerType string `json:"gpuNodeControllerType,omitempty"` // Type of GPU node controller (e.g., asg, karpenter, native).
112-
Params ComputingVendorParams `json:"params,omitempty"`
115+
Params ComputingVendorParams `json:"params,omitempty"`
113116
}
114117

118+
// +kubebuilder:validation:Enum=accessKey;serviceAccountRole
119+
type AuthTypeEnum string
120+
121+
const (
122+
AuthTypeAccessKey AuthTypeEnum = "accessKey"
123+
AuthTypeServiceAccountRole AuthTypeEnum = "serviceAccountRole"
124+
)
125+
126+
// +kubebuilder:validation:Enum=aws;lambda-labs;gcp;azure;oracle-oci;ibm;openshift;vultr;together-ai;aliyun;nvidia;tencent;runpod
127+
type ComputingVendorName string
128+
129+
const (
130+
ComputingVendorAWS ComputingVendorName = "aws"
131+
ComputingVendorGCP ComputingVendorName = "gcp"
132+
ComputingVendorAzure ComputingVendorName = "azure"
133+
ComputingVendorOracle ComputingVendorName = "oracle-oci"
134+
ComputingVendorIBM ComputingVendorName = "ibm"
135+
ComputingVendorOpenShift ComputingVendorName = "openshift"
136+
ComputingVendorVultr ComputingVendorName = "vultr"
137+
ComputingVendorTogetherAI ComputingVendorName = "together-ai"
138+
ComputingVendorLambdaLabs ComputingVendorName = "lambda-labs"
139+
ComputingVendorAliyun ComputingVendorName = "aliyun"
140+
ComputingVendorNvidia ComputingVendorName = "nvidia"
141+
ComputingVendorTencent ComputingVendorName = "tencent"
142+
ComputingVendorRunPod ComputingVendorName = "runpod"
143+
)
144+
115145
type ComputingVendorParams struct {
116-
Region string `json:"region,omitempty"` // Region for the computing vendor.
117-
AccessKey string `json:"accessKey,omitempty"` // Access key for the computing vendor.
118-
SecretKey string `json:"secretKey,omitempty"` // Secret key for the computing vendor.
119-
IAMRole string `json:"iamRole,omitempty"` // IAM role for the computing vendor like AWS
146+
DefaultRegion string `json:"defaultRegion,omitempty"` // Region for the computing vendor.
147+
148+
// the secret of access key and secret key, must be mounted as env var or file path
149+
AccessKeyPath string `json:"accessKeyPath,omitempty"`
150+
SecretKeyPath string `json:"secretKeyPath,omitempty"`
151+
AccessKeyEnvVar string `json:"accessKeyEnvVar,omitempty"`
152+
SecretKeyEnvVar string `json:"secretKeyEnvVar,omitempty"`
153+
154+
// preferred IAM role since it's more secure
155+
IAMRole string `json:"iamRole,omitempty"`
120156
}
121157

122158
// StorageVendorConfig defines Postgres database with extensions for timeseries storage and other resource aggregation results, system events and diagnostics reports etc.

charts/tensor-fusion/templates/gpu-cost-config.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,15 @@ data:
3737
costPerHour: 1.64
3838
fp16TFlops: 312
3939
40+
- model: A10
41+
vendor: NVIDIA
42+
costPerHour: 0.9
43+
fp16TFlops: 125
44+
45+
# A10G has fewer CUDA cores than A10, but comes with RT cores for rendering use cases
4046
- model: A10G
4147
vendor: NVIDIA
42-
costPerHour: 0.4
48+
costPerHour: 0.75 # from lambda labs
4349
fp16TFlops: 125
4450
4551
- model: A40

cmd/operator/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ func main() {
199199
GPUPoolReconciler := &controller.GPUPoolReconciler{
200200
Client: mgr.GetClient(),
201201
Scheme: mgr.GetScheme(),
202+
Recorder: mgr.GetEventRecorderFor("GPUPool"),
202203
GpuPoolState: gpuPoolState,
203204
GpuNodeState: gpuNodeState,
204205
}

config/crd/bases/tensor-fusion.ai_gpunodes.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ spec:
224224
type: string
225225
hostname:
226226
type: string
227+
instanceID:
228+
description: only set when node is managed by TensorFusion
229+
type: string
227230
ip:
228231
type: string
229232
kernelVersion:

config/crd/bases/tensor-fusion.ai_gpupools.yaml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,8 @@ spec:
130130
properties:
131131
tflopsOversellRatio:
132132
default: 500
133-
description: The multipler of TFlops to oversell, default
134-
to 500%, indicates 5 times oversell
133+
description: The multiplier of TFlops to oversell, default to 500%,
indicates 5 times oversell
indicates 5 times oversell
135135
format: int32
136136
maximum: 100000
137137
minimum: 100
@@ -293,6 +293,7 @@ spec:
293293
- kubernetes.io/os
294294
- topology.kubernetes.io/zone
295295
- karpenter.sh/capacity-type
296+
- tensor-fusion.ai/gpu-arch
296297
type: string
297298
operator:
298299
default: In
@@ -343,6 +344,7 @@ spec:
343344
- kubernetes.io/os
344345
- topology.kubernetes.io/zone
345346
- karpenter.sh/capacity-type
347+
- tensor-fusion.ai/gpu-arch
346348
type: string
347349
operator:
348350
default: In
@@ -750,7 +752,7 @@ spec:
750752
type: string
751753
componentStatus:
752754
description: |-
753-
when updating any component version or config, poolcontroller will perform rolling update.
755+
when updating any component version or config, pool controller will perform rolling update.
754756
the status will be updated periodically, default to 5s, progress will be 0-100.
755757
when the progress is 100, the component version or config is fully updated.
756758
properties:

config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,25 +70,49 @@ spec:
7070
such as AWS, GCP, Azure etc.
7171
properties:
7272
authType:
73+
enum:
74+
- accessKey
75+
- serviceAccountRole
7376
type: string
7477
enable:
78+
default: true
7579
type: boolean
76-
gpuNodeControllerType:
77-
type: string
7880
name:
7981
type: string
8082
params:
8183
properties:
82-
accessKey:
84+
accessKeyEnvVar:
85+
type: string
86+
accessKeyPath:
87+
description: the secret of access key and secret key, must
88+
be mounted as env var or file path
89+
type: string
90+
defaultRegion:
8391
type: string
8492
iamRole:
93+
description: preferred IAM role since it's more secure
8594
type: string
86-
region:
95+
secretKeyEnvVar:
8796
type: string
88-
secretKey:
97+
secretKeyPath:
8998
type: string
9099
type: object
91100
type:
101+
description: support popular cloud providers
102+
enum:
103+
- aws
104+
- lambda-labs
105+
- gcp
106+
- azure
107+
- oracle-oci
108+
- ibm
109+
- openshift
110+
- vultr
111+
- together-ai
112+
- aliyun
113+
- nvidia
114+
- tencent
115+
- runpod
92116
type: string
93117
type: object
94118
dataPipelines:
@@ -204,8 +228,8 @@ spec:
204228
properties:
205229
tflopsOversellRatio:
206230
default: 500
207-
description: The multipler of TFlops to oversell,
208-
default to 500%, indicates 5 times oversell
231+
description: The multiplier of TFlops to oversell, default
to 500%, indicates 5 times oversell
to 500%, indicates 5 times oversell
209233
format: int32
210234
maximum: 100000
211235
minimum: 100
@@ -368,6 +392,7 @@ spec:
368392
- kubernetes.io/os
369393
- topology.kubernetes.io/zone
370394
- karpenter.sh/capacity-type
395+
- tensor-fusion.ai/gpu-arch
371396
type: string
372397
operator:
373398
default: In
@@ -418,6 +443,7 @@ spec:
418443
- kubernetes.io/os
419444
- topology.kubernetes.io/zone
420445
- karpenter.sh/capacity-type
446+
- tensor-fusion.ai/gpu-arch
421447
type: string
422448
operator:
423449
default: In

0 commit comments

Comments
 (0)