Skip to content

Commit 897dbb7

Browse files
authored
feat: support local GPU mode (#57)
* feat: support local GPU mode * chore: rename the GPU field in TensorFusionConnectionSpec to GPUs; multi-GPU mode will be supported in the future * chore: define the QoSLevel enumeration
1 parent d5e9d87 commit 897dbb7

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1261
-229
lines changed

.vscode/launch.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
"env": {
1313
"ENABLE_WEBHOOKS": "false"
1414
},
15-
"program": "${workspaceFolder}/cmd/operator/main.go",
15+
"program": "${workspaceFolder}/cmd/main.go",
1616
},
1717
{
1818
"name": "Debug Discovery",
@@ -35,7 +35,7 @@
3535
"KUBECONFIG": "~/.kube/config-tf-dev",
3636
"ENABLE_WEBHOOKS": "false"
3737
},
38-
"program": "${workspaceFolder}/cmd/operator/main.go",
38+
"program": "${workspaceFolder}/cmd/main.go",
3939
},
4040
{
4141
"name": "Debug Demo Env Operator",
@@ -47,7 +47,7 @@
4747
"KUBECONFIG": "~/.kube/dev_us-east-1_demo",
4848
"ENABLE_WEBHOOKS": "false"
4949
},
50-
"program": "${workspaceFolder}/cmd/operator/main.go",
50+
"program": "${workspaceFolder}/cmd/main.go",
5151
},
5252
{
5353
"name": "Debug Test Code",

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,11 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
9393

9494
.PHONY: build
9595
build: manifests generate fmt vet ## Build manager binary.
96-
go build -o bin/manager cmd/operator/main.go
96+
go build -o bin/manager cmd/main.go
9797

9898
.PHONY: run
9999
run: manifests generate fmt vet ## Run a controller from your host.
100-
go run ./cmd/operator/main.go
100+
go run ./cmd/main.go
101101

102102
# If you wish to build the manager image targeting other platforms you can use the --platform flag.
103103
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.

PROJECT

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,4 +71,12 @@ resources:
7171
kind: SchedulingConfigTemplate
7272
path: github.com/NexusGPU/tensor-fusion-operator/api/v1
7373
version: v1
74+
- api:
75+
crdVersion: v1
76+
namespaced: true
77+
controller: true
78+
domain: tensor-fusion.ai
79+
kind: ClientProfile
80+
path: github.com/NexusGPU/tensor-fusion-operator/api/v1
81+
version: v1
7482
version: "3"

api/v1/clientprofile_types.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package v1
18+
19+
import (
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
)
22+
23+
// +kubebuilder:validation:Enum=low;medium;high;critical
24+
type QoSLevel string
25+
26+
const (
27+
QoSLow QoSLevel = "low"
28+
QoSMedium QoSLevel = "medium"
29+
QoSHigh QoSLevel = "high"
30+
QoSCritical QoSLevel = "critical"
31+
)
32+
33+
// ClientProfileSpec defines the desired state of ClientProfile.
34+
type ClientProfileSpec struct {
35+
// +optional
36+
PoolName string `json:"poolName,omitempty"`
37+
38+
// +optional
39+
Resources Resources `json:"resources,omitempty"`
40+
41+
// +optional
42+
// Qos defines the quality of service level for the client.
43+
Qos QoSLevel `json:"qos,omitempty"`
44+
45+
IsLocalGPU bool `json:"isLocalGPU"`
46+
}
47+
48+
// ClientProfileStatus defines the observed state of ClientProfile.
49+
type ClientProfileStatus struct {
50+
}
51+
52+
// +kubebuilder:object:root=true
53+
// +kubebuilder:subresource:status
54+
55+
// ClientProfile is the Schema for the clientprofiles API.
56+
type ClientProfile struct {
57+
metav1.TypeMeta `json:",inline"`
58+
metav1.ObjectMeta `json:"metadata,omitempty"`
59+
60+
Spec ClientProfileSpec `json:"spec,omitempty"`
61+
Status ClientProfileStatus `json:"status,omitempty"`
62+
}
63+
64+
// +kubebuilder:object:root=true
65+
66+
// ClientProfileList contains a list of ClientProfile.
67+
type ClientProfileList struct {
68+
metav1.TypeMeta `json:",inline"`
69+
metav1.ListMeta `json:"metadata,omitempty"`
70+
Items []ClientProfile `json:"items"`
71+
}
72+
73+
func init() {
74+
SchemeBuilder.Register(&ClientProfile{}, &ClientProfileList{})
75+
}

api/v1/gpunode_types.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ type GPUNodeAllocationDetails struct {
102102

103103
Requests GPUResourceUnit `json:"requests"`
104104
Limits GPUResourceUnit `json:"limits"`
105-
QoS string `json:"qos,omitempty"`
105+
QoS QoSLevel `json:"qos,omitempty"`
106106
}
107107

108108
// +kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying

api/v1/gpupool_types.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -260,14 +260,14 @@ type AlertConfig struct {
260260
// Define different QoS and their price.
261261
type QosConfig struct {
262262
Definitions []QosDefinition `json:"definitions,omitempty"`
263-
DefaultQoS string `json:"defaultQoS,omitempty"`
263+
DefaultQoS QoSLevel `json:"defaultQoS,omitempty"`
264264
Pricing []QosPricing `json:"pricing,omitempty"`
265265
}
266266

267267
type QosDefinition struct {
268-
Name string `json:"name,omitempty"`
269-
Description string `json:"description,omitempty"`
270-
Priority int `json:"priority,omitempty"` // Range from 1-100, reflects the scheduling priority when GPU is full and tasks are in the queue.
268+
Name QoSLevel `json:"name,omitempty"`
269+
Description string `json:"description,omitempty"`
270+
Priority int `json:"priority,omitempty"` // Range from 1-100, reflects the scheduling priority when GPU is full and tasks are in the queue.
271271
}
272272

273273
type GPUResourceUnit struct {
@@ -292,7 +292,7 @@ type GPUOrCPUResourceUnit struct {
292292
}
293293

294294
type QosPricing struct {
295-
Qos string `json:"qos,omitempty"`
295+
Qos QoSLevel `json:"qos,omitempty"`
296296

297297
Requests GPUResourcePricingUnit `json:"requests,omitempty"`
298298

api/v1/schedulingconfigtemplate_types.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,10 @@ type ScaleToZero struct {
152152
}
153153

154154
type AutoFreeze struct {
155-
Qos string `json:"qos,omitempty"`
156-
FreezeToMemTTL string `json:"freezeToMemTTL,omitempty"`
157-
FreezeToDiskTTL string `json:"freezeToDiskTTL,omitempty"`
158-
Enable *bool `json:"enable,omitempty"`
155+
Qos QoSLevel `json:"qos,omitempty"`
156+
FreezeToMemTTL string `json:"freezeToMemTTL,omitempty"`
157+
FreezeToDiskTTL string `json:"freezeToDiskTTL,omitempty"`
158+
Enable *bool `json:"enable,omitempty"`
159159
}
160160

161161
type SmartSchedulerModelInput struct {

api/v1/tensorfusionconnection_types.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,10 @@ type Resources struct {
3535
type TensorFusionConnectionSpec struct {
3636
PoolName string `json:"poolName"`
3737
Resources Resources `json:"resources"`
38+
39+
// +optional
40+
// GPUs lists the GPUs that are scheduled in advance when local GPU mode is used.
41+
GPUs []string `json:"gpu"`
3842
}
3943

4044
type TensorFusionConnectionPhase string
@@ -50,7 +54,7 @@ const (
5054
type TensorFusionConnectionStatus struct {
5155
Phase TensorFusionConnectionPhase `json:"phase"`
5256
ConnectionURL string `json:"connectionURL"`
53-
QosClass string `json:"qosClass,omitempty"`
57+
QoS QoSLevel `json:"qos,omitempty"`
5458
GPU string `json:"gpu,omitempty"`
5559
}
5660

api/v1/zz_generated.deepcopy.go

Lines changed: 95 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)