Skip to content

Commit f1b261f

Browse files
authored
feat: support compute percent schedule, add soft/hard/shared computing isolation mode (#413)
* feat: support compute percent schedule, add soft/hard/shared computing isolation mode * fix: compute percent exclusive bug
1 parent bc273a9 commit f1b261f

24 files changed

+750
-37
lines changed

api/v1/tensorfusionconnection_types.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,11 @@ const (
3030

3131
type Resource struct {
3232
Tflops resource.Quantity `json:"tflops"`
33-
Vram resource.Quantity `json:"vram"`
33+
34+
// 0-100 percentage, mutually exclusive with TFLOPs
35+
ComputePercent resource.Quantity `json:"compute"`
36+
37+
Vram resource.Quantity `json:"vram"`
3438
}
3539

3640
type Resources struct {

api/v1/workloadprofile_types.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,11 @@ type WorkloadProfileSpec struct {
5757
// Defaults to false; indicates the workload's embedded worker is in the same process, soft-isolated
5858
SidecarWorker bool `json:"sidecarWorker,omitempty"`
5959

60+
// +optional
61+
// +kubebuilder:default=soft
62+
// How to isolate computing resources; must be one of `shared`, `soft`, or `hard`
63+
ComputeIsolation ComputingIsolationMode `json:"computeIsolation,omitempty"`
64+
6065
// +optional
6166
// GPUModel specifies the required GPU model (e.g., "A100", "H100")
6267
GPUModel string `json:"gpuModel,omitempty"`
@@ -79,6 +84,15 @@ type WorkloadProfileSpec struct {
7984
WorkerPodTemplate *v1.PodTemplateSpec `json:"workerPodTemplate,omitempty"`
8085
}
8186

87+
// +kubebuilder:validation:Enum=shared;soft;hard
88+
type ComputingIsolationMode string
89+
90+
const (
91+
ComputingIsolationModeShared = "shared"
92+
ComputingIsolationModeSoft = "soft"
93+
ComputingIsolationModeHard = "hard"
94+
)
95+
8296
func (t WorkloadProfileSpec) IsDynamicReplica() bool {
8397
return t.Replicas == nil
8498
}

api/v1/zz_generated.deepcopy.go

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/crds/tensor-fusion.ai_gpuresourcequotas.yaml

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,13 @@ spec:
6262
description: Default requests applied to workloads without explicit
6363
requests
6464
properties:
65+
compute:
66+
anyOf:
67+
- type: integer
68+
- type: string
69+
description: 0-100 percentage, mutually exclusive with TFLOPs
70+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
71+
x-kubernetes-int-or-string: true
6572
tflops:
6673
anyOf:
6774
- type: integer
@@ -75,13 +82,21 @@ spec:
7582
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
7683
x-kubernetes-int-or-string: true
7784
required:
85+
- compute
7886
- tflops
7987
- vram
8088
type: object
8189
defaultRequests:
8290
description: Default requests applied to workloads without explicit
8391
requests
8492
properties:
93+
compute:
94+
anyOf:
95+
- type: integer
96+
- type: string
97+
description: 0-100 percentage, mutually exclusive with TFLOPs
98+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
99+
x-kubernetes-int-or-string: true
85100
tflops:
86101
anyOf:
87102
- type: integer
@@ -95,6 +110,7 @@ spec:
95110
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
96111
x-kubernetes-int-or-string: true
97112
required:
113+
- compute
98114
- tflops
99115
- vram
100116
type: object
@@ -103,6 +119,13 @@ spec:
103119
type: integer
104120
maxLimits:
105121
properties:
122+
compute:
123+
anyOf:
124+
- type: integer
125+
- type: string
126+
description: 0-100 percentage, mutually exclusive with TFLOPs
127+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
128+
x-kubernetes-int-or-string: true
106129
tflops:
107130
anyOf:
108131
- type: integer
@@ -116,12 +139,20 @@ spec:
116139
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
117140
x-kubernetes-int-or-string: true
118141
required:
142+
- compute
119143
- tflops
120144
- vram
121145
type: object
122146
maxRequests:
123147
description: Maximum resources per workload
124148
properties:
149+
compute:
150+
anyOf:
151+
- type: integer
152+
- type: string
153+
description: 0-100 percentage, mutually exclusive with TFLOPs
154+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
155+
x-kubernetes-int-or-string: true
125156
tflops:
126157
anyOf:
127158
- type: integer
@@ -135,6 +166,7 @@ spec:
135166
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
136167
x-kubernetes-int-or-string: true
137168
required:
169+
- compute
138170
- tflops
139171
- vram
140172
type: object
@@ -154,6 +186,13 @@ spec:
154186
limits:
155187
description: Total limits for the namespace
156188
properties:
189+
compute:
190+
anyOf:
191+
- type: integer
192+
- type: string
193+
description: 0-100 percentage, mutually exclusive with TFLOPs
194+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
195+
x-kubernetes-int-or-string: true
157196
tflops:
158197
anyOf:
159198
- type: integer
@@ -167,6 +206,7 @@ spec:
167206
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
168207
x-kubernetes-int-or-string: true
169208
required:
209+
- compute
170210
- tflops
171211
- vram
172212
type: object
@@ -178,6 +218,13 @@ spec:
178218
requests:
179219
description: Total requests limits for the namespace
180220
properties:
221+
compute:
222+
anyOf:
223+
- type: integer
224+
- type: string
225+
description: 0-100 percentage, mutually exclusive with TFLOPs
226+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
227+
x-kubernetes-int-or-string: true
181228
tflops:
182229
anyOf:
183230
- type: integer
@@ -191,6 +238,7 @@ spec:
191238
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
192239
x-kubernetes-int-or-string: true
193240
required:
241+
- compute
194242
- tflops
195243
- vram
196244
type: object
@@ -281,6 +329,13 @@ spec:
281329
limits:
282330
description: Current limits usage
283331
properties:
332+
compute:
333+
anyOf:
334+
- type: integer
335+
- type: string
336+
description: 0-100 percentage, mutually exclusive with TFLOPs
337+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
338+
x-kubernetes-int-or-string: true
284339
tflops:
285340
anyOf:
286341
- type: integer
@@ -294,12 +349,20 @@ spec:
294349
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
295350
x-kubernetes-int-or-string: true
296351
required:
352+
- compute
297353
- tflops
298354
- vram
299355
type: object
300356
requests:
301357
description: Current requests usage
302358
properties:
359+
compute:
360+
anyOf:
361+
- type: integer
362+
- type: string
363+
description: 0-100 percentage, mutually exclusive with TFLOPs
364+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
365+
x-kubernetes-int-or-string: true
303366
tflops:
304367
anyOf:
305368
- type: integer
@@ -313,6 +376,7 @@ spec:
313376
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
314377
x-kubernetes-int-or-string: true
315378
required:
379+
- compute
316380
- tflops
317381
- vram
318382
type: object

charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,13 @@ spec:
6969
properties:
7070
available:
7171
properties:
72+
compute:
73+
anyOf:
74+
- type: integer
75+
- type: string
76+
description: 0-100 percentage, mutually exclusive with TFLOPs
77+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
78+
x-kubernetes-int-or-string: true
7279
tflops:
7380
anyOf:
7481
- type: integer
@@ -82,11 +89,19 @@ spec:
8289
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
8390
x-kubernetes-int-or-string: true
8491
required:
92+
- compute
8593
- tflops
8694
- vram
8795
type: object
8896
capacity:
8997
properties:
98+
compute:
99+
anyOf:
100+
- type: integer
101+
- type: string
102+
description: 0-100 percentage, mutually exclusive with TFLOPs
103+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
104+
x-kubernetes-int-or-string: true
90105
tflops:
91106
anyOf:
92107
- type: integer
@@ -100,6 +115,7 @@ spec:
100115
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
101116
x-kubernetes-int-or-string: true
102117
required:
118+
- compute
103119
- tflops
104120
- vram
105121
type: object

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,14 @@ spec:
145145
properties:
146146
limits:
147147
properties:
148+
compute:
149+
anyOf:
150+
- type: integer
151+
- type: string
152+
description: 0-100 percentage, mutually exclusive
153+
with TFLOPs
154+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
155+
x-kubernetes-int-or-string: true
148156
tflops:
149157
anyOf:
150158
- type: integer
@@ -158,11 +166,20 @@ spec:
158166
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
159167
x-kubernetes-int-or-string: true
160168
required:
169+
- compute
161170
- tflops
162171
- vram
163172
type: object
164173
requests:
165174
properties:
175+
compute:
176+
anyOf:
177+
- type: integer
178+
- type: string
179+
description: 0-100 percentage, mutually exclusive
180+
with TFLOPs
181+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
182+
x-kubernetes-int-or-string: true
166183
tflops:
167184
anyOf:
168185
- type: integer
@@ -176,6 +193,7 @@ spec:
176193
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
177194
x-kubernetes-int-or-string: true
178195
required:
196+
- compute
179197
- tflops
180198
- vram
181199
type: object

0 commit comments

Comments
 (0)