Skip to content

Commit e3359b9

Browse files
authored
fix: add toggle for vector container on hypervisor (#271)
1 parent 92c0c07 commit e3359b9

File tree

6 files changed

+34
-4
lines changed

6 files changed

+34
-4
lines changed

api/v1/gpupool_types.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,9 @@ type HypervisorConfig struct {
315315

316316
// +optional
317317
PodTemplate *runtime.RawExtension `json:"podTemplate,omitempty"`
318+
319+
// +optional
320+
EnableVector bool `json:"enableVector,omitempty"`
318321
}
319322

320323
type WorkerConfig struct {

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ spec:
212212
type: object
213213
hypervisor:
214214
properties:
215+
enableVector:
216+
type: boolean
215217
image:
216218
type: string
217219
podTemplate:

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,8 @@ spec:
277277
type: object
278278
hypervisor:
279279
properties:
280+
enableVector:
281+
type: boolean
280282
image:
281283
type: string
282284
podTemplate:

config/crd/bases/tensor-fusion.ai_gpupools.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ spec:
212212
type: object
213213
hypervisor:
214214
properties:
215+
enableVector:
216+
type: boolean
215217
image:
216218
type: string
217219
podTemplate:

config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,8 @@ spec:
277277
type: object
278278
hypervisor:
279279
properties:
280+
enableVector:
281+
type: boolean
280282
image:
281283
type: string
282284
podTemplate:

internal/utils/compose.go

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,11 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
9898
// add full annotations
9999
pod.Annotations[constants.TFLOPSLimitAnnotation] = tfInfo.Profile.Resources.Limits.Tflops.String()
100100
pod.Annotations[constants.VRAMLimitAnnotation] = tfInfo.Profile.Resources.Limits.Vram.String()
101-
pod.Annotations[constants.QoSLevelAnnotation] = string(tfInfo.Profile.Qos)
101+
if tfInfo.Profile.Qos == "" {
102+
pod.Annotations[constants.QoSLevelAnnotation] = string(tfv1.QoSMedium)
103+
} else {
104+
pod.Annotations[constants.QoSLevelAnnotation] = string(tfInfo.Profile.Qos)
105+
}
102106
pod.Annotations[constants.TFLOPSRequestAnnotation] = tfInfo.Profile.Resources.Requests.Tflops.String()
103107
pod.Annotations[constants.VRAMRequestAnnotation] = tfInfo.Profile.Resources.Requests.Vram.String()
104108
pod.Annotations[constants.GpuCountAnnotation] = fmt.Sprintf("%d", tfInfo.Profile.GPUCount)
@@ -127,7 +131,11 @@ func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl *v1.PodTemplate, wo
127131
annotations[constants.VRAMLimitAnnotation] = res.Limits.Vram.String()
128132
annotations[constants.TFLOPSRequestAnnotation] = res.Requests.Tflops.String()
129133
annotations[constants.VRAMRequestAnnotation] = res.Requests.Vram.String()
130-
annotations[constants.QoSLevelAnnotation] = string(workload.Spec.Qos)
134+
if workload.Spec.Qos == "" {
135+
annotations[constants.QoSLevelAnnotation] = string(tfv1.QoSMedium)
136+
} else {
137+
annotations[constants.QoSLevelAnnotation] = string(workload.Spec.Qos)
138+
}
131139

132140
if workload.Spec.GPUCount > 0 {
133141
annotations[constants.GpuCountAnnotation] = fmt.Sprintf("%d", workload.Spec.GPUCount)
@@ -256,8 +264,10 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo
256264
spec.HostPID = true
257265
spec.TerminationGracePeriodSeconds = constants.GracefulPeriodSeconds
258266

267+
enableVector := pool.Spec.ComponentConfig.Hypervisor != nil && pool.Spec.ComponentConfig.Hypervisor.EnableVector
268+
259269
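// when there is no config or the config is not valid, reset the containers: hypervisor and vector when enableVector is set, hypervisor only otherwise
// NOTE(review): with enableVector true and exactly 2 containers, the first condition is false, so the `else if len(spec.Containers) != 1` branch still fires and replaces the spec with the hypervisor container alone before composeVectorContainer is called; the else-if probably needs a `!enableVector` guard (confirm)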
260-
if len(spec.Containers) != 2 {
270+
if enableVector && len(spec.Containers) != 2 {
261271
spec.Containers = []v1.Container{
262272
{
263273
Name: constants.TFContainerNameHypervisor,
@@ -266,6 +276,12 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo
266276
Name: constants.TFContainerVector,
267277
},
268278
}
279+
} else if len(spec.Containers) != 1 {
280+
spec.Containers = []v1.Container{
281+
{
282+
Name: constants.TFContainerNameHypervisor,
283+
},
284+
}
269285
}
270286

271287
// add volumes of vector and configs
@@ -311,7 +327,10 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo
311327
})
312328

313329
composeHypervisorContainer(spec, pool)
314-
composeVectorContainer(spec, pool)
330+
331+
if enableVector {
332+
composeVectorContainer(spec, pool)
333+
}
315334
}
316335

317336
func composeHypervisorContainer(spec *v1.PodSpec, pool *tfv1.GPUPool) {

0 commit comments

Comments
 (0)