fix: add toggle for vector container on hypervisor (#271)

Code2Life · web-flow · commit e3359b940cb9 · 2025-07-08T10:44:43.000+08:00
diff --git a/api/v1/gpupool_types.go b/api/v1/gpupool_types.go
@@ -315,6 +315,9 @@ type HypervisorConfig struct {
 
 	// +optional
 	PodTemplate *runtime.RawExtension `json:"podTemplate,omitempty"`
+
+	// +optional
+	EnableVector bool `json:"enableVector,omitempty"`
 }
 
 type WorkerConfig struct {
diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml
@@ -212,6 +212,8 @@ spec:
                     type: object
                   hypervisor:
                     properties:
+                      enableVector:
+                        type: boolean
                       image:
                         type: string
                       podTemplate:
diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml
@@ -277,6 +277,8 @@ spec:
                               type: object
                             hypervisor:
                               properties:
+                                enableVector:
+                                  type: boolean
                                 image:
                                   type: string
                                 podTemplate:
diff --git a/config/crd/bases/tensor-fusion.ai_gpupools.yaml b/config/crd/bases/tensor-fusion.ai_gpupools.yaml
@@ -212,6 +212,8 @@ spec:
                     type: object
                   hypervisor:
                     properties:
+                      enableVector:
+                        type: boolean
                       image:
                         type: string
                       podTemplate:
diff --git a/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml b/config/crd/bases/tensor-fusion.ai_tensorfusionclusters.yaml
@@ -277,6 +277,8 @@ spec:
                               type: object
                             hypervisor:
                               properties:
+                                enableVector:
+                                  type: boolean
                                 image:
                                   type: string
                                 podTemplate:
diff --git a/internal/utils/compose.go b/internal/utils/compose.go
@@ -98,7 +98,11 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
 	// add full annotations
 	pod.Annotations[constants.TFLOPSLimitAnnotation] = tfInfo.Profile.Resources.Limits.Tflops.String()
 	pod.Annotations[constants.VRAMLimitAnnotation] = tfInfo.Profile.Resources.Limits.Vram.String()
-	pod.Annotations[constants.QoSLevelAnnotation] = string(tfInfo.Profile.Qos)
+	if tfInfo.Profile.Qos == "" {
+		pod.Annotations[constants.QoSLevelAnnotation] = string(tfv1.QoSMedium)
+	} else {
+		pod.Annotations[constants.QoSLevelAnnotation] = string(tfInfo.Profile.Qos)
+	}
 	pod.Annotations[constants.TFLOPSRequestAnnotation] = tfInfo.Profile.Resources.Requests.Tflops.String()
 	pod.Annotations[constants.VRAMRequestAnnotation] = tfInfo.Profile.Resources.Requests.Vram.String()
 	pod.Annotations[constants.GpuCountAnnotation] = fmt.Sprintf("%d", tfInfo.Profile.GPUCount)
@@ -127,7 +131,11 @@ func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl *v1.PodTemplate, wo
 	annotations[constants.VRAMLimitAnnotation] = res.Limits.Vram.String()
 	annotations[constants.TFLOPSRequestAnnotation] = res.Requests.Tflops.String()
 	annotations[constants.VRAMRequestAnnotation] = res.Requests.Vram.String()
-	annotations[constants.QoSLevelAnnotation] = string(workload.Spec.Qos)
+	if workload.Spec.Qos == "" {
+		annotations[constants.QoSLevelAnnotation] = string(tfv1.QoSMedium)
+	} else {
+		annotations[constants.QoSLevelAnnotation] = string(workload.Spec.Qos)
+	}
 
 	if workload.Spec.GPUCount > 0 {
 		annotations[constants.GpuCountAnnotation] = fmt.Sprintf("%d", workload.Spec.GPUCount)
@@ -256,8 +264,10 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo
 	spec.HostPID = true
 	spec.TerminationGracePeriodSeconds = constants.GracefulPeriodSeconds
 
+	enableVector := pool.Spec.ComponentConfig.Hypervisor != nil && pool.Spec.ComponentConfig.Hypervisor.EnableVector
+
 	// when no config or config is not valid, reset hypervisor&vector container
-	if len(spec.Containers) != 2 {
+	if enableVector && len(spec.Containers) != 2 {
 		spec.Containers = []v1.Container{
 			{
 				Name: constants.TFContainerNameHypervisor,
@@ -266,6 +276,12 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo
 				Name: constants.TFContainerVector,
 			},
 		}
+	} else if len(spec.Containers) != 1 {
+		spec.Containers = []v1.Container{
+			{
+				Name: constants.TFContainerNameHypervisor,
+			},
+		}
 	}
 
 	// add volumes of vector and configs
@@ -311,7 +327,10 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo
 	})
 
 	composeHypervisorContainer(spec, pool)
-	composeVectorContainer(spec, pool)
+
+	if enableVector {
+		composeVectorContainer(spec, pool)
+	}
 }
 
 func composeHypervisorContainer(spec *v1.PodSpec, pool *tfv1.GPUPool) {

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -315,6 +315,9 @@ type HypervisorConfig struct {` |
| 315 | 315 | |
| 316 | 316 | `// +optional` |
| 317 | 317 | ``PodTemplate *runtime.RawExtension `json:"podTemplate,omitempty"` `` |
| | 318 | `+` |
| | 319 | `+ // +optional` |
| | 320 | ``+ EnableVector bool `json:"enableVector,omitempty"` `` |
| 318 | 321 | `}` |
| 319 | 322 | |
| 320 | 323 | `type WorkerConfig struct {` |