fix: optimize unit test (#289)

Code2Life · web-flow · commit 019495e56b16 · 2025-07-24T16:38:56.000+08:00
* fix: GPU usedBy field wrongly modified after update

* fix: unit test issues

* fix: revert to shared shm
diff --git a/cmd/nodediscovery/main.go b/cmd/nodediscovery/main.go
@@ -265,6 +265,9 @@ func createOrUpdateTensorFusionGPU(
 		if gpu.Status.Available == nil {
 			gpu.Status.Available = gpu.Status.Capacity.DeepCopy()
 		}
+		if gpu.Status.UsedBy == "" {
+			gpu.Status.UsedBy = tfv1.UsedByTensorFusion
+		}
 		return k8sClient.Status().Patch(ctx, gpu, client.Merge)
 	})
 	if err != nil {
diff --git a/internal/controller/gpu_controller.go b/internal/controller/gpu_controller.go
@@ -88,7 +88,7 @@ func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
 		return ctrl.Result{}, fmt.Errorf("node %s is not assigned to any pool", gpunode.Name)
 	}
 
-	if gpu.Status.UsedBy == "" {
+	if gpu.Status.UsedBy == "" && gpu.Status.UUID != "" {
 		patch := client.MergeFrom(gpu.DeepCopy())
 		gpu.Status.UsedBy = tfv1.UsedByTensorFusion
 		if err := r.Status().Patch(ctx, gpu, patch); err != nil {
diff --git a/internal/controller/gpunode_controller.go b/internal/controller/gpunode_controller.go
@@ -131,14 +131,18 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
 
 	// Check if hypervisor is running well, if so, set as running status
 	err = r.checkStatusAndUpdateVirtualCapacity(ctx, hypervisorName, node, poolObj)
+	if errors.IsNotFound(err) {
+		log.Info("Hypervisor pod not found, requeue", "hypervisorName", hypervisorName)
+		return ctrl.Result{Requeue: true}, nil
+	}
 	return ctrl.Result{}, err
 }
 
 func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Context, hypervisorName string, node *tfv1.GPUNode, poolObj *tfv1.GPUPool) error {
 	pod := &corev1.Pod{}
 	fetchErr := r.Get(ctx, client.ObjectKey{Name: hypervisorName, Namespace: utils.CurrentNamespace()}, pod)
 	if fetchErr != nil {
-		return fmt.Errorf("failed to get hypervisor pod: %w", fetchErr)
+		return fetchErr
 	}
 
 	// Reconcile GPUNode status with hypervisor pod status, when changed
diff --git a/internal/controller/gpupool_compaction_controller.go b/internal/controller/gpupool_compaction_controller.go
@@ -111,14 +111,22 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
 			if pool.Spec.NodeManagerConfig.ProvisioningMode != tfv1.ProvisioningModeAutoSelect {
 				// not managed by Kubernetes, managed by TensorFusion, safe to terminate, and finalizer will cause K8S node and related cloud resources to be deleted
 				gpuNodeClaimName := gpuNode.Labels[constants.ProvisionerLabelKey]
+				if gpuNodeClaimName == "" {
+					log.Info("skip existing nodes managed by other controller when compaction", "node", gpuNode.Name)
+					continue
+				}
 				gpuNodeClaimObj := &tfv1.GPUNodeClaim{}
 				if err := r.Get(ctx, client.ObjectKey{Name: gpuNodeClaimName}, gpuNodeClaimObj); err != nil {
+					if errors.IsNotFound(err) {
+						log.Info("skip existing nodes managed by other controller when compaction", "node", gpuNode.Name)
+						continue
+					}
 					log.Error(err, "get gpuNodeClaim failed", "gpuNodeClaimName", gpuNodeClaimName)
 					continue
 				}
 				// already deleting
 				if !gpuNodeClaimObj.DeletionTimestamp.IsZero() {
-					log.Info("[Warn] GPUNode deleting during compaction loop, this should not happen", "gpuNodeClaimName", gpuNodeClaimName)
+					log.Info("[Warn] GPUNode deleting during compaction loop, this should not happen", "node", gpuNode.Name)
 					continue
 				}
 				toDeleteGPUNodes = append(toDeleteGPUNodes, gpuNodeClaimName)
diff --git a/internal/controller/tensorfusionworkload_controller_test.go b/internal/controller/tensorfusionworkload_controller_test.go
@@ -201,10 +201,11 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
 			_ = checkWorkerPodCount(workload)
 			checkWorkloadStatus(workload)
 
-			workload = &tfv1.TensorFusionWorkload{}
-			Expect(k8sClient.Get(ctx, key, workload)).To(Succeed())
-			workload.Spec.Replicas = ptr.To(int32(1))
-			Expect(k8sClient.Update(ctx, workload)).To(Succeed())
+			Eventually(func(g Gomega) {
+				g.Expect(k8sClient.Get(ctx, key, workload)).To(Succeed())
+				workload.Spec.Replicas = ptr.To(int32(1))
+				g.Expect(k8sClient.Update(ctx, workload)).To(Succeed())
+			}).Should(Succeed())
 
 			_ = checkWorkerPodCount(workload)
 			checkWorkloadStatus(workload)
diff --git a/internal/utils/compose.go b/internal/utils/compose.go
@@ -216,9 +216,11 @@ func AddTFDefaultClientConfBeforePatch(
 			pod.Spec.Containers[injectContainerIndex].VolumeMounts = append(
 				pod.Spec.Containers[injectContainerIndex].VolumeMounts,
 				v1.VolumeMount{
-					Name:        constants.DataVolumeName,
-					MountPath:   constants.SharedMemDeviceName + constants.TFLibsVolumeMountPath,
-					SubPathExpr: constants.TFDataPathWorkerExpr,
+					Name:      constants.DataVolumeName,
+					MountPath: constants.SharedMemDeviceName,
+					SubPath:   constants.SharedMemMountSubPath,
+					// Previous settings, kept for reference: MountPath suffix "+ constants.TFLibsVolumeMountPath", SubPathExpr: constants.TFDataPathWorkerExpr.
+					MountPropagation: ptr.To(v1.MountPropagationHostToContainer),
 				})
 
 			envList := pod.Spec.Containers[injectContainerIndex].Env
@@ -591,9 +593,13 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon
 	spec.Containers[0].VolumeMounts = append(
 		spec.Containers[0].VolumeMounts,
 		v1.VolumeMount{
-			Name:        constants.DataVolumeName,
-			MountPath:   constants.SharedMemDeviceName + constants.TFLibsVolumeMountPath,
-			SubPathExpr: constants.TFDataPathWorkerExpr,
+			Name:      constants.DataVolumeName,
+			MountPath: constants.SharedMemDeviceName,
+			// TODO: the previous mount settings below are not working; kept here for reference.
+			//   MountPath suffix: + constants.TFLibsVolumeMountPath
+			//   SubPathExpr: constants.TFDataPathWorkerExpr,
+			SubPath:          constants.SharedMemMountSubPath,
+			MountPropagation: ptr.To(v1.MountPropagationHostToContainer),
 		})
 	spec.Containers[0].Env = append(spec.Containers[0].Env, v1.EnvVar{
 		Name:  constants.NvidiaVisibleAllDeviceEnv,

Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R`
`88`	`88`	`return ctrl.Result{}, fmt.Errorf("node %s is not assigned to any pool", gpunode.Name)`
`89`	`89`	`}`
`90`	`90`
`91`		`- if gpu.Status.UsedBy == "" {`
	`91`	`+ if gpu.Status.UsedBy == "" && gpu.Status.UUID != "" {`
`92`	`92`	`patch := client.MergeFrom(gpu.DeepCopy())`
`93`	`93`	`gpu.Status.UsedBy = tfv1.UsedByTensorFusion`
`94`	`94`	`if err := r.Status().Patch(ctx, gpu, patch); err != nil {`