Skip to content

Commit 019495e

Browse files
authored
fix: optimize unit test (#289)
* fix: GPU usedBy field wrongly modified after update issue
* fix: unit test issues
* fix: revert to shared shm
1 parent d0d7525 commit 019495e

File tree

6 files changed

+35
-13
lines changed

6 files changed

+35
-13
lines changed

cmd/nodediscovery/main.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,9 @@ func createOrUpdateTensorFusionGPU(
265265
if gpu.Status.Available == nil {
266266
gpu.Status.Available = gpu.Status.Capacity.DeepCopy()
267267
}
268+
if gpu.Status.UsedBy == "" {
269+
gpu.Status.UsedBy = tfv1.UsedByTensorFusion
270+
}
268271
return k8sClient.Status().Patch(ctx, gpu, client.Merge)
269272
})
270273
if err != nil {

internal/controller/gpu_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ func (r *GPUReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
8888
return ctrl.Result{}, fmt.Errorf("node %s is not assigned to any pool", gpunode.Name)
8989
}
9090

91-
if gpu.Status.UsedBy == "" {
91+
if gpu.Status.UsedBy == "" && gpu.Status.UUID != "" {
9292
patch := client.MergeFrom(gpu.DeepCopy())
9393
gpu.Status.UsedBy = tfv1.UsedByTensorFusion
9494
if err := r.Status().Patch(ctx, gpu, patch); err != nil {

internal/controller/gpunode_controller.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,14 +131,18 @@ func (r *GPUNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
131131

132132
// Check if hypervisor is running well, if so, set as running status
133133
err = r.checkStatusAndUpdateVirtualCapacity(ctx, hypervisorName, node, poolObj)
134+
if errors.IsNotFound(err) {
135+
log.Info("Hypervisor pod not found, requeue", "hypervisorName", hypervisorName)
136+
return ctrl.Result{Requeue: true}, nil
137+
}
134138
return ctrl.Result{}, err
135139
}
136140

137141
func (r *GPUNodeReconciler) checkStatusAndUpdateVirtualCapacity(ctx context.Context, hypervisorName string, node *tfv1.GPUNode, poolObj *tfv1.GPUPool) error {
138142
pod := &corev1.Pod{}
139143
fetchErr := r.Get(ctx, client.ObjectKey{Name: hypervisorName, Namespace: utils.CurrentNamespace()}, pod)
140144
if fetchErr != nil {
141-
return fmt.Errorf("failed to get hypervisor pod: %w", fetchErr)
145+
return fetchErr
142146
}
143147

144148
// Reconcile GPUNode status with hypervisor pod status, when changed

internal/controller/gpupool_compaction_controller.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,14 +111,22 @@ func (r *GPUPoolCompactionReconciler) checkNodeCompaction(ctx context.Context, p
111111
if pool.Spec.NodeManagerConfig.ProvisioningMode != tfv1.ProvisioningModeAutoSelect {
112112
// not managed by Kubernetes, managed by TensorFusion, safe to terminate, and finalizer will cause K8S node and related cloud resources to be deleted
113113
gpuNodeClaimName := gpuNode.Labels[constants.ProvisionerLabelKey]
114+
if gpuNodeClaimName == "" {
115+
log.Info("skip existing nodes managed by other controller when compaction", "node", gpuNode.Name)
116+
continue
117+
}
114118
gpuNodeClaimObj := &tfv1.GPUNodeClaim{}
115119
if err := r.Get(ctx, client.ObjectKey{Name: gpuNodeClaimName}, gpuNodeClaimObj); err != nil {
120+
if errors.IsNotFound(err) {
121+
log.Info("skip existing nodes managed by other controller when compaction", "node", gpuNode.Name)
122+
continue
123+
}
116124
log.Error(err, "get gpuNodeClaim failed", "gpuNodeClaimName", gpuNodeClaimName)
117125
continue
118126
}
119127
// already deleting
120128
if !gpuNodeClaimObj.DeletionTimestamp.IsZero() {
121-
log.Info("[Warn] GPUNode deleting during compaction loop, this should not happen", "gpuNodeClaimName", gpuNodeClaimName)
129+
log.Info("[Warn] GPUNode deleting during compaction loop, this should not happen", "node", gpuNode.Name)
122130
continue
123131
}
124132
toDeleteGPUNodes = append(toDeleteGPUNodes, gpuNodeClaimName)

internal/controller/tensorfusionworkload_controller_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -201,10 +201,11 @@ var _ = Describe("TensorFusionWorkload Controller", func() {
201201
_ = checkWorkerPodCount(workload)
202202
checkWorkloadStatus(workload)
203203

204-
workload = &tfv1.TensorFusionWorkload{}
205-
Expect(k8sClient.Get(ctx, key, workload)).To(Succeed())
206-
workload.Spec.Replicas = ptr.To(int32(1))
207-
Expect(k8sClient.Update(ctx, workload)).To(Succeed())
204+
Eventually(func(g Gomega) {
205+
g.Expect(k8sClient.Get(ctx, key, workload)).To(Succeed())
206+
workload.Spec.Replicas = ptr.To(int32(1))
207+
g.Expect(k8sClient.Update(ctx, workload)).To(Succeed())
208+
}).Should(Succeed())
208209

209210
_ = checkWorkerPodCount(workload)
210211
checkWorkloadStatus(workload)

internal/utils/compose.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,11 @@ func AddTFDefaultClientConfBeforePatch(
216216
pod.Spec.Containers[injectContainerIndex].VolumeMounts = append(
217217
pod.Spec.Containers[injectContainerIndex].VolumeMounts,
218218
v1.VolumeMount{
219-
Name: constants.DataVolumeName,
220-
MountPath: constants.SharedMemDeviceName + constants.TFLibsVolumeMountPath,
221-
SubPathExpr: constants.TFDataPathWorkerExpr,
219+
Name: constants.DataVolumeName,
220+
MountPath: constants.SharedMemDeviceName,
221+
SubPath: constants.SharedMemMountSubPath,
222+
// + constants.TFLibsVolumeMountPath, SubPathExpr: constants.TFDataPathWorkerExpr,
223+
MountPropagation: ptr.To(v1.MountPropagationHostToContainer),
222224
})
223225

224226
envList := pod.Spec.Containers[injectContainerIndex].Env
@@ -591,9 +593,13 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon
591593
spec.Containers[0].VolumeMounts = append(
592594
spec.Containers[0].VolumeMounts,
593595
v1.VolumeMount{
594-
Name: constants.DataVolumeName,
595-
MountPath: constants.SharedMemDeviceName + constants.TFLibsVolumeMountPath,
596-
SubPathExpr: constants.TFDataPathWorkerExpr,
596+
Name: constants.DataVolumeName,
597+
MountPath: constants.SharedMemDeviceName,
598+
// TODO not working.
599+
// + constants.TFLibsVolumeMountPath
600+
// SubPathExpr: constants.TFDataPathWorkerExpr,
601+
SubPath: constants.SharedMemMountSubPath,
602+
MountPropagation: ptr.To(v1.MountPropagationHostToContainer),
597603
})
598604
spec.Containers[0].Env = append(spec.Containers[0].Env, v1.EnvVar{
599605
Name: constants.NvidiaVisibleAllDeviceEnv,

0 commit comments

Comments
 (0)