Skip to content

Commit 59495e5

Browse files
authored
fix: unit test issues (#302)
1 parent c14f67d commit 59495e5

File tree

4 files changed

+30
-27
lines changed

4 files changed

+30
-27
lines changed

cmd/nodediscovery/main.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -179,14 +179,16 @@ func main() {
179179
}
180180

181181
// Use proper patch-based update with retry on conflict
182-
func patchGPUNodeStatus(k8sClient client.Client, ctx context.Context, gpunode *tfv1.GPUNode, totalTFlops resource.Quantity, totalVRAM resource.Quantity, count int32, allDeviceIDs []string) error {
182+
func patchGPUNodeStatus(k8sClient client.Client, ctx context.Context,
183+
gpunode *tfv1.GPUNode, totalTFlops resource.Quantity, totalVRAM resource.Quantity,
184+
count int32, allDeviceIDs []string) error {
183185

184186
currentGPUNode := &tfv1.GPUNode{}
185187
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpunode), currentGPUNode); err != nil {
186188
return err
187189
}
188190
patch := client.MergeFrom(currentGPUNode.DeepCopy())
189-
updateGPUNodeStatus(&currentGPUNode.Status, totalTFlops, totalVRAM, int32(count), allDeviceIDs)
191+
updateGPUNodeStatus(&currentGPUNode.Status, totalTFlops, totalVRAM, count, allDeviceIDs)
190192
return k8sClient.Status().Patch(ctx, currentGPUNode, patch)
191193
}
192194

cmd/nodediscovery/main_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ func TestPatchGPUNodeStatus(t *testing.T) {
181181
assert.Equal(t, resource.MustParse("24Gi"), patchedNode.Status.TotalVRAM)
182182
assert.Equal(t, int32(6), patchedNode.Status.TotalGPUs)
183183
assert.Equal(t, int32(6), patchedNode.Status.ManagedGPUs)
184-
assert.Equal(t, []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5"}, patchedNode.Status.ManagedGPUDeviceIDs)
184+
assert.Equal(t, []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5"},
185+
patchedNode.Status.ManagedGPUDeviceIDs)
185186
// Verify existing phase was preserved
186187
assert.Equal(t, tfv1.TensorFusionGPUNodePhaseRunning, patchedNode.Status.Phase)
187188
},
@@ -368,7 +369,8 @@ func TestPatchGPUNodeStatus_Integration(t *testing.T) {
368369
for i, update := range updates {
369370
t.Run(fmt.Sprintf("update_%d", i+1), func(t *testing.T) {
370371
// Apply the patch
371-
err := patchGPUNodeStatus(k8sClient, ctx, gpuNode, update.totalTFlops, update.totalVRAM, update.count, update.allDeviceIDs)
372+
err := patchGPUNodeStatus(k8sClient, ctx, gpuNode, update.totalTFlops,
373+
update.totalVRAM, update.count, update.allDeviceIDs)
372374
assert.NoError(t, err, "Patch should succeed")
373375

374376
// Verify the update was applied

internal/controller/gpupool_node_provision.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -145,23 +145,23 @@ func (r *GPUPoolReconciler) reconcilePoolCapacityWithProvisioner(ctx context.Con
145145

146146
// lock the pool before the next node scale-up loop, add assumed scaling resources until all pending nodeClaims are running
147147
newCreatedNodes := map[string]tfv1.Resource{}
148-
for _, node := range gpuNodeParams {
149-
go func(node tfv1.GPUNodeClaimSpec) {
148+
for _, nodeClaim := range gpuNodeParams {
149+
go func(nodeClaim tfv1.GPUNodeClaimSpec) {
150150
defer wg.Done()
151151

152152
// Create the GPUNode custom resource immediately; the GPUNode controller will watch for the K8S node to become ready
153153
// Persist the status to GPUNode to avoid duplicated creation in next reconciliation
154154
// If the K8S node never becomes ready after some time, the GPUNode will be deleted; the Pool reconcile loop can then scale up and meet the capacity constraint again
155155

156-
costPerHour, pricingErr := provider.GetInstancePricing(node.InstanceType, node.CapacityType, node.Region)
156+
costPerHour, pricingErr := provider.GetInstancePricing(nodeClaim.InstanceType, nodeClaim.CapacityType, nodeClaim.Region)
157157
if pricingErr != nil {
158158
errList = append(errList, pricingErr)
159159
return
160160
}
161161

162162
gpuNodeClaimRes := &tfv1.GPUNodeClaim{
163163
ObjectMeta: metav1.ObjectMeta{
164-
Name: node.NodeName,
164+
Name: nodeClaim.NodeName,
165165
Labels: map[string]string{
166166
constants.LabelKeyOwner: pool.Name,
167167
constants.LabelKeyClusterOwner: cluster.Name,
@@ -174,19 +174,20 @@ func (r *GPUPoolReconciler) reconcilePoolCapacityWithProvisioner(ctx context.Con
174174
constants.PricingAnnotation: strconv.FormatFloat(costPerHour, 'f', 6, 64),
175175
},
176176
},
177-
Spec: node,
177+
Spec: nodeClaim,
178178
}
179179
_ = controllerutil.SetControllerReference(pool, gpuNodeClaimRes, r.Scheme)
180180
err := r.Create(ctx, gpuNodeClaimRes)
181181
if err != nil {
182182
errList = append(errList, err)
183183
return
184184
}
185-
newCreatedNodes[node.NodeName] = tfv1.Resource{
186-
Tflops: node.TFlopsOffered,
187-
Vram: node.VRAMOffered,
185+
log.Info("Created new GPUNode claim", "gpuNodeClaimName", nodeClaim.NodeName)
186+
newCreatedNodes[nodeClaim.NodeName] = tfv1.Resource{
187+
Tflops: nodeClaim.TFlopsOffered,
188+
Vram: nodeClaim.VRAMOffered,
188189
}
189-
}(node)
190+
}(nodeClaim)
190191
}
191192

192193
wg.Wait()

internal/controller/suite_test.go

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -506,24 +506,22 @@ func (c *TensorFusionEnv) AddMockGPU4ProvisionedNodes(gpuNodeClaimList *tfv1.GPU
506506
if err := k8sClient.Status().Update(ctx, latest); err != nil {
507507
return err
508508
}
509-
510-
// update GPUNode status to trigger node level reconcile, simulate node discovery job
511-
if gpuNode.Status.Phase == "" || gpuNode.Status.TotalGPUs == 0 {
512-
gpuNode.Status = tfv1.GPUNodeStatus{
513-
Phase: tfv1.TensorFusionGPUNodePhasePending,
514-
TotalGPUs: 1,
515-
ManagedGPUs: 1,
516-
TotalTFlops: gpuNodeClaim.Spec.TFlopsOffered,
517-
TotalVRAM: gpuNodeClaim.Spec.VRAMOffered,
518-
}
519-
if err := k8sClient.Status().Update(ctx, gpuNode); err != nil {
520-
return err
521-
}
522-
}
523509
return nil
524510
})
525511
Expect(err).Should(Succeed())
526512
}
513+
514+
// update GPUNode status to trigger node level reconcile, simulate node discovery job
515+
if gpuNode.Status.Phase == "" || gpuNode.Status.TotalGPUs == 0 {
516+
gpuNode.Status = tfv1.GPUNodeStatus{
517+
Phase: tfv1.TensorFusionGPUNodePhasePending,
518+
TotalGPUs: 1,
519+
ManagedGPUs: 1,
520+
TotalTFlops: gpuNodeClaim.Spec.TFlopsOffered,
521+
TotalVRAM: gpuNodeClaim.Spec.VRAMOffered,
522+
}
523+
Expect(k8sClient.Status().Update(ctx, gpuNode)).Should(Succeed())
524+
}
527525
}
528526
}
529527

0 commit comments

Comments
 (0)