fix: helm typo and patch gpu node bug (#301)

Code2Life · web-flow · commit c14f67d6bd2e · 2025-07-30T16:42:06.000+08:00
* fix: helm typo

* fix: node discovery patch gpu node status issue
diff --git a/api/v1/gpunode_types.go b/api/v1/gpunode_types.go
@@ -63,11 +63,15 @@ type GPUNodeStatus struct {
 	TotalTFlops resource.Quantity `json:"totalTFlops"`
 	TotalVRAM   resource.Quantity `json:"totalVRAM"`
 
-	VirtualTFlops resource.Quantity `json:"virtualTFlops"`
-	VirtualVRAM   resource.Quantity `json:"virtualVRAM"`
+	// +optional
+	VirtualTFlops resource.Quantity `json:"virtualTFlops,omitempty"`
+	// +optional
+	VirtualVRAM resource.Quantity `json:"virtualVRAM,omitempty"`
 
-	AvailableTFlops resource.Quantity `json:"availableTFlops"`
-	AvailableVRAM   resource.Quantity `json:"availableVRAM"`
+	// +optional
+	AvailableTFlops resource.Quantity `json:"availableTFlops,omitempty"`
+	// +optional
+	AvailableVRAM resource.Quantity `json:"availableVRAM,omitempty"`
 
 	// +optional
 	VirtualAvailableTFlops *resource.Quantity `json:"virtualAvailableTFlops,omitempty"`
diff --git a/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml b/charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml
@@ -261,15 +261,11 @@ spec:
                 pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
                 x-kubernetes-int-or-string: true
             required:
-            - availableTFlops
-            - availableVRAM
             - managedGPUs
             - phase
             - totalGPUs
             - totalTFlops
             - totalVRAM
-            - virtualTFlops
-            - virtualVRAM
             type: object
         type: object
     served: true
diff --git a/cmd/nodediscovery/main.go b/cmd/nodediscovery/main.go
@@ -169,29 +169,27 @@ func main() {
 		availableVRAM.Add(gpu.Status.Available.Vram)
 	}
 
-	// Use proper patch-based update with retry on conflict
 	err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
-		// Get the latest version of the resource
-		currentGPUNode := &tfv1.GPUNode{}
-		if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpunode), currentGPUNode); err != nil {
-			return err
-		}
-
-		// Create a patch from the original to the desired state
-		patch := client.MergeFrom(currentGPUNode.DeepCopy())
-
-		// Update status fields conditionally
-		updateGPUNodeStatus(&currentGPUNode.Status, totalTFlops, totalVRAM, int32(count), allDeviceIDs)
-
-		// Apply the patch using the status subresource
-		return k8sClient.Status().Patch(ctx, currentGPUNode, patch)
+		return patchGPUNodeStatus(k8sClient, ctx, gpunode, totalTFlops, totalVRAM, int32(count), allDeviceIDs)
 	})
 	if err != nil {
-		ctrl.Log.Error(err, "failed to update status of GPUNode after retries")
+		ctrl.Log.Error(err, "failed to patch status of GPUNode after retries")
 		os.Exit(1)
 	}
 }
 
+// patchGPUNodeStatus re-fetches the GPUNode and merge-patches its status subresource; main wraps the call in retry.RetryOnConflict.
+func patchGPUNodeStatus(k8sClient client.Client, ctx context.Context, gpunode *tfv1.GPUNode, totalTFlops resource.Quantity, totalVRAM resource.Quantity, count int32, allDeviceIDs []string) error {
+
+	currentGPUNode := &tfv1.GPUNode{}
+	if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpunode), currentGPUNode); err != nil {
+		return err
+	}
+	patch := client.MergeFrom(currentGPUNode.DeepCopy())
+	updateGPUNodeStatus(&currentGPUNode.Status, totalTFlops, totalVRAM, int32(count), allDeviceIDs)
+	return k8sClient.Status().Patch(ctx, currentGPUNode, patch)
+}
+
 func createOrUpdateTensorFusionGPU(
 	k8sClient client.Client, ctx context.Context, k8sNodeName string, gpunode *tfv1.GPUNode,
 	uuid string, deviceName string, memInfo nvml.Memory_v2, tflops resource.Quantity) *tfv1.GPU {
diff --git a/cmd/nodediscovery/main_test.go b/cmd/nodediscovery/main_test.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"fmt"
 	"testing"
 	"time"
 
@@ -12,6 +13,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 )
 
@@ -106,3 +108,292 @@ func TestGPUControllerReference(t *testing.T) {
 	assert.True(t, metav1.IsControlledBy(gpu, newGpuNode))
 	assert.False(t, metav1.IsControlledBy(gpu, gpuNode))
 }
+
+func TestPatchGPUNodeStatus(t *testing.T) {
+	tests := []struct {
+		name           string
+		setupGPUNode   func() *tfv1.GPUNode
+		totalTFlops    resource.Quantity
+		totalVRAM      resource.Quantity
+		count          int32
+		allDeviceIDs   []string
+		expectError    bool
+		validateResult func(t *testing.T, originalNode, patchedNode *tfv1.GPUNode)
+	}{
+		{
+			name: "successful patch with empty phase",
+			setupGPUNode: func() *tfv1.GPUNode {
+				return &tfv1.GPUNode{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "test-gpu-node",
+						Namespace: "default",
+					},
+					Status: tfv1.GPUNodeStatus{
+						Phase:       "", // Empty phase should be set to pending
+						TotalTFlops: resource.MustParse("50"),
+						TotalVRAM:   resource.MustParse("8Gi"),
+						TotalGPUs:   2,
+					},
+				}
+			},
+			totalTFlops:  resource.MustParse("100"),
+			totalVRAM:    resource.MustParse("16Gi"),
+			count:        4,
+			allDeviceIDs: []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3"},
+			expectError:  false,
+			validateResult: func(t *testing.T, originalNode, patchedNode *tfv1.GPUNode) {
+				// Verify status fields were updated
+				assert.Equal(t, resource.MustParse("100"), patchedNode.Status.TotalTFlops)
+				assert.Equal(t, resource.MustParse("16Gi"), patchedNode.Status.TotalVRAM)
+				assert.Equal(t, int32(4), patchedNode.Status.TotalGPUs)
+				assert.Equal(t, int32(4), patchedNode.Status.ManagedGPUs)
+				assert.Equal(t, []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3"}, patchedNode.Status.ManagedGPUDeviceIDs)
+				assert.Equal(t, tfv1.TensorFusionGPUNodePhasePending, patchedNode.Status.Phase)
+				// Verify NodeInfo was updated
+				assert.True(t, patchedNode.Status.NodeInfo.RAMSize.Value() > 0)
+				assert.True(t, patchedNode.Status.NodeInfo.DataDiskSize.Value() > 0)
+			},
+		},
+		{
+			name: "successful patch with existing phase preserved",
+			setupGPUNode: func() *tfv1.GPUNode {
+				return &tfv1.GPUNode{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "test-gpu-node-running",
+						Namespace: "default",
+					},
+					Status: tfv1.GPUNodeStatus{
+						Phase:       tfv1.TensorFusionGPUNodePhaseRunning,
+						TotalTFlops: resource.MustParse("200"),
+						TotalVRAM:   resource.MustParse("32Gi"),
+						TotalGPUs:   8,
+					},
+				}
+			},
+			totalTFlops:  resource.MustParse("150"),
+			totalVRAM:    resource.MustParse("24Gi"),
+			count:        6,
+			allDeviceIDs: []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5"},
+			expectError:  false,
+			validateResult: func(t *testing.T, originalNode, patchedNode *tfv1.GPUNode) {
+				// Verify status fields were updated
+				assert.Equal(t, resource.MustParse("150"), patchedNode.Status.TotalTFlops)
+				assert.Equal(t, resource.MustParse("24Gi"), patchedNode.Status.TotalVRAM)
+				assert.Equal(t, int32(6), patchedNode.Status.TotalGPUs)
+				assert.Equal(t, int32(6), patchedNode.Status.ManagedGPUs)
+				assert.Equal(t, []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5"}, patchedNode.Status.ManagedGPUDeviceIDs)
+				// Verify existing phase was preserved
+				assert.Equal(t, tfv1.TensorFusionGPUNodePhaseRunning, patchedNode.Status.Phase)
+			},
+		},
+		{
+			name: "zero resources handled correctly",
+			setupGPUNode: func() *tfv1.GPUNode {
+				return &tfv1.GPUNode{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "test-gpu-node-zero",
+						Namespace: "default",
+					},
+					Status: tfv1.GPUNodeStatus{
+						Phase: "",
+					},
+				}
+			},
+			totalTFlops:  resource.MustParse("0"),
+			totalVRAM:    resource.MustParse("0"),
+			count:        0,
+			allDeviceIDs: []string{},
+			expectError:  false,
+			validateResult: func(t *testing.T, originalNode, patchedNode *tfv1.GPUNode) {
+				assert.Equal(t, resource.MustParse("0"), patchedNode.Status.TotalTFlops)
+				assert.Equal(t, resource.MustParse("0"), patchedNode.Status.TotalVRAM)
+				assert.Equal(t, int32(0), patchedNode.Status.TotalGPUs)
+				assert.Equal(t, int32(0), patchedNode.Status.ManagedGPUs)
+				assert.Empty(t, patchedNode.Status.ManagedGPUDeviceIDs)
+				assert.Equal(t, tfv1.TensorFusionGPUNodePhasePending, patchedNode.Status.Phase)
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+			gpuNode := tt.setupGPUNode()
+
+			// Setup fake client with the GPUNode
+			scheme := runtime.NewScheme()
+			_ = tfv1.AddToScheme(scheme)
+			k8sClient := fake.NewClientBuilder().
+				WithScheme(scheme).
+				WithStatusSubresource(&tfv1.GPUNode{}).
+				WithObjects(gpuNode).
+				Build()
+
+			// Store original state for comparison
+			originalNode := gpuNode.DeepCopy()
+
+			// Call the function under test
+			err := patchGPUNodeStatus(k8sClient, ctx, gpuNode, tt.totalTFlops, tt.totalVRAM, tt.count, tt.allDeviceIDs)
+
+			// Verify error expectation
+			if tt.expectError {
+				assert.Error(t, err, "Expected an error but got none")
+				return
+			}
+			assert.NoError(t, err, "Unexpected error")
+
+			// Get the updated GPUNode from the client to verify the patch was applied
+			updatedNode := &tfv1.GPUNode{}
+			err = k8sClient.Get(ctx, client.ObjectKeyFromObject(gpuNode), updatedNode)
+			assert.NoError(t, err, "Failed to get updated GPUNode")
+
+			// Run custom validation
+			if tt.validateResult != nil {
+				tt.validateResult(t, originalNode, updatedNode)
+			}
+		})
+	}
+}
+
+func TestPatchGPUNodeStatus_ErrorScenarios(t *testing.T) {
+	tests := []struct {
+		name         string
+		setupClient  func() client.Client
+		setupGPUNode func() *tfv1.GPUNode
+		expectedErr  string
+	}{
+		{
+			name: "GPUNode not found error",
+			setupClient: func() client.Client {
+				// Create client without the GPUNode object
+				scheme := runtime.NewScheme()
+				_ = tfv1.AddToScheme(scheme)
+				return fake.NewClientBuilder().
+					WithScheme(scheme).
+					WithStatusSubresource(&tfv1.GPUNode{}).
+					Build()
+			},
+			setupGPUNode: func() *tfv1.GPUNode {
+				return &tfv1.GPUNode{
+					ObjectMeta: metav1.ObjectMeta{
+						Name:      "nonexistent-gpu-node",
+						Namespace: "default",
+					},
+				}
+			},
+			expectedErr: "not found",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+			k8sClient := tt.setupClient()
+			gpuNode := tt.setupGPUNode()
+
+			// Call the function under test
+			err := patchGPUNodeStatus(k8sClient, ctx, gpuNode,
+				resource.MustParse("100"),
+				resource.MustParse("16Gi"),
+				4,
+				[]string{"gpu-0", "gpu-1", "gpu-2", "gpu-3"})
+
+			// Verify the expected error occurred
+			assert.Error(t, err, "Expected an error but got none")
+			assert.Contains(t, err.Error(), tt.expectedErr, "Error message should contain expected text")
+		})
+	}
+}
+
+func TestPatchGPUNodeStatus_Integration(t *testing.T) {
+	// Integration test that verifies the complete flow
+	ctx := context.Background()
+
+	// Setup initial GPUNode
+	gpuNode := &tfv1.GPUNode{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "integration-test-node",
+			Namespace: "default",
+		},
+		Status: tfv1.GPUNodeStatus{
+			Phase:               "",
+			TotalTFlops:         resource.MustParse("10"),
+			TotalVRAM:           resource.MustParse("2Gi"),
+			TotalGPUs:           1,
+			ManagedGPUs:         0, // Different from TotalGPUs to test sync
+			ManagedGPUDeviceIDs: []string{"old-device"},
+			NodeInfo: tfv1.GPUNodeInfo{
+				RAMSize:      resource.MustParse("1Gi"),
+				DataDiskSize: resource.MustParse("1Gi"),
+			},
+		},
+	}
+
+	// Setup fake client
+	scheme := runtime.NewScheme()
+	_ = tfv1.AddToScheme(scheme)
+	k8sClient := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithStatusSubresource(&tfv1.GPUNode{}).
+		WithObjects(gpuNode).
+		Build()
+
+	// Test multiple sequential patches to verify state consistency
+	updates := []struct {
+		totalTFlops  resource.Quantity
+		totalVRAM    resource.Quantity
+		count        int32
+		allDeviceIDs []string
+	}{
+		{
+			totalTFlops:  resource.MustParse("100"),
+			totalVRAM:    resource.MustParse("16Gi"),
+			count:        4,
+			allDeviceIDs: []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3"},
+		},
+		{
+			totalTFlops:  resource.MustParse("200"),
+			totalVRAM:    resource.MustParse("32Gi"),
+			count:        8,
+			allDeviceIDs: []string{"gpu-0", "gpu-1", "gpu-2", "gpu-3", "gpu-4", "gpu-5", "gpu-6", "gpu-7"},
+		},
+		{
+			totalTFlops:  resource.MustParse("50"),
+			totalVRAM:    resource.MustParse("8Gi"),
+			count:        2,
+			allDeviceIDs: []string{"gpu-0", "gpu-1"},
+		},
+	}
+
+	for i, update := range updates {
+		t.Run(fmt.Sprintf("update_%d", i+1), func(t *testing.T) {
+			// Apply the patch
+			err := patchGPUNodeStatus(k8sClient, ctx, gpuNode, update.totalTFlops, update.totalVRAM, update.count, update.allDeviceIDs)
+			assert.NoError(t, err, "Patch should succeed")
+
+			// Verify the update was applied
+			updatedNode := &tfv1.GPUNode{}
+			err = k8sClient.Get(ctx, client.ObjectKeyFromObject(gpuNode), updatedNode)
+			assert.NoError(t, err, "Should be able to get updated node")
+
+			// Verify all fields were updated correctly
+			assert.Equal(t, update.totalTFlops, updatedNode.Status.TotalTFlops)
+			assert.Equal(t, update.totalVRAM, updatedNode.Status.TotalVRAM)
+			assert.Equal(t, update.count, updatedNode.Status.TotalGPUs)
+			assert.Equal(t, update.count, updatedNode.Status.ManagedGPUs)
+			assert.Equal(t, update.allDeviceIDs, updatedNode.Status.ManagedGPUDeviceIDs)
+
+			// Phase starts empty, so the first update is expected to set it to pending; later updates must leave it pending
+			if i == 0 {
+				assert.Equal(t, tfv1.TensorFusionGPUNodePhasePending, updatedNode.Status.Phase)
+			} else {
+				// Should remain pending on subsequent updates
+				assert.Equal(t, tfv1.TensorFusionGPUNodePhasePending, updatedNode.Status.Phase)
+			}
+
+			// NodeInfo should be updated with system values
+			assert.True(t, updatedNode.Status.NodeInfo.RAMSize.Value() > 0)
+			assert.True(t, updatedNode.Status.NodeInfo.DataDiskSize.Value() > 0)
+		})
+	}
+}
diff --git a/config/crd/bases/tensor-fusion.ai_gpunodes.yaml b/config/crd/bases/tensor-fusion.ai_gpunodes.yaml