fix: available field and owner reference issue in nodediscovery (#180)

knave · 施建 · web-flow · commit bd6444032d23 · 2025-05-12T15:41:12.000+08:00
* fix: available field and owner reference issue in nodediscovery

* fix: linter issues and upgrade linter version in makefile

---------

Co-authored-by: 施建 <shijian@mac3.local>
diff --git a/Makefile b/Makefile
@@ -174,7 +174,7 @@ GOLANGCI_LINT = $(LOCALBIN)/golangci-lint
 KUSTOMIZE_VERSION ?= v5.5.0
 CONTROLLER_TOOLS_VERSION ?= v0.16.4
 ENVTEST_VERSION ?= release-0.19
-GOLANGCI_LINT_VERSION ?= v1.64.8
+GOLANGCI_LINT_VERSION ?= v2.1.6
 
 .PHONY: kustomize
 kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary.
@@ -194,7 +194,7 @@ $(ENVTEST): $(LOCALBIN)
 .PHONY: golangci-lint
 golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary.
 $(GOLANGCI_LINT): $(LOCALBIN)
-	$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION))
+	$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/v2/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION))
 
 # go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
 # $1 - target path with name of binary
diff --git a/cmd/nodediscovery/main.go b/cmd/nodediscovery/main.go
@@ -152,71 +152,8 @@ func main() {
 		} else {
 			ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
 		}
-		gpu := &tfv1.GPU{
-			ObjectMeta: metav1.ObjectMeta{
-				Name: uuid,
-			},
-		}
-
-		gpuStatus := tfv1.GPUStatus{
-			Phase: tfv1.TensorFusionGPUPhaseRunning,
-			Capacity: &tfv1.Resource{
-				Vram:   resource.MustParse(fmt.Sprintf("%dKi", memInfo.Total/1024)),
-				Tflops: tflops,
-			},
-			UUID:     uuid,
-			GPUModel: deviceName,
-			NodeSelector: map[string]string{
-				"kubernetes.io/hostname": k8sNodeName,
-			},
-		}
-
-		err = retry.OnError(retry.DefaultBackoff, func(err error) bool {
-			return true // Retry on all errors for now
-		}, func() error {
-			_, err := controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error {
-				// Set metadata fields
-				gpu.Labels = map[string]string{
-					constants.LabelKeyOwner: gpunode.Name,
-				}
-				gpu.Annotations = map[string]string{
-					constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
-				}
-
-				// Set controller reference
-				return controllerutil.SetControllerReference(gpunode, gpu, Scheme)
-			})
-			return err
-		})
-
-		if err != nil {
-			ctrl.Log.Error(err, "failed to create or update GPU after retries", "gpu", gpu)
-			os.Exit(1)
-		}
-
-		available := gpuStatus.Available
-		gpu.Status = gpuStatus
-		if available == nil {
-			gpu.Status.Available = gpuStatus.Capacity
-		} else {
-			gpu.Status.Available = available
-		}
 
-		err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
-			currentGPU := &tfv1.GPU{}
-			if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpu), currentGPU); err != nil {
-				return err
-			}
-
-			currentGPU.Status = gpu.Status
-
-			return k8sClient.Status().Update(ctx, currentGPU)
-		})
-
-		if err != nil {
-			ctrl.Log.Error(err, "failed to update status of GPU after retries", "gpu", gpu)
-			os.Exit(1)
-		}
+		gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpunode, uuid, deviceName, memInfo, tflops)
 
 		totalTFlops.Add(gpu.Status.Capacity.Tflops)
 		totalVRAM.Add(gpu.Status.Capacity.Vram)
@@ -246,13 +183,82 @@ func main() {
 
 		return k8sClient.Status().Update(ctx, currentGPUNode)
 	})
-
 	if err != nil {
 		ctrl.Log.Error(err, "failed to update status of GPUNode after retries")
 		os.Exit(1)
 	}
 }
 
+func createOrUpdateTensorFusionGPU(
+	k8sClient client.Client, ctx context.Context, k8sNodeName string, gpunode *tfv1.GPUNode,
+	uuid string, deviceName string, memInfo nvml.Memory_v2, tflops resource.Quantity) *tfv1.GPU {
+	gpu := &tfv1.GPU{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: uuid,
+		},
+	}
+
+	err := retry.OnError(retry.DefaultBackoff, func(err error) bool {
+		return true // Retry on all errors for now
+	}, func() error {
+		_, err := controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error {
+			// Set metadata fields
+			gpu.Labels = map[string]string{
+				constants.LabelKeyOwner: gpunode.Name,
+			}
+			gpu.Annotations = map[string]string{
+				constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
+			}
+
+			if !metav1.IsControlledBy(gpu, gpunode) {
+				gpu.OwnerReferences = []metav1.OwnerReference{
+					*metav1.NewControllerRef(gpunode, gpunode.GroupVersionKind()),
+				}
+			}
+
+			return nil
+		})
+		return err
+	})
+	if err != nil {
+		ctrl.Log.Error(err, "failed to create or update GPU after retries", "gpu", gpu)
+		os.Exit(1)
+	}
+
+	err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpu), gpu); err != nil {
+			return err
+		}
+
+		newStatus := tfv1.GPUStatus{
+			Phase: tfv1.TensorFusionGPUPhaseRunning,
+			Capacity: &tfv1.Resource{
+				Vram:   resource.MustParse(fmt.Sprintf("%dKi", memInfo.Total/1024)),
+				Tflops: tflops,
+			},
+			UUID:     uuid,
+			GPUModel: deviceName,
+			NodeSelector: map[string]string{
+				"kubernetes.io/hostname": k8sNodeName,
+			},
+		}
+
+		if gpu.Status.Available == nil {
+			newStatus.Available = newStatus.Capacity
+		} else {
+			newStatus.Available = gpu.Status.Available
+		}
+		gpu.Status = newStatus
+		return k8sClient.Status().Update(ctx, gpu)
+	})
+	if err != nil {
+		ctrl.Log.Error(err, "failed to update status of GPU after retries", "gpu", gpu)
+		os.Exit(1)
+	}
+
+	return gpu
+}
+
 func nodeStatus(k8sNodeName string) *tfv1.GPUNodeStatus {
 	return &tfv1.GPUNodeStatus{
 		KubernetesNodeName: k8sNodeName,
@@ -309,7 +315,7 @@ func getDiskInfo(path string) (total int64) {
 	err = syscall.Statfs(absPath, &stat)
 	if err != nil {
 		if errors.Is(err, syscall.ENOENT) {
-			err = os.MkdirAll(absPath, 0755)
+			err = os.MkdirAll(absPath, 0o755)
 			if err != nil {
 				fmt.Printf("error creating folder: %s, err: %v\n", absPath, err)
 				return 0
diff --git a/cmd/nodediscovery/main_test.go b/cmd/nodediscovery/main_test.go
@@ -0,0 +1,107 @@
+package main
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
+	"github.com/NexusGPU/tensor-fusion/internal/constants"
+	"github.com/stretchr/testify/assert"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+)
+
+func TestCreateOrUpdateTensorFusionGPU(t *testing.T) {
+	// Setup test data
+	ctx := context.Background()
+	uuid := "test-uuid"
+	memInfo := nvml.Memory_v2{Total: 16 * 1024 * 1024 * 1024} // 16 GiB
+	tflops := resource.MustParse("100")
+	deviceName := "NVIDIA-Test-GPU"
+	k8sNodeName := "test-node"
+	gpuNodeName := "test-gpu-node"
+
+	gpuNode := &tfv1.GPUNode{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: gpuNodeName,
+		},
+	}
+
+	scheme := runtime.NewScheme()
+	_ = tfv1.AddToScheme(scheme)
+
+	k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(&tfv1.GPU{}).Build()
+
+	gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+
+	// Assertions
+	assert.NotNil(t, gpu, "GPU object should not be nil")
+	assert.Equal(t, uuid, gpu.Name, "GPU name should match UUID")
+	assert.Equal(t, deviceName, gpu.Status.GPUModel, "GPU model should match device name")
+	assert.Equal(t, tflops, gpu.Status.Capacity.Tflops, "GPU TFlops should match")
+	assert.Equal(t, resource.MustParse("16384Mi"), gpu.Status.Capacity.Vram, "GPU VRAM should match")
+	assert.Equal(t, gpu.Status.Capacity, gpu.Status.Available, "Available resources should match capacity")
+	assert.Equal(t, map[string]string{"kubernetes.io/hostname": k8sNodeName},
+		gpu.Status.NodeSelector, "Node selector should match")
+	assert.Equal(t, tfv1.TensorFusionGPUPhaseRunning, gpu.Status.Phase, "GPU phase should be running")
+
+	// Verify labels and annotations
+	assert.Equal(t, map[string]string{constants.LabelKeyOwner: gpuNodeName}, gpu.Labels, "GPU labels should match")
+	assert.Contains(t, gpu.Annotations, constants.GPULastReportTimeAnnotationKey,
+		"GPU annotations should contain last report time")
+	_, err := time.Parse(time.RFC3339, gpu.Annotations[constants.GPULastReportTimeAnnotationKey])
+	assert.NoError(t, err, "Last report time annotation should be a valid RFC3339 timestamp")
+
+	// Verify the Available field does not change after the update
+	gpu.Status.Available.Tflops.Sub(resource.MustParse("1000"))
+	gpu.Status.Available.Vram.Sub(resource.MustParse("2000Mi"))
+	err = k8sClient.Status().Update(ctx, gpu)
+	assert.NoError(t, err)
+
+	tflops.Add(resource.MustParse("100"))
+	updatedGpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+	assert.NotEqual(t, updatedGpu.Status.Capacity, gpu.Status.Capacity, "GPU capacity should not match")
+	assert.Equal(t, updatedGpu.Status.Available.Tflops, gpu.Status.Available.Tflops, "GPU TFlops should match")
+	assert.Equal(t, updatedGpu.Status.Available.Vram, gpu.Status.Available.Vram, "GPU VRAM should match")
+}
+
+func TestGPUControllerReference(t *testing.T) {
+	// Setup test data
+	ctx := context.Background()
+	uuid := "test-uuid"
+	memInfo := nvml.Memory_v2{Total: 16 * 1024 * 1024 * 1024} // 16 GiB
+	tflops := resource.MustParse("100")
+	deviceName := "NVIDIA-Test-GPU"
+	k8sNodeName := "test-node"
+	gpuNodeName := "test-gpu-node"
+
+	gpuNode := &tfv1.GPUNode{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: gpuNodeName,
+			UID:  "mock-uid",
+		},
+	}
+
+	scheme := runtime.NewScheme()
+	_ = tfv1.AddToScheme(scheme)
+
+	k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(&tfv1.GPU{}).Build()
+
+	gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
+	assert.True(t, metav1.IsControlledBy(gpu, gpuNode))
+
+	newGpuNode := &tfv1.GPUNode{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "new-test-gpu-node",
+			UID:  "new-mock-uid",
+		},
+	}
+
+	gpu = createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, newGpuNode, uuid, deviceName, memInfo, tflops)
+	assert.True(t, metav1.IsControlledBy(gpu, newGpuNode))
+	assert.False(t, metav1.IsControlledBy(gpu, gpuNode))
+}