Skip to content

Commit bd64440

Browse files
knave施建
andauthored
fix: available field and owner reference issue in nodediscovery (#180)
* fix: available field and owner reference issue in nodediscovery * fix: linter issues and upgrade linter verison in makefile --------- Co-authored-by: 施建 <[email protected]>
1 parent 488c7e9 commit bd64440

File tree

3 files changed

+181
-68
lines changed

3 files changed

+181
-68
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ GOLANGCI_LINT = $(LOCALBIN)/golangci-lint
174174
KUSTOMIZE_VERSION ?= v5.5.0
175175
CONTROLLER_TOOLS_VERSION ?= v0.16.4
176176
ENVTEST_VERSION ?= release-0.19
177-
GOLANGCI_LINT_VERSION ?= v1.64.8
177+
GOLANGCI_LINT_VERSION ?= v2.1.6
178178

179179
.PHONY: kustomize
180180
kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary.
@@ -194,7 +194,7 @@ $(ENVTEST): $(LOCALBIN)
194194
.PHONY: golangci-lint
195195
golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary.
196196
$(GOLANGCI_LINT): $(LOCALBIN)
197-
$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION))
197+
$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/v2/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION))
198198

199199
# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
200200
# $1 - target path with name of binary

cmd/nodediscovery/main.go

Lines changed: 72 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -152,71 +152,8 @@ func main() {
152152
} else {
153153
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
154154
}
155-
gpu := &tfv1.GPU{
156-
ObjectMeta: metav1.ObjectMeta{
157-
Name: uuid,
158-
},
159-
}
160-
161-
gpuStatus := tfv1.GPUStatus{
162-
Phase: tfv1.TensorFusionGPUPhaseRunning,
163-
Capacity: &tfv1.Resource{
164-
Vram: resource.MustParse(fmt.Sprintf("%dKi", memInfo.Total/1024)),
165-
Tflops: tflops,
166-
},
167-
UUID: uuid,
168-
GPUModel: deviceName,
169-
NodeSelector: map[string]string{
170-
"kubernetes.io/hostname": k8sNodeName,
171-
},
172-
}
173-
174-
err = retry.OnError(retry.DefaultBackoff, func(err error) bool {
175-
return true // Retry on all errors for now
176-
}, func() error {
177-
_, err := controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error {
178-
// Set metadata fields
179-
gpu.Labels = map[string]string{
180-
constants.LabelKeyOwner: gpunode.Name,
181-
}
182-
gpu.Annotations = map[string]string{
183-
constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
184-
}
185-
186-
// Set controller reference
187-
return controllerutil.SetControllerReference(gpunode, gpu, Scheme)
188-
})
189-
return err
190-
})
191-
192-
if err != nil {
193-
ctrl.Log.Error(err, "failed to create or update GPU after retries", "gpu", gpu)
194-
os.Exit(1)
195-
}
196-
197-
available := gpuStatus.Available
198-
gpu.Status = gpuStatus
199-
if available == nil {
200-
gpu.Status.Available = gpuStatus.Capacity
201-
} else {
202-
gpu.Status.Available = available
203-
}
204155

205-
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
206-
currentGPU := &tfv1.GPU{}
207-
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpu), currentGPU); err != nil {
208-
return err
209-
}
210-
211-
currentGPU.Status = gpu.Status
212-
213-
return k8sClient.Status().Update(ctx, currentGPU)
214-
})
215-
216-
if err != nil {
217-
ctrl.Log.Error(err, "failed to update status of GPU after retries", "gpu", gpu)
218-
os.Exit(1)
219-
}
156+
gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpunode, uuid, deviceName, memInfo, tflops)
220157

221158
totalTFlops.Add(gpu.Status.Capacity.Tflops)
222159
totalVRAM.Add(gpu.Status.Capacity.Vram)
@@ -246,13 +183,82 @@ func main() {
246183

247184
return k8sClient.Status().Update(ctx, currentGPUNode)
248185
})
249-
250186
if err != nil {
251187
ctrl.Log.Error(err, "failed to update status of GPUNode after retries")
252188
os.Exit(1)
253189
}
254190
}
255191

192+
func createOrUpdateTensorFusionGPU(
193+
k8sClient client.Client, ctx context.Context, k8sNodeName string, gpunode *tfv1.GPUNode,
194+
uuid string, deviceName string, memInfo nvml.Memory_v2, tflops resource.Quantity) *tfv1.GPU {
195+
gpu := &tfv1.GPU{
196+
ObjectMeta: metav1.ObjectMeta{
197+
Name: uuid,
198+
},
199+
}
200+
201+
err := retry.OnError(retry.DefaultBackoff, func(err error) bool {
202+
return true // Retry on all errors for now
203+
}, func() error {
204+
_, err := controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error {
205+
// Set metadata fields
206+
gpu.Labels = map[string]string{
207+
constants.LabelKeyOwner: gpunode.Name,
208+
}
209+
gpu.Annotations = map[string]string{
210+
constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
211+
}
212+
213+
if !metav1.IsControlledBy(gpu, gpunode) {
214+
gpu.OwnerReferences = []metav1.OwnerReference{
215+
*metav1.NewControllerRef(gpunode, gpunode.GroupVersionKind()),
216+
}
217+
}
218+
219+
return nil
220+
})
221+
return err
222+
})
223+
if err != nil {
224+
ctrl.Log.Error(err, "failed to create or update GPU after retries", "gpu", gpu)
225+
os.Exit(1)
226+
}
227+
228+
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
229+
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpu), gpu); err != nil {
230+
return err
231+
}
232+
233+
newStatus := tfv1.GPUStatus{
234+
Phase: tfv1.TensorFusionGPUPhaseRunning,
235+
Capacity: &tfv1.Resource{
236+
Vram: resource.MustParse(fmt.Sprintf("%dKi", memInfo.Total/1024)),
237+
Tflops: tflops,
238+
},
239+
UUID: uuid,
240+
GPUModel: deviceName,
241+
NodeSelector: map[string]string{
242+
"kubernetes.io/hostname": k8sNodeName,
243+
},
244+
}
245+
246+
if gpu.Status.Available == nil {
247+
newStatus.Available = newStatus.Capacity
248+
} else {
249+
newStatus.Available = gpu.Status.Available
250+
}
251+
gpu.Status = newStatus
252+
return k8sClient.Status().Update(ctx, gpu)
253+
})
254+
if err != nil {
255+
ctrl.Log.Error(err, "failed to update status of GPU after retries", "gpu", gpu)
256+
os.Exit(1)
257+
}
258+
259+
return gpu
260+
}
261+
256262
func nodeStatus(k8sNodeName string) *tfv1.GPUNodeStatus {
257263
return &tfv1.GPUNodeStatus{
258264
KubernetesNodeName: k8sNodeName,
@@ -309,7 +315,7 @@ func getDiskInfo(path string) (total int64) {
309315
err = syscall.Statfs(absPath, &stat)
310316
if err != nil {
311317
if errors.Is(err, syscall.ENOENT) {
312-
err = os.MkdirAll(absPath, 0755)
318+
err = os.MkdirAll(absPath, 0o755)
313319
if err != nil {
314320
fmt.Printf("error creating folder: %s, err: %v\n", absPath, err)
315321
return 0

cmd/nodediscovery/main_test.go

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"testing"
6+
"time"
7+
8+
"github.com/NVIDIA/go-nvml/pkg/nvml"
9+
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
10+
"github.com/NexusGPU/tensor-fusion/internal/constants"
11+
"github.com/stretchr/testify/assert"
12+
"k8s.io/apimachinery/pkg/api/resource"
13+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
14+
"k8s.io/apimachinery/pkg/runtime"
15+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
16+
)
17+
18+
func TestCreateOrUpdateTensorFusionGPU(t *testing.T) {
19+
// Setup test data
20+
ctx := context.Background()
21+
uuid := "test-uuid"
22+
memInfo := nvml.Memory_v2{Total: 16 * 1024 * 1024 * 1024} // 16 GiB
23+
tflops := resource.MustParse("100")
24+
deviceName := "NVIDIA-Test-GPU"
25+
k8sNodeName := "test-node"
26+
gpuNodeName := "test-gpu-node"
27+
28+
gpuNode := &tfv1.GPUNode{
29+
ObjectMeta: metav1.ObjectMeta{
30+
Name: gpuNodeName,
31+
},
32+
}
33+
34+
scheme := runtime.NewScheme()
35+
_ = tfv1.AddToScheme(scheme)
36+
37+
k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(&tfv1.GPU{}).Build()
38+
39+
gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
40+
41+
// Assertions
42+
assert.NotNil(t, gpu, "GPU object should not be nil")
43+
assert.Equal(t, uuid, gpu.Name, "GPU name should match UUID")
44+
assert.Equal(t, deviceName, gpu.Status.GPUModel, "GPU model should match device name")
45+
assert.Equal(t, tflops, gpu.Status.Capacity.Tflops, "GPU TFlops should match")
46+
assert.Equal(t, resource.MustParse("16384Mi"), gpu.Status.Capacity.Vram, "GPU VRAM should match")
47+
assert.Equal(t, gpu.Status.Capacity, gpu.Status.Available, "Available resources should match capacity")
48+
assert.Equal(t, map[string]string{"kubernetes.io/hostname": k8sNodeName},
49+
gpu.Status.NodeSelector, "Node selector should match")
50+
assert.Equal(t, tfv1.TensorFusionGPUPhaseRunning, gpu.Status.Phase, "GPU phase should be running")
51+
52+
// Verify labels and annotations
53+
assert.Equal(t, map[string]string{constants.LabelKeyOwner: gpuNodeName}, gpu.Labels, "GPU labels should match")
54+
assert.Contains(t, gpu.Annotations, constants.GPULastReportTimeAnnotationKey,
55+
"GPU annotations should contain last report time")
56+
_, err := time.Parse(time.RFC3339, gpu.Annotations[constants.GPULastReportTimeAnnotationKey])
57+
assert.NoError(t, err, "Last report time annotation should be a valid RFC3339 timestamp")
58+
59+
// Verify the Available field does not change after the update
60+
gpu.Status.Available.Tflops.Sub(resource.MustParse("1000"))
61+
gpu.Status.Available.Vram.Sub(resource.MustParse("2000Mi"))
62+
err = k8sClient.Status().Update(ctx, gpu)
63+
assert.NoError(t, err)
64+
65+
tflops.Add(resource.MustParse("100"))
66+
updatedGpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
67+
assert.NotEqual(t, updatedGpu.Status.Capacity, gpu.Status.Capacity, "GPU capacity should not match")
68+
assert.Equal(t, updatedGpu.Status.Available.Tflops, gpu.Status.Available.Tflops, "GPU TFlops should match")
69+
assert.Equal(t, updatedGpu.Status.Available.Vram, gpu.Status.Available.Vram, "GPU VRAM should match")
70+
}
71+
72+
func TestGPUControllerReference(t *testing.T) {
73+
// Setup test data
74+
ctx := context.Background()
75+
uuid := "test-uuid"
76+
memInfo := nvml.Memory_v2{Total: 16 * 1024 * 1024 * 1024} // 16 GiB
77+
tflops := resource.MustParse("100")
78+
deviceName := "NVIDIA-Test-GPU"
79+
k8sNodeName := "test-node"
80+
gpuNodeName := "test-gpu-node"
81+
82+
gpuNode := &tfv1.GPUNode{
83+
ObjectMeta: metav1.ObjectMeta{
84+
Name: gpuNodeName,
85+
UID: "mock-uid",
86+
},
87+
}
88+
89+
scheme := runtime.NewScheme()
90+
_ = tfv1.AddToScheme(scheme)
91+
92+
k8sClient := fake.NewClientBuilder().WithScheme(scheme).WithStatusSubresource(&tfv1.GPU{}).Build()
93+
94+
gpu := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpuNode, uuid, deviceName, memInfo, tflops)
95+
assert.True(t, metav1.IsControlledBy(gpu, gpuNode))
96+
97+
newGpuNode := &tfv1.GPUNode{
98+
ObjectMeta: metav1.ObjectMeta{
99+
Name: "new-test-gpu-node",
100+
UID: "new-mock-uid",
101+
},
102+
}
103+
104+
gpu = createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, newGpuNode, uuid, deviceName, memInfo, tflops)
105+
assert.True(t, metav1.IsControlledBy(gpu, newGpuNode))
106+
assert.False(t, metav1.IsControlledBy(gpu, gpuNode))
107+
}

0 commit comments

Comments
 (0)