Skip to content

Commit 22e7d24

Browse files
authored
fix: gpu info matching and node discovery issues (#169)
* fix: chart issue
* fix: gpu info auto reload
* fix: backoff retry in node discovery job, fail when no GPU info found
* fix: lint issues, upgrade linter version
1 parent faff97e commit 22e7d24

File tree

22 files changed

+209
-103
lines changed

22 files changed

+209
-103
lines changed

.github/workflows/lint.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,6 @@ jobs:
3434
go-version: '~1.24'
3535

3636
- name: Run linter
37-
uses: golangci/golangci-lint-action@v6
37+
uses: golangci/golangci-lint-action@v7
3838
with:
39-
version: v1.64.8
39+
version: v2.1.5

.golangci.yml

Lines changed: 30 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,15 @@
1+
version: "2"
12
run:
2-
timeout: 5m
33
allow-parallel-runners: true
4-
5-
issues:
6-
# don't skip warning about doc comments
7-
# don't exclude the default set of lint
8-
exclude-use-default: false
9-
# restore some of the defaults
10-
# (fill in the rest as needed)
11-
exclude-rules:
12-
- path: "api/*"
13-
linters:
14-
- lll
15-
- path: "internal/*"
16-
linters:
17-
- dupl
18-
- lll
194
linters:
20-
disable-all: true
5+
default: none
216
enable:
7+
- copyloopvar
228
- dupl
239
- errcheck
24-
- copyloopvar
2510
- ginkgolinter
2611
- goconst
2712
- gocyclo
28-
- gofmt
29-
- goimports
30-
- gosimple
3113
- govet
3214
- ineffassign
3315
- lll
@@ -36,12 +18,34 @@ linters:
3618
- prealloc
3719
- revive
3820
- staticcheck
39-
- typecheck
4021
- unconvert
4122
- unparam
4223
- unused
43-
44-
linters-settings:
45-
revive:
24+
settings:
25+
revive:
26+
rules:
27+
- name: comment-spacings
28+
exclusions:
29+
generated: lax
4630
rules:
47-
- name: comment-spacings
31+
- linters:
32+
- lll
33+
path: api/*
34+
- linters:
35+
- dupl
36+
- lll
37+
path: internal/*
38+
paths:
39+
- third_party$
40+
- builtin$
41+
- examples$
42+
formatters:
43+
enable:
44+
- gofmt
45+
- goimports
46+
exclusions:
47+
generated: lax
48+
paths:
49+
- third_party$
50+
- builtin$
51+
- examples$

.vscode/launch.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
"ENABLE_WEBHOOKS": "false"
1414
},
1515
"program": "${workspaceFolder}/cmd/main.go",
16+
"args": ["--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml"]
1617
},
1718
{
1819
"name": "Debug Discovery",

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.2.18
18+
version: 1.2.19
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/templates/gpu-public-gpu-info.yaml

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,42 +32,54 @@ data:
3232
fp16TFlops: 65
3333
3434
# Ampere Architecture Series
35-
- model: A100_SXM
35+
- model: A100_SXM_80G
3636
fullModelName: "NVIDIA A100-SXM4-80GB"
3737
vendor: NVIDIA
3838
costPerHour: 1.89
3939
fp16TFlops: 312
4040
41-
- model: A100_PCIe
41+
- model: A100_PCIe_80G
4242
fullModelName: "NVIDIA A100 80GB PCIe"
4343
vendor: NVIDIA
4444
costPerHour: 1.64
4545
fp16TFlops: 312
4646
47-
- model: A100_40G_SXM
48-
fullModelName: "NVIDIA A100-SXM4-40G"
47+
- model: A100_SXM_40G
48+
fullModelName: "NVIDIA A100-SXM4-40GB"
4949
vendor: NVIDIA
5050
costPerHour: 1.4
5151
fp16TFlops: 312
5252
53-
- model: A100_40G_PCIe
53+
- model: A100_PCIe_40G
5454
fullModelName: "NVIDIA A100 40GB PCIe"
5555
vendor: NVIDIA
5656
costPerHour: 1.2
5757
fp16TFlops: 312
5858
59-
- model: A800_SXM
60-
fullModelName: "NVIDIA A800-SXM4-40G"
59+
- model: A800_SXM_40G
60+
fullModelName: "NVIDIA A800-SXM4-40GB"
61+
vendor: NVIDIA
62+
costPerHour: 1.89
63+
fp16TFlops: 312
64+
65+
- model: A800_SXM_80G
66+
fullModelName: "NVIDIA A800-SXM4-80GB"
6167
vendor: NVIDIA
6268
costPerHour: 1.89
6369
fp16TFlops: 312
6470
65-
- model: A800_PCIe
71+
- model: A800_PCIe_80G
6672
fullModelName: "NVIDIA A800 80GB PCIe"
6773
vendor: NVIDIA
6874
costPerHour: 1.64
6975
fp16TFlops: 312
7076
77+
- model: A800_PCIe_40G
78+
fullModelName: "NVIDIA A800 40GB PCIe"
79+
vendor: NVIDIA
80+
costPerHour: 1.64
81+
fp16TFlops: 312
82+
7183
- model: A10
7284
fullModelName: "NVIDIA A10"
7385
vendor: NVIDIA

cmd/main.go

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"fmt"
2424
"os"
2525
"strings"
26+
"time"
2627

2728
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
2829
// to ensure that exec-entrypoint and run can make use of them.
@@ -116,6 +117,9 @@ func main() {
116117
gpuInfos = make([]config.GpuInfo, 0)
117118
}
118119

120+
// Watch configMap change with interval, check lastModifiedTime to reload gpuInfoConfig
121+
watchGPUInfoChanges(gpuInfoConfig, &gpuInfos)
122+
119123
// Metrics endpoint is enabled in 'config/default/kustomization.yaml'. The Metrics options configure the server.
120124
// More info:
121125
// - https://pkg.go.dev/sigs.k8s.io/[email protected]/pkg/metrics/server
@@ -269,7 +273,7 @@ func main() {
269273
Scheme: mgr.GetScheme(),
270274
Scheduler: scheduler,
271275
Recorder: mgr.GetEventRecorderFor("tensorfusionworkload"),
272-
GpuInfos: gpuInfos,
276+
GpuInfos: &gpuInfos,
273277
}).SetupWithManager(mgr); err != nil {
274278
setupLog.Error(err, "unable to create controller", "controller", "TensorFusionWorkload")
275279
os.Exit(1)
@@ -311,6 +315,41 @@ func main() {
311315
}
312316
}
313317

318+
func watchGPUInfoChanges(gpuInfoConfig string, gpuInfos *[]config.GpuInfo) {
319+
var lastModTime time.Time
320+
if fileInfo, err := os.Stat(gpuInfoConfig); err == nil {
321+
lastModTime = fileInfo.ModTime()
322+
}
323+
324+
go func() {
325+
ticker := time.NewTicker(15 * time.Second)
326+
defer ticker.Stop()
327+
328+
for range ticker.C {
329+
// Check if file has been modified
330+
fileInfo, err := os.Stat(gpuInfoConfig)
331+
if err != nil {
332+
ctrl.Log.Error(err, "unable to stat gpuInfo file", "gpuInfoConfig", gpuInfoConfig)
333+
continue
334+
}
335+
336+
currentModTime := fileInfo.ModTime()
337+
if currentModTime.After(lastModTime) {
338+
ctrl.Log.Info("gpuInfo file modified, reloading.")
339+
updatedGpuInfos, err := config.LoadGpuInfoFromFile(gpuInfoConfig)
340+
if err != nil {
341+
ctrl.Log.Error(err, "unable to reload gpuInfo file", "gpuInfoConfig", gpuInfoConfig)
342+
continue
343+
}
344+
345+
*gpuInfos = updatedGpuInfos
346+
lastModTime = currentModTime
347+
ctrl.Log.Info("gpuInfo reloaded successfully.", "gpuInfoConfig", gpuInfoConfig)
348+
}
349+
}
350+
}()
351+
}
352+
314353
// only for local development, won't set KUBECONFIG env var in none local environments
315354
func normalizeKubeConfigEnv() {
316355
cfgPath := os.Getenv("KUBECONFIG")

cmd/nodediscovery/main.go

Lines changed: 58 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2525
"k8s.io/client-go/rest"
2626
"k8s.io/client-go/tools/clientcmd"
27+
"k8s.io/client-go/util/retry"
2728
ctrl "sigs.k8s.io/controller-runtime"
2829
"sigs.k8s.io/controller-runtime/pkg/client"
2930
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -140,28 +141,23 @@ func main() {
140141
})
141142
tflops := info.Fp16TFlops
142143
if !ok {
143-
tflops = resource.Quantity{}
144-
ctrl.Log.Info("unable to find GPU info from config", "deviceName", deviceName, "uuid", uuid)
144+
ctrl.Log.Info(
145+
"[Error] Unknown GPU model, please update `gpu-public-gpu-info` configMap "+
146+
" to match your GPU model name in `nvidia-smi`, this may cause you workload stuck, "+
147+
"refer this doc to resolve it in detail: "+
148+
"https://tensor-fusion.ai/guide/troubleshooting/handbook"+
149+
"#pod-stuck-in-starting-status-after-enabling-tensorfusion",
150+
"deviceName", deviceName, "uuid", uuid)
151+
os.Exit(1)
145152
} else {
146-
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "baseline FP16 TFlops", tflops, "uuid", uuid)
153+
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
147154
}
148155
gpu := &tfv1.GPU{
149156
ObjectMeta: metav1.ObjectMeta{
150157
Name: uuid,
151-
Labels: map[string]string{
152-
constants.LabelKeyOwner: gpunode.Name,
153-
},
154-
Annotations: map[string]string{
155-
constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
156-
},
157158
},
158159
}
159160

160-
if err := controllerutil.SetControllerReference(gpunode, gpu, Scheme); err != nil {
161-
ctrl.Log.Error(err, "failed to set controller reference")
162-
os.Exit(1)
163-
}
164-
165161
gpuStatus := tfv1.GPUStatus{
166162
Phase: tfv1.TensorFusionGPUPhaseRunning,
167163
Capacity: &tfv1.Resource{
@@ -174,11 +170,30 @@ func main() {
174170
"kubernetes.io/hostname": k8sNodeName,
175171
},
176172
}
177-
_, err = controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error { return nil })
173+
174+
err = retry.OnError(retry.DefaultBackoff, func(err error) bool {
175+
return true // Retry on all errors for now
176+
}, func() error {
177+
_, err := controllerutil.CreateOrUpdate(ctx, k8sClient, gpu, func() error {
178+
// Set metadata fields
179+
gpu.Labels = map[string]string{
180+
constants.LabelKeyOwner: gpunode.Name,
181+
}
182+
gpu.Annotations = map[string]string{
183+
constants.GPULastReportTimeAnnotationKey: time.Now().Format(time.RFC3339),
184+
}
185+
186+
// Set controller reference
187+
return controllerutil.SetControllerReference(gpunode, gpu, Scheme)
188+
})
189+
return err
190+
})
191+
178192
if err != nil {
179-
ctrl.Log.Error(err, "failed to create GPU", "gpu", gpu)
193+
ctrl.Log.Error(err, "failed to create or update GPU after retries", "gpu", gpu)
180194
os.Exit(1)
181195
}
196+
182197
available := gpuStatus.Available
183198
gpu.Status = gpuStatus
184199
if available == nil {
@@ -187,8 +202,19 @@ func main() {
187202
gpu.Status.Available = available
188203
}
189204

190-
if err := k8sClient.Status().Patch(ctx, gpu, client.Merge); err != nil {
191-
ctrl.Log.Error(err, "failed to update status of GPU", "gpu", gpu)
205+
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
206+
currentGPU := &tfv1.GPU{}
207+
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpu), currentGPU); err != nil {
208+
return err
209+
}
210+
211+
currentGPU.Status = gpu.Status
212+
213+
return k8sClient.Status().Update(ctx, currentGPU)
214+
})
215+
216+
if err != nil {
217+
ctrl.Log.Error(err, "failed to update status of GPU after retries", "gpu", gpu)
192218
os.Exit(1)
193219
}
194220

@@ -209,8 +235,20 @@ func main() {
209235
ns.NodeInfo.RAMSize = *resource.NewQuantity(getTotalHostRAM(), resource.DecimalSI)
210236
ns.NodeInfo.DataDiskSize = *resource.NewQuantity(getDiskInfo(constants.TFDataPath), resource.DecimalSI)
211237
gpunode.Status = *ns
212-
if err := k8sClient.Status().Patch(ctx, gpunode, client.Merge); err != nil {
213-
ctrl.Log.Error(err, "failed to update status of GPUNode")
238+
239+
err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
240+
currentGPUNode := &tfv1.GPUNode{}
241+
if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpunode), currentGPUNode); err != nil {
242+
return err
243+
}
244+
245+
currentGPUNode.Status = *ns
246+
247+
return k8sClient.Status().Update(ctx, currentGPUNode)
248+
})
249+
250+
if err != nil {
251+
ctrl.Log.Error(err, "failed to update status of GPUNode after retries")
214252
os.Exit(1)
215253
}
216254
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
- model: T4
2+
fullModelName: "Tesla T4"
3+
vendor: NVIDIA
4+
costPerHour: 0.53
5+
fp16TFlops: 65
6+
7+
- model: A100_SXM
8+
fullModelName: "NVIDIA A100-SXM4-80GB"
9+
vendor: NVIDIA
10+
costPerHour: 1.89
11+
fp16TFlops: 312

internal/cloudprovider/alibaba/ecs.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ func (p AlibabaGPUNodeProvider) TestConnection() error {
7272
request := ecs.CreateDescribeRegionsRequest()
7373
_, err := p.client.DescribeRegions(request)
7474
if err != nil {
75-
return fmt.Errorf("Can not connect to Aliyun ECS API: %v", err)
75+
return fmt.Errorf("can not connect to Aliyun ECS API: %v", err)
7676
}
7777
fmt.Printf("Successfully connected to Aliyun ECS. Available regions got")
7878
return nil

internal/config/gpu_info.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ func LoadGpuInfoFromFile(filename string) ([]GpuInfo, error) {
2828
return infos, nil
2929
}
3030

31-
func MockGpuInfo() []GpuInfo {
32-
return []GpuInfo{
31+
func MockGpuInfo() *[]GpuInfo {
32+
return &[]GpuInfo{
3333
{
3434
Model: "mock",
3535
Vendor: "mock",

0 commit comments

Comments
 (0)