Skip to content

Commit 649c72d

Browse files
authored
fix: complement metrics items, fix event permission issue (#322)
* fix: complement metrics items * fix: add missing metrics fields * fix: potential invalid score issue * fix: support mobile gpu tflops discovery * fix: add v100 and fix EST pricing * fix: scheduler metrics zero issue * fix: lint issue * fix: events API forbidden issue
1 parent a0d6609 commit 649c72d

File tree

13 files changed

+101
-13
lines changed

13 files changed

+101
-13
lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@
103103
"nodeclassref",
104104
"noderesources",
105105
"nolint",
106+
"Nvlink",
106107
"NVML",
107108
"objs",
108109
"omitempty",

charts/tensor-fusion/templates/gpu-public-gpu-info.yaml

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,15 +84,15 @@ data:
8484
- model: A10
8585
fullModelName: "NVIDIA A10"
8686
vendor: NVIDIA
87-
costPerHour: 0.9
87+
costPerHour: 0.7
8888
fp16TFlops: 125
8989
90-
# A10G has less CUDA core than A10, but with RT cores for rendering case
90+
# A10G has more RT cores than A10, for rendering cases
9191
- model: A10G
9292
fullModelName: "NVIDIA A10G"
9393
vendor: NVIDIA
94-
costPerHour: 0.75 # from lambda labs
95-
fp16TFlops: 63
94+
costPerHour: 0.8
95+
fp16TFlops: 125
9696
9797
- model: A40
9898
fullModelName: "NVIDIA A40 48GB PCIe"
@@ -334,6 +334,18 @@ data:
334334
costPerHour: 1.00
335335
fp16TFlops: 148
336336
337+
- model: V100
338+
fullModelName: "NVIDIA V100"
339+
vendor: NVIDIA
340+
costPerHour: 0.7
341+
fp16TFlops: 125
342+
343+
- model: V100S
344+
fullModelName: "NVIDIA V100S"
345+
vendor: NVIDIA
346+
costPerHour: 0.8
347+
fp16TFlops: 130
348+
337349
# higher mem bandwidth and vram size (141G) than H100
338350
- model: H200
339351
fullModelName: "NVIDIA H200 80GB HBM3"
@@ -375,13 +387,13 @@ data:
375387
- model: RTX_5080
376388
fullModelName: "NVIDIA GeForce RTX 5080"
377389
vendor: NVIDIA
378-
costPerHour: 0.3
390+
costPerHour: 0.9
379391
fp16TFlops: 225
380392
381393
- model: RTX_5070Ti
382394
fullModelName: "NVIDIA GeForce RTX 5070 Ti"
383395
vendor: NVIDIA
384-
costPerHour: 0.4
396+
costPerHour: 0.5
385397
fp16TFlops: 177.4
386398
387399
- model: RTX_5070

charts/tensor-fusion/templates/rbac.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,17 @@ rules:
1616
- patch
1717
- update
1818
- watch
19+
- apiGroups:
20+
- events.k8s.io
21+
resources:
22+
- events
23+
verbs:
24+
- create
25+
- get
26+
- list
27+
- patch
28+
- update
29+
- watch
1930
- apiGroups:
2031
- ""
2132
resources:

charts/tensor-fusion/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ dynamicConfig:
224224
# retention period for metrics data
225225
metricsTTL: 30d
226226
metricsFormat: influx
227+
# extra pod labels to be added to metrics,
228+
# you can map label keys to other measure tags
229+
metricsExtraPodLabels: {}
227230

228231
# alert rules
229232
alertRules:

cmd/nodediscovery/main.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636
)
3737

3838
const TMP_PATH = "/tmp"
39+
const LAPTOP_GPU_SUFFIX = " Laptop GPU"
3940

4041
var Scheme = runtime.NewScheme()
4142

@@ -144,10 +145,17 @@ func main() {
144145
ctrl.Log.Error(errors.New(nvml.ErrorString(ret)), "unable to get memory info of device", "index", i)
145146
os.Exit(1)
146147
}
148+
149+
// Nvidia mobile series GPU chips are the same as desktop series GPU, but clock speed is lower
150+
// so we can use desktop series GPU info to represent mobile series GPU, and set available TFlops with a multiplier
151+
isLaptopGPU := strings.HasSuffix(deviceName, LAPTOP_GPU_SUFFIX)
152+
if isLaptopGPU {
153+
deviceName = strings.ReplaceAll(deviceName, LAPTOP_GPU_SUFFIX, "")
154+
ctrl.Log.Info("found mobile/laptop GPU, clock speed is lower, will set lower TFlops", "deviceName", deviceName)
155+
}
147156
info, ok := lo.Find(gpuInfo, func(info config.GpuInfo) bool {
148157
return info.FullModelName == deviceName
149158
})
150-
tflops := info.Fp16TFlops
151159
if !ok {
152160
ctrl.Log.Info(
153161
"[Error] Unknown GPU model, please update `gpu-public-gpu-info` configMap "+
@@ -157,9 +165,13 @@ func main() {
157165
"#pod-stuck-in-starting-status-after-enabling-tensorfusion",
158166
"deviceName", deviceName, "uuid", uuid)
159167
os.Exit(1)
160-
} else {
161-
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
162168
}
169+
tflops := info.Fp16TFlops
170+
if isLaptopGPU {
171+
tflops = resource.MustParse(fmt.Sprintf("%.2f",
172+
tflops.AsApproximateFloat64()*constants.MobileGpuClockSpeedMultiplier))
173+
}
174+
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
163175

164176
gpu, err := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpunode, uuid, deviceName, memInfo, tflops)
165177
if err != nil {

cmd/nodediscovery/main_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package main
33
import (
44
"context"
55
"fmt"
6+
"strings"
67
"testing"
78
"time"
89

@@ -82,6 +83,18 @@ func TestCreateOrUpdateTensorFusionGPU(t *testing.T) {
8283
assert.Equal(t, updatedGpu.Status.Available.Vram, gpu.Status.Available.Vram, "GPU VRAM should match")
8384
}
8485

86+
func TestParseLaptopGPU(t *testing.T) {
87+
deviceName := "NVIDIA-Test-GPU Laptop GPU"
88+
isLaptopGPU := strings.HasSuffix(deviceName, " Laptop GPU")
89+
assert.True(t, isLaptopGPU)
90+
deviceName = strings.ReplaceAll(deviceName, " Laptop GPU", "")
91+
assert.Equal(t, "NVIDIA-Test-GPU", deviceName)
92+
tflops := resource.MustParse("100.147")
93+
tflops = resource.MustParse(fmt.Sprintf("%.2f", tflops.AsApproximateFloat64()*constants.MobileGpuClockSpeedMultiplier))
94+
expected := resource.MustParse("75110m")
95+
assert.Equal(t, expected.String(), tflops.String())
96+
}
97+
8598
func TestGPUControllerReference(t *testing.T) {
8699
// Setup test data
87100
ctx := context.Background()

config/rbac/role.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,17 @@ rules:
114114
- patch
115115
- update
116116
- watch
117+
- apiGroups:
118+
- events.k8s.io
119+
resources:
120+
- events
121+
verbs:
122+
- create
123+
- get
124+
- list
125+
- patch
126+
- update
127+
- watch
117128
- apiGroups:
118129
- tensor-fusion.ai
119130
resources:

internal/constants/constants.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,3 +198,5 @@ const AuthorizationHeader = "Authorization"
198198
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"
199199

200200
const SchedulerSimulationKey = "simulate-schedule"
201+
202+
const MobileGpuClockSpeedMultiplier = 0.75

internal/controller/tensorfusioncluster_controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ type TensorFusionClusterReconciler struct {
6464
// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionclusters/status,verbs=get;update;patch
6565
// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionclusters/finalizers,verbs=update
6666
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch;update;list;watch;get
67+
// +kubebuilder:rbac:groups=events.k8s.io,resources=events,verbs=create;patch;update;list;watch;get
6768
// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete
6869
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
6970
// +kubebuilder:rbac:groups=batch,resources=cronjobs,verbs=get;list;watch

internal/gpuallocator/gpuallocator.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/NexusGPU/tensor-fusion/internal/config"
1919
"github.com/NexusGPU/tensor-fusion/internal/constants"
2020
"github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter"
21+
"github.com/NexusGPU/tensor-fusion/internal/metrics"
2122
"github.com/NexusGPU/tensor-fusion/internal/quota"
2223
"github.com/NexusGPU/tensor-fusion/internal/utils"
2324
"github.com/samber/lo"
@@ -256,6 +257,7 @@ func (s *GpuAllocator) Bind(
256257
// Use actual allocated GPU count instead of requested count
257258
s.quotaStore.AllocateQuota(req.WorkloadNameNamespace.Namespace, req)
258259
s.addAllocationMap(gpuNodeName, req.PodMeta)
260+
metrics.SetSchedulerMetrics(req.PoolName, true)
259261

260262
log.FromContext(s.ctx).Info("GPU allocation successful",
261263
"namespace", req.WorkloadNameNamespace.Namespace,
@@ -284,10 +286,12 @@ func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error) {
284286

285287
filteredGPUs, _, err := s.CheckQuotaAndFilter(s.ctx, req, false)
286288
if err != nil {
289+
metrics.SetSchedulerMetrics(req.PoolName, false)
287290
return nil, err
288291
}
289292
selectedGPUs, err := s.Select(req, filteredGPUs)
290293
if err != nil {
294+
metrics.SetSchedulerMetrics(req.PoolName, false)
291295
return nil, err
292296
}
293297

@@ -1258,5 +1262,12 @@ func (s *GpuAllocator) getPlacementMode(ctx context.Context, poolName string) tf
12581262

12591263
// normalize score to [0, 100]
12601264
func normalizeScore(cfg *config.GPUFitConfig, vramScore, tflopsScore float64) int {
1261-
return int(math.Round(vramScore*cfg.VramWeight + tflopsScore*cfg.TflopsWeight))
1265+
score := int(math.Round(vramScore*cfg.VramWeight + tflopsScore*cfg.TflopsWeight))
1266+
if score < 0 {
1267+
return 0
1268+
}
1269+
if score > 100 {
1270+
return 100
1271+
}
1272+
return score
12621273
}

0 commit comments

Comments
 (0)