Skip to content

Commit 649c72d

Browse files
authored
fix: complement metrics items, fix event permission issue (#322)
* fix: complement metrics items * fix: add missing metrics fields * fix: potential invalid score issue * fix: support mobile gpu tflops discovery * fix: add v100 and fix EST pricing * fix: scheduler metrics zero issue * fix: lint issue * fix: events API forbidden issue
1 parent a0d6609 commit 649c72d

File tree

13 files changed

+101
-13
lines changed

13 files changed

+101
-13
lines changed

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@
103103
"nodeclassref",
104104
"noderesources",
105105
"nolint",
106+
"Nvlink",
106107
"NVML",
107108
"objs",
108109
"omitempty",

charts/tensor-fusion/templates/gpu-public-gpu-info.yaml

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,15 +84,15 @@ data:
8484
- model: A10
8585
fullModelName: "NVIDIA A10"
8686
vendor: NVIDIA
87-
costPerHour: 0.9
87+
costPerHour: 0.7
8888
fp16TFlops: 125
8989
90-
# A10G has less CUDA core than A10, but with RT cores for rendering case
90+
# A10G has more RT cores than A10, for rendering cases
9191
- model: A10G
9292
fullModelName: "NVIDIA A10G"
9393
vendor: NVIDIA
94-
costPerHour: 0.75 # from lambda labs
95-
fp16TFlops: 63
94+
costPerHour: 0.8
95+
fp16TFlops: 125
9696
9797
- model: A40
9898
fullModelName: "NVIDIA A40 48GB PCIe"
@@ -334,6 +334,18 @@ data:
334334
costPerHour: 1.00
335335
fp16TFlops: 148
336336
337+
- model: V100
338+
fullModelName: "NVIDIA V100"
339+
vendor: NVIDIA
340+
costPerHour: 0.7
341+
fp16TFlops: 125
342+
343+
- model: V100S
344+
fullModelName: "NVIDIA V100S"
345+
vendor: NVIDIA
346+
costPerHour: 0.8
347+
fp16TFlops: 130
348+
337349
# higher mem bandwidth and vram size (141G) than H100
338350
- model: H200
339351
fullModelName: "NVIDIA H200 80GB HBM3"
@@ -375,13 +387,13 @@ data:
375387
- model: RTX_5080
376388
fullModelName: "NVIDIA GeForce RTX 5080"
377389
vendor: NVIDIA
378-
costPerHour: 0.3
390+
costPerHour: 0.9
379391
fp16TFlops: 225
380392
381393
- model: RTX_5070Ti
382394
fullModelName: "NVIDIA GeForce RTX 5070 Ti"
383395
vendor: NVIDIA
384-
costPerHour: 0.4
396+
costPerHour: 0.5
385397
fp16TFlops: 177.4
386398
387399
- model: RTX_5070

charts/tensor-fusion/templates/rbac.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,17 @@ rules:
1616
- patch
1717
- update
1818
- watch
19+
- apiGroups:
20+
- events.k8s.io
21+
resources:
22+
- events
23+
verbs:
24+
- create
25+
- get
26+
- list
27+
- patch
28+
- update
29+
- watch
1930
- apiGroups:
2031
- ""
2132
resources:

charts/tensor-fusion/values.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ dynamicConfig:
224224
# retention period for metrics data
225225
metricsTTL: 30d
226226
metricsFormat: influx
227+
# extra pod labels to be added to metrics,
228+
# you can map label keys to other measure tags
229+
metricsExtraPodLabels: {}
227230

228231
# alert rules
229232
alertRules:

cmd/nodediscovery/main.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636
)
3737

3838
const TMP_PATH = "/tmp"
39+
const LAPTOP_GPU_SUFFIX = " Laptop GPU"
3940

4041
var Scheme = runtime.NewScheme()
4142

@@ -144,10 +145,17 @@ func main() {
144145
ctrl.Log.Error(errors.New(nvml.ErrorString(ret)), "unable to get memory info of device", "index", i)
145146
os.Exit(1)
146147
}
148+
149+
// Nvidia mobile series GPU chips are the same as desktop series GPU, but clock speed is lower
150+
// so we can use desktop series GPU info to represent mobile series GPU, and set available TFlops with a multiplier
151+
isLaptopGPU := strings.HasSuffix(deviceName, LAPTOP_GPU_SUFFIX)
152+
if isLaptopGPU {
153+
deviceName = strings.ReplaceAll(deviceName, LAPTOP_GPU_SUFFIX, "")
154+
ctrl.Log.Info("found mobile/laptop GPU, clock speed is lower, will set lower TFlops", "deviceName", deviceName)
155+
}
147156
info, ok := lo.Find(gpuInfo, func(info config.GpuInfo) bool {
148157
return info.FullModelName == deviceName
149158
})
150-
tflops := info.Fp16TFlops
151159
if !ok {
152160
ctrl.Log.Info(
153161
"[Error] Unknown GPU model, please update `gpu-public-gpu-info` configMap "+
@@ -157,9 +165,13 @@ func main() {
157165
"#pod-stuck-in-starting-status-after-enabling-tensorfusion",
158166
"deviceName", deviceName, "uuid", uuid)
159167
os.Exit(1)
160-
} else {
161-
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
162168
}
169+
tflops := info.Fp16TFlops
170+
if isLaptopGPU {
171+
tflops = resource.MustParse(fmt.Sprintf("%.2f",
172+
tflops.AsApproximateFloat64()*constants.MobileGpuClockSpeedMultiplier))
173+
}
174+
ctrl.Log.Info("found GPU info from config", "deviceName", deviceName, "FP16 TFlops", tflops, "uuid", uuid)
163175

164176
gpu, err := createOrUpdateTensorFusionGPU(k8sClient, ctx, k8sNodeName, gpunode, uuid, deviceName, memInfo, tflops)
165177
if err != nil {

cmd/nodediscovery/main_test.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package main
33
import (
44
"context"
55
"fmt"
6+
"strings"
67
"testing"
78
"time"
89

@@ -82,6 +83,18 @@ func TestCreateOrUpdateTensorFusionGPU(t *testing.T) {
8283
assert.Equal(t, updatedGpu.Status.Available.Vram, gpu.Status.Available.Vram, "GPU VRAM should match")
8384
}
8485

86+
func TestParseLaptopGPU(t *testing.T) {
87+
deviceName := "NVIDIA-Test-GPU Laptop GPU"
88+
isLaptopGPU := strings.HasSuffix(deviceName, " Laptop GPU")
89+
assert.True(t, isLaptopGPU)
90+
deviceName = strings.ReplaceAll(deviceName, " Laptop GPU", "")
91+
assert.Equal(t, "NVIDIA-Test-GPU", deviceName)
92+
tflops := resource.MustParse("100.147")
93+
tflops = resource.MustParse(fmt.Sprintf("%.2f", tflops.AsApproximateFloat64()*constants.MobileGpuClockSpeedMultiplier))
94+
expected := resource.MustParse("75110m")
95+
assert.Equal(t, expected.String(), tflops.String())
96+
}
97+
8598
func TestGPUControllerReference(t *testing.T) {
8699
// Setup test data
87100
ctx := context.Background()

config/rbac/role.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,17 @@ rules:
114114
- patch
115115
- update
116116
- watch
117+
- apiGroups:
118+
- events.k8s.io
119+
resources:
120+
- events
121+
verbs:
122+
- create
123+
- get
124+
- list
125+
- patch
126+
- update
127+
- watch
117128
- apiGroups:
118129
- tensor-fusion.ai
119130
resources:

internal/constants/constants.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,3 +198,5 @@ const AuthorizationHeader = "Authorization"
198198
const ExtraVerificationInfoPodIDKey = "authentication.kubernetes.io/pod-uid"
199199

200200
const SchedulerSimulationKey = "simulate-schedule"
201+
202+
const MobileGpuClockSpeedMultiplier = 0.75

internal/controller/tensorfusioncluster_controller.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ type TensorFusionClusterReconciler struct {
6464
// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionclusters/status,verbs=get;update;patch
6565
// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=tensorfusionclusters/finalizers,verbs=update
6666
// +kubebuilder:rbac:groups=core,resources=events,verbs=create;patch;update;list;watch;get
67+
// +kubebuilder:rbac:groups=events.k8s.io,resources=events,verbs=create;patch;update;list;watch;get
6768
// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete
6869
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete
6970
// +kubebuilder:rbac:groups=batch,resources=cronjobs,verbs=get;list;watch

internal/gpuallocator/gpuallocator.go

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/NexusGPU/tensor-fusion/internal/config"
1919
"github.com/NexusGPU/tensor-fusion/internal/constants"
2020
"github.com/NexusGPU/tensor-fusion/internal/gpuallocator/filter"
21+
"github.com/NexusGPU/tensor-fusion/internal/metrics"
2122
"github.com/NexusGPU/tensor-fusion/internal/quota"
2223
"github.com/NexusGPU/tensor-fusion/internal/utils"
2324
"github.com/samber/lo"
@@ -256,6 +257,7 @@ func (s *GpuAllocator) Bind(
256257
// Use actual allocated GPU count instead of requested count
257258
s.quotaStore.AllocateQuota(req.WorkloadNameNamespace.Namespace, req)
258259
s.addAllocationMap(gpuNodeName, req.PodMeta)
260+
metrics.SetSchedulerMetrics(req.PoolName, true)
259261

260262
log.FromContext(s.ctx).Info("GPU allocation successful",
261263
"namespace", req.WorkloadNameNamespace.Namespace,
@@ -284,10 +286,12 @@ func (s *GpuAllocator) Alloc(req *tfv1.AllocRequest) ([]*tfv1.GPU, error) {
284286

285287
filteredGPUs, _, err := s.CheckQuotaAndFilter(s.ctx, req, false)
286288
if err != nil {
289+
metrics.SetSchedulerMetrics(req.PoolName, false)
287290
return nil, err
288291
}
289292
selectedGPUs, err := s.Select(req, filteredGPUs)
290293
if err != nil {
294+
metrics.SetSchedulerMetrics(req.PoolName, false)
291295
return nil, err
292296
}
293297

@@ -1258,5 +1262,12 @@ func (s *GpuAllocator) getPlacementMode(ctx context.Context, poolName string) tf
12581262

12591263
// normalize score to [0, 100]
12601264
func normalizeScore(cfg *config.GPUFitConfig, vramScore, tflopsScore float64) int {
1261-
return int(math.Round(vramScore*cfg.VramWeight + tflopsScore*cfg.TflopsWeight))
1265+
score := int(math.Round(vramScore*cfg.VramWeight + tflopsScore*cfg.TflopsWeight))
1266+
if score < 0 {
1267+
return 0
1268+
}
1269+
if score > 100 {
1270+
return 100
1271+
}
1272+
return score
12621273
}

0 commit comments

Comments
 (0)