
Commit 0389852

fix: skip gpu limiter not working issue, observability optimize (#350)

* fix: skip gpu limiter not working issue
* fix: avoid k8s QoS side effect for inject lib init container
* fix: potential panic issues
* fix: remove unused event

1 parent 9006e96 commit 0389852
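
Of the eight changed files, only six are rendered below, and the init-container change behind the second bullet is not among them. As background on why it matters: Kubernetes derives a Pod's QoS class from every container, init containers included, so an injected init container whose requests do not equal its limits demotes a Guaranteed Pod to Burstable. The Go sketch below is an assumption about the usual shape of such a fix, not code from this commit; the package and helper names (inject, buildInjectLibInitContainer) and the resource values are hypothetical.

package inject // illustrative package name, not part of this repository

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// buildInjectLibInitContainer is a hypothetical helper. Giving the injected
// container identical requests and limits keeps the Pod eligible for the
// Guaranteed QoS class it would otherwise lose when the container is added.
func buildInjectLibInitContainer(image string) corev1.Container {
	// MustParse is safe here because the inputs are constant literals.
	res := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("50m"),
		corev1.ResourceMemory: resource.MustParse("64Mi"),
	}
	return corev1.Container{
		Name:      "inject-lib",
		Image:     image,
		Resources: corev1.ResourceRequirements{Requests: res, Limits: res},
	}
}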

8 files changed (+329 -43 lines)

config/samples/dynamic-config.yaml

Lines changed: 256 additions & 19 deletions
@@ -1,23 +1,260 @@
 metricsTTL: 30d
 
 # default to 'influx', influx v2 line protocol
-metricsFormat: json
+metricsFormat: influx
 
-alertRules:
-  - name: GPUTFlopsFull
-    query: |
-      SELECT
-        node,
-        pool,
-        uuid,
-        avg(compute_percentage) AS compute_used
-      FROM tf_gpu_usage
-      WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
-      GROUP BY node, pool, uuid
-    threshold: 97
-    evaluationInterval: 30s
-    consecutiveCount: 4
-    severity: P1
-    summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
-    alertTargetInstance: "{{ .uuid }}"
-    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
+alertRules:
+  # Worker TFlops throttled alert
+  - name: WorkerTFlopsThrottled
+    query: |
+      SELECT workload, worker, uuid, node, MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase
+      FROM tf_worker_usage
+      WHERE {{ .Conditions }}
+      GROUP BY workload, worker, uuid, node
+      HAVING throttled_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 15s
+    consecutiveCount: 3
+    severity: P1
+    summary: "Worker TFlops Throttled"
+    description: "Worker {{ .worker }} from Node {{ .node }} is using more than {{ .Threshold }}% of its TFlops limit"
+    alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Worker VRAM switching too frequent alert
+  - name: WorkerVRAMSwitchCountIncreasing
+    query: |
+      SELECT workload, worker, uuid, node, MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase
+      FROM tf_worker_usage
+      WHERE {{ .Conditions }}
+      GROUP BY workload, worker, uuid, node
+      HAVING switch_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 2m
+    consecutiveCount: 1
+    severity: P1
+    summary: "Worker VRAM Switch Count Increasing"
+    description: "Worker {{ .worker }} from Node {{ .node }} has switched VRAM {{ .switch_increase }} times in last 2 minutes, GPU may be too hot"
+    alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Worker can not scale up/scheduled alert
+  - name: WorkerAllocationFailed
+    query: |
+      SELECT pool, (MAX(total_allocation_fail_cnt) - MIN(total_allocation_fail_cnt)) as failure_increase,
+      FROM tf_system_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING failure_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 30s
+    consecutiveCount: 1
+    severity: P1
+    summary: "Worker allocation failed for GPU Pool {{ .pool }}"
+    description: "Worker allocation failed, {{ .failure_increase }} times in last 30 seconds for GPU Pool {{ .pool }}"
+    alertTargetInstance: "{{ .pool }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Single GPU Alerts
+
+  # GPU VRAM Full Alert
+  - name: GPUVRAMFull
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(memory_percentage) AS memory_used
+      FROM tf_gpu_usage
+      WHERE memory_percentage > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 97
+    evaluationInterval: 30s
+    consecutiveCount: 2
+    severity: P1
+    summary: "GPU VRAM Full, used {{ .memory_used }}% on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s, average usage: {{ .memory_used }}%"
+
+  # GPU TFlops Full Alert
+  - name: GPUTFlopsFull
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(compute_percentage) AS compute_used
+      FROM tf_gpu_usage
+      WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 97
+    evaluationInterval: 30s
+    consecutiveCount: 4
+    severity: P1
+    summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
+
+  # GPU Temperature alert
+  - name: GPUTemperatureHigh
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(temperature) AS avg_temperature
+      FROM tf_gpu_usage
+      WHERE temperature > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 90
+    evaluationInterval: 30s
+    consecutiveCount: 3
+    severity: P1
+    summary: "GPU Temperature High, {{ .avg_temperature }}°C on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} from Node {{ .node }} has temperature above {{ .Threshold }}°C, Average temperature: {{ .avg_temperature }}, GPU Pool: {{ .pool }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # GPU Pool Alerts
+
+  # Node TFlops allocation alert
+  - name: NodeTFlopsAllocationCritical
+    query: |
+      SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P0
+    summary: "Available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: NodeTFlopsAllocationWarning
+    query: |
+      SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  # Pool TFlops allocation alert - Total
+  - name: PoolTotalTFlopsAllocationCritical
+    query: |
+      SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P0
+    summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%"
+    description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  - name: PoolTotalTFlopsAllocationWarning
+    query: |
+      SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%"
+    description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  # Node VRAM allocation alert
+  - name: NodeVRAMAllocationCritical
+    query: |
+      SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: NodeVRAMAllocationWarning
+    query: |
+      SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  # Pool VRAM allocation alert
+  - name: PoolVRAMAllocationWarning
+    query: |
+      SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Pool available VRAM below threshold, remaining {{ .vram_available }}% for {{ .pool }}"
+    description: "Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  # Empty or Idle GPU Alert
+  - name: EmptyGPU
+    query: |
+      SELECT DISTINCT node
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }} AND node NOT IN (
+        SELECT DISTINCT node
+        FROM tf_worker_usage
+        WHERE {{ .Conditions }}
+      )
+    threshold: 0
+    evaluationInterval: 5m
+    consecutiveCount: 2
+    severity: P2
+    summary: "Empty GPU without any workload, Node {{ .node }}"
+    description: "GPU Node {{ .node }} has no workload running, should be decommissioned"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: IdleGPU
+    query: |
+      SELECT node, pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) vram
+      FROM tf_gpu_usage
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool, uuid
+      HAVING compute < 1 and vram < {{ .Threshold }};
+    threshold: 5
+    evaluationInterval: 10m
+    consecutiveCount: 3
+    severity: P2
+    summary: "Idle GPU found: {{ .uuid }} on Node {{ .node }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has been idle for 3 consecutive 10m, compute: {{ .compute }}, vram: {{ .vram }}"
+    alertTargetInstance: "{{ .uuid }}"
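
For context on how these rules are consumed: every query embeds Go template placeholders ({{ .Threshold }}, {{ .Conditions }}) that an alert evaluator fills in on each evaluationInterval tick. The sketch below shows only that text/template expansion for one rule; the evaluator loop, consecutiveCount tracking, and the exact form of Conditions are assumptions rather than code from this commit.

package main

import (
	"fmt"
	"os"
	"text/template"
)

func main() {
	// One of the rule queries above; only the placeholder rendering is shown.
	query := `SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING tflops_available < {{ .Threshold }}`

	tmpl := template.Must(template.New("NodeTFlopsAllocationCritical").Parse(query))

	// The evaluator is assumed to inject a time-window predicate as Conditions
	// on every evaluation tick; the column name here is a placeholder.
	params := map[string]any{
		"Threshold":  5,
		"Conditions": "ts >= now() - 1m",
	}
	if err := tmpl.Execute(os.Stdout, params); err != nil {
		fmt.Fprintln(os.Stderr, "render alert query:", err)
	}
}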

internal/cloudprovider/common/utils.go

Lines changed: 12 additions & 2 deletions
@@ -131,6 +131,16 @@ func CalculateLeastCostGPUNodes(ctx context.Context, provider types.GPUNodeProvi
 
 	nodes := make([]tfv1.GPUNodeClaimSpec, 0, bestNumInstances)
 	for i := int64(0); i < bestNumInstances; i++ {
+
+		tflopsQuantity, err := resource.ParseQuantity(fmt.Sprintf("%f", bestInstance.FP16TFlopsPerGPU*float64(bestInstance.GPUCount)))
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse GPUDeviceOffered: %v", err)
+		}
+
+		vramQuantity, err := resource.ParseQuantity(fmt.Sprintf("%dGi", bestInstance.VRAMGigabytesPerGPU*bestInstance.GPUCount))
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse VRAMOffered: %v", err)
+		}
 		nodes = append(nodes, tfv1.GPUNodeClaimSpec{
 			NodeName: fmt.Sprintf("%s-%s", pool.Name, generateRandomString(8)),
 			InstanceType: bestInstance.InstanceType,
@@ -139,8 +149,8 @@ func CalculateLeastCostGPUNodes(ctx context.Context, provider types.GPUNodeProvi
 			Zone: zone,
 			CapacityType: preferredCapacityType,
 
-			TFlopsOffered: resource.MustParse(fmt.Sprintf("%f", bestInstance.FP16TFlopsPerGPU*float64(bestInstance.GPUCount))),
-			VRAMOffered: resource.MustParse(fmt.Sprintf("%dGi", bestInstance.VRAMGigabytesPerGPU*bestInstance.GPUCount)),
+			TFlopsOffered: tflopsQuantity,
+			VRAMOffered: vramQuantity,
 			GPUDeviceOffered: bestInstance.GPUCount,
 
 			ExtraParams: cluster.Spec.ComputingVendor.Params.ExtraParams,
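
The hunk above replaces resource.MustParse with resource.ParseQuantity so a malformed quantity string comes back as an error the caller can return, instead of a panic inside the node-provisioning path (one of the "potential panic issues" named in the commit message). Below is a minimal standalone sketch of the difference; "NaN" is just an example of a string that fails quantity parsing, for instance the result of formatting a NaN float with %f. The same pattern appears in the Karpenter node-claim builder in the next file.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// resource.MustParse("NaN") would panic here; ParseQuantity reports the
	// problem as an ordinary error the caller can wrap and return.
	q, err := resource.ParseQuantity("NaN")
	if err != nil {
		fmt.Println("parse failed:", err)
		return
	}
	fmt.Println("parsed quantity:", q.String())
}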

internal/cloudprovider/karpenter/nodeclaim.go

Lines changed: 5 additions & 1 deletion
@@ -318,7 +318,11 @@ func (p KarpenterGPUNodeProvider) buildNodeClaim(ctx context.Context, param *tfv
 
 	// Add GPU resources if specified (Karpenter supports nvidia.com/gpu)
 	if param.GPUDeviceOffered > 0 {
-		resourceRequests[karpenterConfig.GPUResourceName] = resource.MustParse(fmt.Sprintf("%d", param.GPUDeviceOffered))
+		quantity, err := resource.ParseQuantity(fmt.Sprintf("%d", param.GPUDeviceOffered))
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse GPUDeviceOffered: %v", err)
+		}
+		resourceRequests[karpenterConfig.GPUResourceName] = quantity
 	}
 
 	// query nodeClass and build NodeClassRef

internal/controller/tensorfusioncluster_controller.go

Lines changed: 0 additions & 3 deletions
@@ -43,7 +43,6 @@ import (
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
 	"github.com/NexusGPU/tensor-fusion/internal/metrics"
 	utils "github.com/NexusGPU/tensor-fusion/internal/utils"
-	corev1 "k8s.io/api/core/v1"
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
@@ -382,7 +381,6 @@ func (r *TensorFusionClusterReconciler) checkTFClusterComponentsReady(ctx contex
 		constants.LabelKeyOwner: tfc.GetName(),
 	}))
 	if err != nil {
-		r.Recorder.Eventf(tfc, corev1.EventTypeWarning, "CheckComponentStatusError", err.Error())
 		return false, nil, fmt.Errorf("failed to list GPUPools: %w", err)
 	}
 	if len(pools.Items) != len(tfc.Spec.GPUPools) {
@@ -411,7 +409,6 @@ func (r *TensorFusionClusterReconciler) updateTFClusterStatus(ctx context.Contex
 		}
 	}
 	if err := r.Status().Update(ctx, tfc); err != nil {
-		r.Recorder.Eventf(tfc, corev1.EventTypeWarning, "UpdateClusterStatusError", err.Error())
 		return err
 	}
 	return nil

internal/controller/tensorfusionworkload_controller.go

Lines changed: 0 additions & 1 deletion
@@ -347,7 +347,6 @@ func (r *TensorFusionWorkloadReconciler) updateStatus(
 		readyCondition.Status = metav1.ConditionFalse
 		readyCondition.Reason = "WorkerFailed"
 		readyCondition.Message = fmt.Sprintf("Failed workers num: %d", failedWorkers)
-		r.Recorder.Eventf(workload, corev1.EventTypeWarning, "WorkerFailed", "Failed workers num: %d", failedWorkers)
 	} else if workload.Spec.IsDynamicReplica() {
 		// for dynamic replicas, if no worker failed, indicate workload is running
 		phase = tfv1.TensorFusionWorkloadPhaseRunning

internal/metrics/encoders/influx.go

Lines changed: 7 additions & 1 deletion
@@ -4,6 +4,7 @@ import (
 	"time"
 
 	metricsProto "github.com/influxdata/line-protocol/v2/lineprotocol"
+	"k8s.io/klog/v2"
 )
 
 // InfluxStrategy implements InfluxDB line protocol encoding
@@ -28,7 +29,12 @@ func (s *InfluxStrategy) AddTag(key, value string) {
 }
 
 func (s *InfluxStrategy) AddField(key string, value any) {
-	s.enc.AddField(key, metricsProto.MustNewValue(value))
+	v, parsed := metricsProto.NewValue(value)
+	if !parsed {
+		klog.Error("metrics influx encoder failed to parse value: ", key, value)
+		return
+	}
+	s.enc.AddField(key, v)
 }
 
 func (s *InfluxStrategy) EndLine(timestamp time.Time) {
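
Same theme in the metrics encoder: lineprotocol.MustNewValue panics when given a field value the line protocol cannot represent, while lineprotocol.NewValue reports that through its second return value so the encoder can log and skip the field. The sketch below illustrates that failure mode with the upstream encoder; the measurement, tag, and field names are illustrative, and the []string field is simply one example of an unsupported type.

package main

import (
	"fmt"
	"time"

	"github.com/influxdata/line-protocol/v2/lineprotocol"
)

func main() {
	var enc lineprotocol.Encoder
	enc.StartLine("tf_worker_usage")
	enc.AddTag("worker", "worker-0")

	fields := map[string]any{
		"compute_percentage": 42.5,               // float64: supported field type
		"labels":             []string{"a", "b"}, // unsupported field type
	}
	for key, raw := range fields {
		v, ok := lineprotocol.NewValue(raw)
		if !ok {
			// Mirrors the new AddField behavior: report and skip the bad field.
			fmt.Printf("skipping field %q: unsupported value %v\n", key, raw)
			continue
		}
		enc.AddField(key, v)
	}
	enc.EndLine(time.Now())

	if err := enc.Err(); err != nil {
		fmt.Println("encode error:", err)
		return
	}
	fmt.Print(string(enc.Bytes()))
}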
