
Commit 0389852

fix: skip gpu limiter not working issue, observability optimize (#350)

* fix: skip gpu limiter not working issue
* fix: avoid k8s QoS side effect for inject lib init container
* fix: potential panic issues
* fix: remove unused event

1 parent 9006e96 commit 0389852
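
Of the eight changed files, only six are rendered below, and the init-container change behind the second bullet is not among them. As background on why it matters: Kubernetes derives a Pod's QoS class from every container, init containers included, so an injected init container whose requests do not equal its limits demotes a Guaranteed Pod to Burstable. The Go sketch below is an assumption about the usual shape of such a fix, not code from this commit; the package and helper names (inject, buildInjectLibInitContainer) and the resource values are hypothetical.

package inject // illustrative package name, not part of this repository

import (
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// buildInjectLibInitContainer is a hypothetical helper. Giving the injected
// container identical requests and limits keeps the Pod eligible for the
// Guaranteed QoS class it would otherwise lose when the container is added.
func buildInjectLibInitContainer(image string) corev1.Container {
	// MustParse is safe here because the inputs are constant literals.
	res := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("50m"),
		corev1.ResourceMemory: resource.MustParse("64Mi"),
	}
	return corev1.Container{
		Name:      "inject-lib",
		Image:     image,
		Resources: corev1.ResourceRequirements{Requests: res, Limits: res},
	}
}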

8 files changed (+329 -43 lines)

config/samples/dynamic-config.yaml

Lines changed: 256 additions & 19 deletions
@@ -1,23 +1,260 @@
 metricsTTL: 30d
 
 # default to 'influx', influx v2 line protocol
-metricsFormat: json
+metricsFormat: influx
 
-alertRules:
-  - name: GPUTFlopsFull
-    query: |
-      SELECT
-        node,
-        pool,
-        uuid,
-        avg(compute_percentage) AS compute_used
-      FROM tf_gpu_usage
-      WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
-      GROUP BY node, pool, uuid
-    threshold: 97
-    evaluationInterval: 30s
-    consecutiveCount: 4
-    severity: P1
-    summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
-    alertTargetInstance: "{{ .uuid }}"
-    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
+alertRules:
+  # Worker TFlops throttled alert
+  - name: WorkerTFlopsThrottled
+    query: |
+      SELECT workload, worker, uuid, node, MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase
+      FROM tf_worker_usage
+      WHERE {{ .Conditions }}
+      GROUP BY workload, worker, uuid, node
+      HAVING throttled_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 15s
+    consecutiveCount: 3
+    severity: P1
+    summary: "Worker TFlops Throttled"
+    description: "Worker {{ .worker }} from Node {{ .node }} is using more than {{ .Threshold }}% of its TFlops limit"
+    alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Worker VRAM switching too frequent alert
+  - name: WorkerVRAMSwitchCountIncreasing
+    query: |
+      SELECT workload, worker, uuid, node, MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase
+      FROM tf_worker_usage
+      WHERE {{ .Conditions }}
+      GROUP BY workload, worker, uuid, node
+      HAVING switch_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 2m
+    consecutiveCount: 1
+    severity: P1
+    summary: "Worker VRAM Switch Count Increasing"
+    description: "Worker {{ .worker }} from Node {{ .node }} has switched VRAM {{ .switch_increase }} times in last 2 minutes, GPU may be too hot"
+    alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Worker can not scale up/scheduled alert
+  - name: WorkerAllocationFailed
+    query: |
+      SELECT pool, (MAX(total_allocation_fail_cnt) - MIN(total_allocation_fail_cnt)) as failure_increase,
+      FROM tf_system_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING failure_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 30s
+    consecutiveCount: 1
+    severity: P1
+    summary: "Worker allocation failed for GPU Pool {{ .pool }}"
+    description: "Worker allocation failed, {{ .failure_increase }} times in last 30 seconds for GPU Pool {{ .pool }}"
+    alertTargetInstance: "{{ .pool }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Single GPU Alerts
+
+  # GPU VRAM Full Alert
+  - name: GPUVRAMFull
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(memory_percentage) AS memory_used
+      FROM tf_gpu_usage
+      WHERE memory_percentage > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 97
+    evaluationInterval: 30s
+    consecutiveCount: 2
+    severity: P1
+    summary: "GPU VRAM Full, used {{ .memory_used }}% on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s, average usage: {{ .memory_used }}%"
+
+  # GPU TFlops Full Alert
+  - name: GPUTFlopsFull
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(compute_percentage) AS compute_used
+      FROM tf_gpu_usage
+      WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 97
+    evaluationInterval: 30s
+    consecutiveCount: 4
+    severity: P1
+    summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
+
+  # GPU Temperature alert
+  - name: GPUTemperatureHigh
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(temperature) AS avg_temperature
+      FROM tf_gpu_usage
+      WHERE temperature > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 90
+    evaluationInterval: 30s
+    consecutiveCount: 3
+    severity: P1
+    summary: "GPU Temperature High, {{ .avg_temperature }}°C on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} from Node {{ .node }} has temperature above {{ .Threshold }}°C, Average temperature: {{ .avg_temperature }}, GPU Pool: {{ .pool }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # GPU Pool Alerts
+
+  # Node TFlops allocation alert
+  - name: NodeTFlopsAllocationCritical
+    query: |
+      SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P0
+    summary: "Available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: NodeTFlopsAllocationWarning
+    query: |
+      SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  # Pool TFlops allocation alert - Total
+  - name: PoolTotalTFlopsAllocationCritical
+    query: |
+      SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P0
+    summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%"
+    description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  - name: PoolTotalTFlopsAllocationWarning
+    query: |
+      SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%"
+    description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  # Node VRAM allocation alert
+  - name: NodeVRAMAllocationCritical
+    query: |
+      SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: NodeVRAMAllocationWarning
+    query: |
+      SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  # Pool VRAM allocation alert
+  - name: PoolVRAMAllocationWarning
+    query: |
+      SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Pool available VRAM below threshold, remaining {{ .vram_available }}% for {{ .pool }}"
+    description: "Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  # Empty or Idle GPU Alert
+  - name: EmptyGPU
+    query: |
+      SELECT DISTINCT node
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }} AND node NOT IN (
+        SELECT DISTINCT node
+        FROM tf_worker_usage
+        WHERE {{ .Conditions }}
+      )
+    threshold: 0
+    evaluationInterval: 5m
+    consecutiveCount: 2
+    severity: P2
+    summary: "Empty GPU without any workload, Node {{ .node }}"
+    description: "GPU Node {{ .node }} has no workload running, should be decommissioned"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: IdleGPU
+    query: |
+      SELECT node, pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) vram
+      FROM tf_gpu_usage
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool, uuid
+      HAVING compute < 1 and vram < {{ .Threshold }};
+    threshold: 5
+    evaluationInterval: 10m
+    consecutiveCount: 3
+    severity: P2
+    summary: "Idle GPU found: {{ .uuid }} on Node {{ .node }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has been idle for 3 consecutive 10m, compute: {{ .compute }}, vram: {{ .vram }}"
+    alertTargetInstance: "{{ .uuid }}"
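
For context on how these rules are consumed: every query embeds Go template placeholders ({{ .Threshold }}, {{ .Conditions }}) that an alert evaluator fills in on each evaluationInterval tick. The sketch below shows only that text/template expansion for one rule; the evaluator loop, consecutiveCount tracking, and the exact form of Conditions are assumptions rather than code from this commit.

package main

import (
	"fmt"
	"os"
	"text/template"
)

func main() {
	// One of the rule queries above; only the placeholder rendering is shown.
	query := `SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_metrics
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING tflops_available < {{ .Threshold }}`

	tmpl := template.Must(template.New("NodeTFlopsAllocationCritical").Parse(query))

	// The evaluator is assumed to inject a time-window predicate as Conditions
	// on every evaluation tick; the column name here is a placeholder.
	params := map[string]any{
		"Threshold":  5,
		"Conditions": "ts >= now() - 1m",
	}
	if err := tmpl.Execute(os.Stdout, params); err != nil {
		fmt.Fprintln(os.Stderr, "render alert query:", err)
	}
}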

internal/cloudprovider/common/utils.go

Lines changed: 12 additions & 2 deletions
@@ -131,6 +131,16 @@ func CalculateLeastCostGPUNodes(ctx context.Context, provider types.GPUNodeProvi
 
 	nodes := make([]tfv1.GPUNodeClaimSpec, 0, bestNumInstances)
 	for i := int64(0); i < bestNumInstances; i++ {
+
+		tflopsQuantity, err := resource.ParseQuantity(fmt.Sprintf("%f", bestInstance.FP16TFlopsPerGPU*float64(bestInstance.GPUCount)))
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse GPUDeviceOffered: %v", err)
+		}
+
+		vramQuantity, err := resource.ParseQuantity(fmt.Sprintf("%dGi", bestInstance.VRAMGigabytesPerGPU*bestInstance.GPUCount))
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse VRAMOffered: %v", err)
+		}
 		nodes = append(nodes, tfv1.GPUNodeClaimSpec{
 			NodeName: fmt.Sprintf("%s-%s", pool.Name, generateRandomString(8)),
 			InstanceType: bestInstance.InstanceType,
@@ -139,8 +149,8 @@ func CalculateLeastCostGPUNodes(ctx context.Context, provider types.GPUNodeProvi
 			Zone: zone,
 			CapacityType: preferredCapacityType,
 
-			TFlopsOffered: resource.MustParse(fmt.Sprintf("%f", bestInstance.FP16TFlopsPerGPU*float64(bestInstance.GPUCount))),
-			VRAMOffered: resource.MustParse(fmt.Sprintf("%dGi", bestInstance.VRAMGigabytesPerGPU*bestInstance.GPUCount)),
+			TFlopsOffered: tflopsQuantity,
+			VRAMOffered: vramQuantity,
 			GPUDeviceOffered: bestInstance.GPUCount,
 
 			ExtraParams: cluster.Spec.ComputingVendor.Params.ExtraParams,
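
The hunk above replaces resource.MustParse with resource.ParseQuantity so a malformed quantity string comes back as an error the caller can return, instead of a panic inside the node-provisioning path (one of the "potential panic issues" named in the commit message). Below is a minimal standalone sketch of the difference; "NaN" is just an example of a string that fails quantity parsing, for instance the result of formatting a NaN float with %f. The same pattern appears in the Karpenter node-claim builder in the next file.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// resource.MustParse("NaN") would panic here; ParseQuantity reports the
	// problem as an ordinary error the caller can wrap and return.
	q, err := resource.ParseQuantity("NaN")
	if err != nil {
		fmt.Println("parse failed:", err)
		return
	}
	fmt.Println("parsed quantity:", q.String())
}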

internal/cloudprovider/karpenter/nodeclaim.go

Lines changed: 5 additions & 1 deletion
@@ -318,7 +318,11 @@ func (p KarpenterGPUNodeProvider) buildNodeClaim(ctx context.Context, param *tfv
 
 	// Add GPU resources if specified (Karpenter supports nvidia.com/gpu)
 	if param.GPUDeviceOffered > 0 {
-		resourceRequests[karpenterConfig.GPUResourceName] = resource.MustParse(fmt.Sprintf("%d", param.GPUDeviceOffered))
+		quantity, err := resource.ParseQuantity(fmt.Sprintf("%d", param.GPUDeviceOffered))
+		if err != nil {
+			return nil, fmt.Errorf("failed to parse GPUDeviceOffered: %v", err)
+		}
+		resourceRequests[karpenterConfig.GPUResourceName] = quantity
 	}
 
 	// query nodeClass and build NodeClassRef

internal/controller/tensorfusioncluster_controller.go

Lines changed: 0 additions & 3 deletions
@@ -43,7 +43,6 @@ import (
 	"github.com/NexusGPU/tensor-fusion/internal/constants"
 	"github.com/NexusGPU/tensor-fusion/internal/metrics"
 	utils "github.com/NexusGPU/tensor-fusion/internal/utils"
-	corev1 "k8s.io/api/core/v1"
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
@@ -382,7 +381,6 @@ func (r *TensorFusionClusterReconciler) checkTFClusterComponentsReady(ctx contex
 		constants.LabelKeyOwner: tfc.GetName(),
 	}))
 	if err != nil {
-		r.Recorder.Eventf(tfc, corev1.EventTypeWarning, "CheckComponentStatusError", err.Error())
 		return false, nil, fmt.Errorf("failed to list GPUPools: %w", err)
 	}
 	if len(pools.Items) != len(tfc.Spec.GPUPools) {
@@ -411,7 +409,6 @@ func (r *TensorFusionClusterReconciler) updateTFClusterStatus(ctx context.Contex
 		}
 	}
 	if err := r.Status().Update(ctx, tfc); err != nil {
-		r.Recorder.Eventf(tfc, corev1.EventTypeWarning, "UpdateClusterStatusError", err.Error())
 		return err
 	}
 	return nil

internal/controller/tensorfusionworkload_controller.go

Lines changed: 0 additions & 1 deletion
@@ -347,7 +347,6 @@ func (r *TensorFusionWorkloadReconciler) updateStatus(
 		readyCondition.Status = metav1.ConditionFalse
 		readyCondition.Reason = "WorkerFailed"
 		readyCondition.Message = fmt.Sprintf("Failed workers num: %d", failedWorkers)
-		r.Recorder.Eventf(workload, corev1.EventTypeWarning, "WorkerFailed", "Failed workers num: %d", failedWorkers)
 	} else if workload.Spec.IsDynamicReplica() {
 		// for dynamic replicas, if no worker failed, indicate workload is running
 		phase = tfv1.TensorFusionWorkloadPhaseRunning

internal/metrics/encoders/influx.go

Lines changed: 7 additions & 1 deletion
@@ -4,6 +4,7 @@ import (
 	"time"
 
 	metricsProto "github.com/influxdata/line-protocol/v2/lineprotocol"
+	"k8s.io/klog/v2"
 )
 
 // InfluxStrategy implements InfluxDB line protocol encoding
@@ -28,7 +29,12 @@ func (s *InfluxStrategy) AddTag(key, value string) {
 }
 
 func (s *InfluxStrategy) AddField(key string, value any) {
-	s.enc.AddField(key, metricsProto.MustNewValue(value))
+	v, parsed := metricsProto.NewValue(value)
+	if !parsed {
+		klog.Error("metrics influx encoder failed to parse value: ", key, value)
+		return
+	}
+	s.enc.AddField(key, v)
 }
 
 func (s *InfluxStrategy) EndLine(timestamp time.Time) {
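
Same theme in the metrics encoder: lineprotocol.MustNewValue panics when given a field value the line protocol cannot represent, while lineprotocol.NewValue reports that through its second return value so the encoder can log and skip the field. The sketch below illustrates that failure mode with the upstream encoder; the measurement, tag, and field names are illustrative, and the []string field is simply one example of an unsupported type.

package main

import (
	"fmt"
	"time"

	"github.com/influxdata/line-protocol/v2/lineprotocol"
)

func main() {
	var enc lineprotocol.Encoder
	enc.StartLine("tf_worker_usage")
	enc.AddTag("worker", "worker-0")

	fields := map[string]any{
		"compute_percentage": 42.5,               // float64: supported field type
		"labels":             []string{"a", "b"}, // unsupported field type
	}
	for key, raw := range fields {
		v, ok := lineprotocol.NewValue(raw)
		if !ok {
			// Mirrors the new AddField behavior: report and skip the bad field.
			fmt.Printf("skipping field %q: unsupported value %v\n", key, raw)
			continue
		}
		enc.AddField(key, v)
	}
	enc.EndLine(time.Now())

	if err := enc.Err(); err != nil {
		fmt.Println("encode error:", err)
		return
	}
	fmt.Print(string(enc.Bytes()))
}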
