 metricsTTL: 30d

 # default to 'influx', influx v2 line protocol
-metricsFormat: json
+metricsFormat: influx

-alertRules:
-- name: GPUTFlopsFull
-  query: |
-    SELECT
-      node,
-      pool,
-      uuid,
-      avg(compute_percentage) AS compute_used
-    FROM tf_gpu_usage
-    WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
-    GROUP BY node, pool, uuid
-  threshold: 97
-  evaluationInterval: 30s
-  consecutiveCount: 4
-  severity: P1
-  summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
-  alertTargetInstance: "{{ .uuid }}"
-  description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
+alertRules:
+  # Worker TFlops throttled alert
+  - name: WorkerTFlopsThrottled
+    query: |
+      SELECT workload, worker, uuid, node, MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase
+      FROM tf_worker_usage
+      WHERE {{ .Conditions }}
+      GROUP BY workload, worker, uuid, node
+      HAVING throttled_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 15s
+    consecutiveCount: 3
+    severity: P1
+    summary: "Worker TFlops Throttled"
+    description: "Worker {{ .worker }} from Node {{ .node }} exceeded its TFlops limit and was throttled {{ .throttled_increase }} times in the last 15 seconds"
+    alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Worker VRAM switching too frequently alert
+  - name: WorkerVRAMSwitchCountIncreasing
+    query: |
+      SELECT workload, worker, uuid, node, MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase
+      FROM tf_worker_usage
+      WHERE {{ .Conditions }}
+      GROUP BY workload, worker, uuid, node
+      HAVING switch_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 2m
+    consecutiveCount: 1
+    severity: P1
+    summary: "Worker VRAM Switch Count Increasing"
+    description: "Worker {{ .worker }} from Node {{ .node }} has switched VRAM {{ .switch_increase }} times in the last 2 minutes; the GPU may be too hot"
+    alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Worker cannot scale up / be scheduled alert
+  - name: WorkerAllocationFailed
+    query: |
+      SELECT pool, (MAX(total_allocation_fail_cnt) - MIN(total_allocation_fail_cnt)) as failure_increase
+      FROM tf_system_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING failure_increase > {{ .Threshold }}
+    threshold: 0
+    evaluationInterval: 30s
+    consecutiveCount: 1
+    severity: P1
+    summary: "Worker allocation failed for GPU Pool {{ .pool }}"
+    description: "Worker allocation failed {{ .failure_increase }} times in the last 30 seconds for GPU Pool {{ .pool }}"
+    alertTargetInstance: "{{ .pool }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # Single GPU Alerts
+
+  # GPU VRAM Full Alert
+  - name: GPUVRAMFull
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(memory_percentage) AS memory_used
+      FROM tf_gpu_usage
+      WHERE memory_percentage > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 97
+    evaluationInterval: 30s
+    consecutiveCount: 2
+    severity: P1
+    summary: "GPU VRAM Full, used {{ .memory_used }}% on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s intervals, average usage: {{ .memory_used }}%"
+
+  # GPU TFlops Full Alert
+  - name: GPUTFlopsFull
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(compute_percentage) AS compute_used
+      FROM tf_gpu_usage
+      WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 97
+    evaluationInterval: 30s
+    consecutiveCount: 4
+    severity: P1
+    summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s intervals, average usage: {{ .compute_used }}%"
+
+  # GPU Temperature alert
+  - name: GPUTemperatureHigh
+    query: |
+      SELECT
+        node,
+        pool,
+        uuid,
+        avg(temperature) AS avg_temperature
+      FROM tf_gpu_usage
+      WHERE temperature > {{ .Threshold }} AND {{ .Conditions }}
+      GROUP BY node, pool, uuid
+    threshold: 90
+    evaluationInterval: 30s
+    consecutiveCount: 3
+    severity: P1
+    summary: "GPU Temperature High, {{ .avg_temperature }}°C on {{ .node }} {{ .uuid }}"
+    alertTargetInstance: "{{ .uuid }}"
+    description: "GPU {{ .uuid }} from Node {{ .node }} has temperature above {{ .Threshold }}°C, average temperature: {{ .avg_temperature }}°C, GPU Pool: {{ .pool }}"
+    runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
+
+  # GPU Pool Alerts
+
+  # Node TFlops allocation alert
+  - name: NodeTFlopsAllocationCritical
+    query: |
+      SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P0
+    summary: "Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: NodeTFlopsAllocationWarning
+    query: |
+      SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  # Pool TFlops allocation alert - Total
+  - name: PoolTotalTFlopsAllocationCritical
+    query: |
+      SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P0
+    summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%"
+    description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  - name: PoolTotalTFlopsAllocationWarning
+    query: |
+      SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING tflops_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Pool available TFlops below threshold, remaining {{ .tflops_available }}%"
+    description: "Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  # Node VRAM allocation alert
+  - name: NodeVRAMAllocationCritical
+    query: |
+      SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P0
+    summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: NodeVRAMAllocationWarning
+    query: |
+      SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
+    description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .node }}"
+
+  # Pool VRAM allocation alert
+  - name: PoolVRAMAllocationWarning
+    query: |
+      SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }}
+      GROUP BY pool
+      HAVING vram_available < {{ .Threshold }}
+    threshold: 10
+    evaluationInterval: 1m
+    consecutiveCount: 2
+    severity: P1
+    summary: "Pool available VRAM below threshold, remaining {{ .vram_available }}% for {{ .pool }}"
+    description: "Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+    alertTargetInstance: "{{ .pool }}"
+
+  # Empty or Idle GPU Alert
+  - name: EmptyGPU
+    query: |
+      SELECT DISTINCT node
+      FROM tf_node_metrics
+      WHERE {{ .Conditions }} AND node NOT IN (
+          SELECT DISTINCT node
+          FROM tf_worker_usage
+          WHERE {{ .Conditions }}
+      )
+    threshold: 0
+    evaluationInterval: 5m
+    consecutiveCount: 2
+    severity: P2
+    summary: "Empty GPU without any workload, Node {{ .node }}"
+    description: "GPU Node {{ .node }} has no workloads running and should be decommissioned"
+    alertTargetInstance: "{{ .node }}"
+
+  - name: IdleGPU
+    query: |
+      SELECT node, pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) as vram
+      FROM tf_gpu_usage
+      WHERE {{ .Conditions }}
+      GROUP BY node, pool, uuid
+      HAVING compute < 1 and vram < {{ .Threshold }}
+    threshold: 5
+    evaluationInterval: 10m
+    consecutiveCount: 3
+    severity: P2
+    summary: "Idle GPU found: {{ .uuid }} on Node {{ .node }}"
+    description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has been idle for 3 consecutive 10m intervals, compute: {{ .compute }}%, vram: {{ .vram }}%"
+    alertTargetInstance: "{{ .uuid }}"
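
Note on the placeholders: {{ .Threshold }} and {{ .Conditions }} in the queries above use Go template syntax. The snippet below is a minimal, hypothetical sketch of how one of these rules could be rendered into executable SQL with Go's text/template; the alertRule type, its field names, and the example Conditions string are illustrative assumptions, not the actual tensor-fusion operator types.

package main

import (
    "fmt"
    "os"
    "text/template"
)

// alertRule mirrors only the template fields referenced by the queries above
// ({{ .Threshold }} and {{ .Conditions }}); the operator's real schema may differ.
type alertRule struct {
    Name       string
    Query      string
    Threshold  float64
    Conditions string
}

func main() {
    rule := alertRule{
        Name: "GPUVRAMFull",
        Query: `SELECT node, pool, uuid, avg(memory_percentage) AS memory_used
FROM tf_gpu_usage
WHERE memory_percentage > {{ .Threshold }} AND {{ .Conditions }}
GROUP BY node, pool, uuid`,
        Threshold: 97,
        // Hypothetical time-window condition; the real value is injected by the alert evaluator.
        Conditions: "ts >= now() - INTERVAL 30 SECOND",
    }

    tmpl, err := template.New(rule.Name).Parse(rule.Query)
    if err != nil {
        fmt.Fprintln(os.Stderr, "parse error:", err)
        os.Exit(1)
    }
    // Render the final SQL that would be sent to the metrics store.
    if err := tmpl.Execute(os.Stdout, rule); err != nil {
        fmt.Fprintln(os.Stderr, "render error:", err)
        os.Exit(1)
    }
    fmt.Println()
}

Running this prints the GPUVRAMFull query with 97 and the sample time-window condition substituted in, which is the shape of SQL the evaluationInterval/consecutiveCount settings would then be applied to.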