@@ -230,34 +230,34 @@ dynamicConfig:
230230 # Worker TFlops throttled alert
231231 - name : WorkerTFlopsThrottled
232232 query : |
233- SELECT workload, worker, uuid, node_name , MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase
233+ SELECT workload, worker, uuid, node , MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase
234234 FROM tf_worker_usage
235235 WHERE {{ .Conditions }}
236- GROUP BY workload, worker, uuid, node_name
236+ GROUP BY workload, worker, uuid, node
237237 HAVING throttled_increase > {{ .Threshold }}
238238 threshold : 0
239239 evaluationInterval : 15s
240240 consecutiveCount : 3
241241 severity : P1
242242 summary : " Worker TFlops Throttled"
243- description : " Worker {{ .worker }} from Node {{ .node_name }} is using more than {{ .Threshold }}% of its TFlops limit"
243+ description : " Worker {{ .worker }} from Node {{ .node }} is using more than {{ .Threshold }}% of its TFlops limit"
244244 alertTargetInstance : " {{ .worker }}-{{ .uuid }}"
245245 runbookURL : " https://tensor-fusion.ai/guide/troubleshooting/handbook"
246246
247247 # Worker VRAM switching too frequent alert
248248 - name : WorkerVRAMSwitchCountIncreasing
249249 query : |
250- SELECT workload, worker, uuid, node_name , MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase
250+ SELECT workload, worker, uuid, node , MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase
251251 FROM tf_worker_usage
252252 WHERE {{ .Conditions }}
253- GROUP BY workload, worker, uuid, node_name
253+ GROUP BY workload, worker, uuid, node
254254 HAVING switch_increase > {{ .Threshold }}
255255 threshold : 0
256256 evaluationInterval : 2m
257257 consecutiveCount : 1
258258 severity : P1
259259 summary : " Worker VRAM Switch Count Increasing"
260- description : " Worker {{ .worker }} from Node {{ .node_name }} has switched VRAM {{ .switch_increase }} times in last 2 minutes, GPU may be too hot"
260+ description : " Worker {{ .worker }} from Node {{ .node }} has switched VRAM {{ .switch_increase }} times in last 2 minutes, GPU may be too hot"
261261 alertTargetInstance : " {{ .worker }}-{{ .uuid }}"
262262 runbookURL : " https://tensor-fusion.ai/guide/troubleshooting/handbook"
263263
@@ -284,92 +284,92 @@ dynamicConfig:
284284 - name : GPUVRAMFull
285285 query : |
286286 SELECT
287- node_name ,
287+ node ,
288288 pool,
289289 uuid,
290290 avg(memory_percentage) AS memory_used
291291 FROM tf_gpu_usage
292292 WHERE memory_percentage > {{ .Threshold }} AND {{ .Conditions }}
293- GROUP BY node_name , pool, uuid
293+ GROUP BY node , pool, uuid
294294 threshold : 97
295295 evaluationInterval : 30s
296296 consecutiveCount : 2
297297 severity : P1
298- summary : " GPU VRAM Full, used {{ .memory_used }}% on {{ .node_name }} {{ .uuid }}"
298+ summary : " GPU VRAM Full, used {{ .memory_used }}% on {{ .node }} {{ .uuid }}"
299299 alertTargetInstance : " {{ .uuid }}"
300- description : " GPU {{ .uuid }} on Node {{ .node_name }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s, average usage: {{ .memory_used }}%"
300+ description : " GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s, average usage: {{ .memory_used }}%"
301301
302302 # GPU TFlops Full Alert
303303 - name : GPUTFlopsFull
304304 query : |
305305 SELECT
306- node_name ,
306+ node ,
307307 pool,
308308 uuid,
309309 avg(compute_percentage) AS compute_used
310310 FROM tf_gpu_usage
311311 WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
312- GROUP BY node_name , pool, uuid
312+ GROUP BY node , pool, uuid
313313 threshold : 97
314314 evaluationInterval : 30s
315315 consecutiveCount : 4
316316 severity : P1
317- summary : " GPU TFlops Full, used {{ .compute_used }}% on {{ .node_name }} {{ .uuid }}"
317+ summary : " GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
318318 alertTargetInstance : " {{ .uuid }}"
319- description : " GPU {{ .uuid }} on Node {{ .node_name }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
319+ description : " GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
320320
321321 # GPU Temperature alert
322322 - name : GPUTemperatureHigh
323323 query : |
324324 SELECT
325- node_name ,
325+ node ,
326326 pool,
327327 uuid,
328328 avg(temperature) AS avg_temperature
329329 FROM tf_gpu_usage
330330 WHERE temperature > {{ .Threshold }} AND {{ .Conditions }}
331- GROUP BY node_name , pool, uuid
331+ GROUP BY node , pool, uuid
332332 threshold : 90
333333 evaluationInterval : 30s
334334 consecutiveCount : 3
335335 severity : P1
336- summary : " GPU Temperature High, {{ .avg_temperature }}°C on {{ .node_name }} {{ .uuid }}"
336+ summary : " GPU Temperature High, {{ .avg_temperature }}°C on {{ .node }} {{ .uuid }}"
337337 alertTargetInstance : " {{ .uuid }}"
338- description : " GPU {{ .uuid }} from Node {{ .node_name }} has temperature above {{ .Threshold }}°C, Average temperature: {{ .avg_temperature }}, GPU Pool: {{ .pool }}"
338+ description : " GPU {{ .uuid }} from Node {{ .node }} has temperature above {{ .Threshold }}°C, Average temperature: {{ .avg_temperature }}, GPU Pool: {{ .pool }}"
339339 runbookURL : " https://tensor-fusion.ai/guide/troubleshooting/handbook"
340340
341341 # GPU Pool Alerts
342342
343343 # Node TFlops allocation alert
344344 - name : NodeTFlopsAllocationCritical
345345 query : |
346- SELECT node_name , pool, (100 - avg(allocated_tflops_percent)) as tflops_available
346+ SELECT node , pool, (100 - avg(allocated_tflops_percent)) as tflops_available
347347 FROM tf_node_resources
348348 WHERE {{ .Conditions }}
349- GROUP BY node_name , pool
349+ GROUP BY node , pool
350350 HAVING tflops_available < {{ .Threshold }}
351351 threshold : 5
352352 evaluationInterval : 1m
353353 consecutiveCount : 2
354354 severity : P0
355- summary : " Available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node_name }}"
356- description : " Node {{ .node_name }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
357- alertTargetInstance : " {{ .node_name }}"
355+ summary : " Available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
356+ description : " Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
357+ alertTargetInstance : " {{ .node }}"
358358
359359 - name : NodeTFlopsAllocationWarning
360360 query : |
361- SELECT node_name , pool, (100 - avg(allocated_tflops_percent)) as tflops_available
361+ SELECT node , pool, (100 - avg(allocated_tflops_percent)) as tflops_available
362362 FROM tf_node_resources
363363 WHERE {{ .Conditions }}
364- GROUP BY node_name , pool
364+ GROUP BY node , pool
365365 HAVING tflops_available < {{ .Threshold }}
366366 threshold : 10
367367 evaluationInterval : 1m
368368 consecutiveCount : 2
369369 severity : P1
370- summary : " Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node_name }}"
371- description : " Node {{ .node_name }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
372- alertTargetInstance : " {{ .node_name }}"
370+ summary : " Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
371+ description : " Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
372+ alertTargetInstance : " {{ .node }}"
373373
374374 # Pool TFlops allocation alert - Total
375375 - name : PoolTotalTFlopsAllocationCritical
@@ -405,33 +405,33 @@ dynamicConfig:
405405 # Node VRAM allocation alert
406406 - name : NodeVRAMAllocationCritical
407407 query : |
408- SELECT node_name , pool, (100 - avg(allocated_vram_percent)) as vram_available
408+ SELECT node , pool, (100 - avg(allocated_vram_percent)) as vram_available
409409 FROM tf_node_resources
410410 WHERE {{ .Conditions }}
411- GROUP BY node_name , pool
411+ GROUP BY node , pool
412412 HAVING vram_available < {{ .Threshold }}
413413 threshold : 5
414414 evaluationInterval : 1m
415415 consecutiveCount : 2
416416 severity : P1
417- summary : " Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node_name }}"
418- description : " Node {{ .node_name }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
419- alertTargetInstance : " {{ .node_name }}"
417+ summary : " Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
418+ description : " Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
419+ alertTargetInstance : " {{ .node }}"
420420
421421 - name : NodeVRAMAllocationWarning
422422 query : |
423- SELECT node_name , pool, (100 - avg(allocated_vram_percent)) as vram_available
423+ SELECT node , pool, (100 - avg(allocated_vram_percent)) as vram_available
424424 FROM tf_node_resources
425425 WHERE {{ .Conditions }}
426- GROUP BY node_name , pool
426+ GROUP BY node , pool
427427 HAVING vram_available < {{ .Threshold }}
428428 threshold : 10
429429 evaluationInterval : 1m
430430 consecutiveCount : 2
431431 severity : P1
432- summary : " Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node_name }}"
433- description : " Node {{ .node_name }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
434- alertTargetInstance : " {{ .node_name }}"
432+ summary : " Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
433+ description : " Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
434+ alertTargetInstance : " {{ .node }}"
435435
436436 # Pool VRAM allocation alert
437437 - name : PoolVRAMAllocationWarning
@@ -452,32 +452,32 @@ dynamicConfig:
452452 # Empty or Idle GPU Alert
453453 - name : EmptyGPU
454454 query : |
455- SELECT DISTINCT node_name
455+ SELECT DISTINCT node
456456 FROM tf_node_resources
457- WHERE {{ .Conditions }} AND node_name NOT IN (
458- SELECT DISTINCT node_name
457+ WHERE {{ .Conditions }} AND node NOT IN (
458+ SELECT DISTINCT node
459459 FROM tf_worker_usage
460460 WHERE {{ .Conditions }}
461461 )
462462 threshold : 0
463463 evaluationInterval : 5m
464464 consecutiveCount : 2
465465 severity : P2
466- summary : " Empty GPU without any workload, Node {{ .node_name }}"
467- description : " GPU Node {{ .node_name }} has no workload running, should be decommissioned"
468- alertTargetInstance : " {{ .node_name }}"
466+ summary : " Empty GPU without any workload, Node {{ .node }}"
467+ description : " GPU Node {{ .node }} has no workload running, should be decommissioned"
468+ alertTargetInstance : " {{ .node }}"
469469
470470 - name : IdleGPU
471471 query : |
472- SELECT node_name , pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) vram
472+ SELECT node , pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) vram
473473 FROM tf_gpu_usage
474474 WHERE {{ .Conditions }}
475- GROUP BY node_name , pool, uuid
475+ GROUP BY node , pool, uuid
476476 HAVING compute < 1 and vram < {{ .Threshold }};
477477 threshold : 5
478478 evaluationInterval : 10m
479479 consecutiveCount : 3
480480 severity : P2
481- summary : " Idle GPU found: {{ .uuid }} on Node {{ .node_name }}"
482- description : " GPU {{ .uuid }} on Node {{ .node_name }} in Pool {{ .pool }} has been idle for 3 consecutive 10m, compute: {{ .compute }}, vram: {{ .vram }}"
481+ summary : " Idle GPU found: {{ .uuid }} on Node {{ .node }}"
482+ description : " GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has been idle for 3 consecutive 10m, compute: {{ .compute }}, vram: {{ .vram }}"
483483 alertTargetInstance : " {{ .uuid }}"
0 commit comments