@@ -148,6 +148,10 @@ alert:
148148 limits :
149149 memory : 1Gi
150150 cpu : 1500m
151+ persistence :
152+ enabled : false
153+ # storageClass: "gp3"
154+ # size: 10Gi
151155 alertManagerConfig :
152156 global : {}
153157 receivers :
@@ -347,7 +351,7 @@ dynamicConfig:
347351 - name : NodeTFlopsAllocationCritical
348352 query : |
349353 SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
350- FROM tf_node_resources
354+ FROM tf_node_metrics
351355 WHERE {{ .Conditions }}
352356 GROUP BY node, pool
353357 HAVING tflops_available < {{ .Threshold }}
@@ -362,7 +366,7 @@ dynamicConfig:
362366 - name : NodeTFlopsAllocationWarning
363367 query : |
364368 SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
365- FROM tf_node_resources
369+ FROM tf_node_metrics
366370 WHERE {{ .Conditions }}
367371 GROUP BY node, pool
368372 HAVING tflops_available < {{ .Threshold }}
@@ -378,7 +382,7 @@ dynamicConfig:
378382 - name : PoolTotalTFlopsAllocationCritical
379383 query : |
380384 SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
381- FROM tf_node_resources
385+ FROM tf_node_metrics
382386 WHERE {{ .Conditions }}
383387 GROUP BY pool
384388 HAVING tflops_available < {{ .Threshold }}
@@ -393,7 +397,7 @@ dynamicConfig:
393397 - name : PoolTotalTFlopsAllocationWarning
394398 query : |
395399 SELECT pool, (100 - avg(allocated_tflops_percent)) as tflops_available
396- FROM tf_node_resources
400+ FROM tf_node_metrics
397401 WHERE {{ .Conditions }}
398402 GROUP BY pool
399403 HAVING tflops_available < {{ .Threshold }}
@@ -409,7 +413,7 @@ dynamicConfig:
409413 - name : NodeVRAMAllocationCritical
410414 query : |
411415 SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
412- FROM tf_node_resources
416+ FROM tf_node_metrics
413417 WHERE {{ .Conditions }}
414418 GROUP BY node, pool
415419 HAVING vram_available < {{ .Threshold }}
@@ -424,7 +428,7 @@ dynamicConfig:
424428 - name : NodeVRAMAllocationWarning
425429 query : |
426430 SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
427- FROM tf_node_resources
431+ FROM tf_node_metrics
428432 WHERE {{ .Conditions }}
429433 GROUP BY node, pool
430434 HAVING vram_available < {{ .Threshold }}
@@ -440,7 +444,7 @@ dynamicConfig:
440444 - name : PoolVRAMAllocationWarning
441445 query : |
442446 SELECT pool, (100 - avg(allocated_vram_percent)) as vram_available
443- FROM tf_node_resources
447+ FROM tf_node_metrics
444448 WHERE {{ .Conditions }}
445449 GROUP BY pool
446450 HAVING vram_available < {{ .Threshold }}
@@ -456,7 +460,7 @@ dynamicConfig:
456460 - name : EmptyGPU
457461 query : |
458462 SELECT DISTINCT node
459- FROM tf_node_resources
463+ FROM tf_node_metrics
460464 WHERE {{ .Conditions }} AND node NOT IN (
461465 SELECT DISTINCT node
462466 FROM tf_worker_usage
0 commit comments