
Commit 990eaef

fix: support allocation debug and simulate, allocator mem wrong state bug (#274)
* fix: support allocation debug and simulate, allocator mem wrong state bug
* fix: gpu deallocator bug caused by annotation mismatch
1 parent 634e1ff commit 990eaef

21 files changed: +363 -150 lines changed


.vscode/launch.json

Lines changed: 2 additions & 3 deletions
@@ -60,14 +60,13 @@
       "env": {
         "KUBECONFIG": "~/.kube/config-local-studio",
         "ENABLE_WEBHOOKS": "false",
-        "ENABLE_SCHEDULER": "false"
+        "ENABLE_SCHEDULER": "true",
+        "ENABLE_CR_CONTROLLER": "true"
       },
       "args": [
         "--gpu-info-config", "${workspaceFolder}/config/samples/gpu-info-config.yaml",
         "--dynamic-config", "${workspaceFolder}/config/samples/dynamic-config.yaml",
         "--scheduler-config", "${workspaceFolder}/config/samples/scheduler-config.yaml",
-        "--enable-alert",
-        "--enable-auto-scale"
       ],
       "program": "${workspaceFolder}/cmd/main.go",
     },
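The launch profile now starts the scheduler and the custom-resource controller when debugging locally. Below is a minimal sketch of how environment flags like these could gate optional components at startup; only the ENABLE_SCHEDULER and ENABLE_CR_CONTROLLER names come from the config above, while the envBool helper and the commented start calls are hypothetical stand-ins, not the operator's actual cmd/main.go.

```go
// Sketch: gating optional components on the env vars set in launch.json.
// The start functions referenced in comments are hypothetical placeholders.
package main

import (
	"log"
	"os"
	"strconv"
)

// envBool reads a boolean flag such as "true"/"false"; missing or
// unparsable values fall back to the given default.
func envBool(key string, def bool) bool {
	v, ok := os.LookupEnv(key)
	if !ok {
		return def
	}
	b, err := strconv.ParseBool(v)
	if err != nil {
		return def
	}
	return b
}

func main() {
	if envBool("ENABLE_SCHEDULER", false) {
		log.Println("scheduler enabled")
		// startScheduler() (hypothetical)
	}
	if envBool("ENABLE_CR_CONTROLLER", false) {
		log.Println("custom-resource controller enabled")
		// startCRController() (hypothetical)
	}
}
```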

.vscode/settings.json

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@
     "goerrors",
     "golint",
     "Gomega",
+    "gonic",
     "gopsutil",
     "gorm",
     "gosec",

api/v1/gpuresourcequota_types.go

Lines changed: 8 additions & 0 deletions
@@ -188,6 +188,14 @@ type AllocRequest struct {
 	PodMeta metav1.ObjectMeta
 }
 
+type GPUAllocationInfo struct {
+	Request   Resource `json:"request,omitempty"`
+	Limit     Resource `json:"limit,omitempty"`
+	PodName   string   `json:"podName,omitempty"`
+	PodUID    string   `json:"podUID,omitempty"`
+	Namespace string   `json:"namespace,omitempty"`
+}
+
 type AdjustRequest struct {
 	PodUID    string
 	IsScaleUp bool
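GPUAllocationInfo records the request, limit, and pod identity for a single allocation, which is the kind of snapshot the allocation debug and simulate support needs when inspecting allocator state. The sketch below shows how such records might be collected and serialized for inspection; the Resource stand-in with string fields and the map-by-pod-UID layout are assumptions for illustration, not the operator's actual bookkeeping.

```go
// Hypothetical sketch: assembling and serializing per-GPU allocation records
// shaped like GPUAllocationInfo so allocator state can be dumped and compared
// against the pods actually bound to a GPU when debugging a wrong-state report.
package main

import (
	"encoding/json"
	"fmt"
)

// Stand-in for the repository's Resource type.
type Resource struct {
	Tflops string `json:"tflops,omitempty"`
	Vram   string `json:"vram,omitempty"`
}

type GPUAllocationInfo struct {
	Request   Resource `json:"request,omitempty"`
	Limit     Resource `json:"limit,omitempty"`
	PodName   string   `json:"podName,omitempty"`
	PodUID    string   `json:"podUID,omitempty"`
	Namespace string   `json:"namespace,omitempty"`
}

func main() {
	// Keyed by pod UID so a snapshot can be diffed against the scheduler's view.
	allocations := map[string]GPUAllocationInfo{
		"pod-uid-1234": {
			Request:   Resource{Tflops: "10", Vram: "4Gi"},
			Limit:     Resource{Tflops: "20", Vram: "8Gi"},
			PodName:   "inference-0",
			PodUID:    "pod-uid-1234",
			Namespace: "default",
		},
	}
	out, _ := json.MarshalIndent(allocations, "", "  ")
	fmt.Println(string(out))
}
```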

api/v1/zz_generated.deepcopy.go

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default.

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 1.4.6
+version: 1.4.7
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/values.yaml

Lines changed: 48 additions & 48 deletions
@@ -230,34 +230,34 @@ dynamicConfig:
     # Worker TFlops throttled alert
     - name: WorkerTFlopsThrottled
       query: |
-        SELECT workload, worker, uuid, node_name, MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase
+        SELECT workload, worker, uuid, node, MAX(compute_throttled_cnt)-MIN(compute_throttled_cnt) as throttled_increase
         FROM tf_worker_usage
         WHERE {{ .Conditions }}
-        GROUP BY workload, worker, uuid, node_name
+        GROUP BY workload, worker, uuid, node
         HAVING throttled_increase > {{ .Threshold }}
       threshold: 0
       evaluationInterval: 15s
       consecutiveCount: 3
       severity: P1
       summary: "Worker TFlops Throttled"
-      description: "Worker {{ .worker }} from Node {{ .node_name }} is using more than {{ .Threshold }}% of its TFlops limit"
+      description: "Worker {{ .worker }} from Node {{ .node }} is using more than {{ .Threshold }}% of its TFlops limit"
       alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
       runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
 
     # Worker VRAM switching too frequent alert
     - name: WorkerVRAMSwitchCountIncreasing
       query: |
-        SELECT workload, worker, uuid, node_name, MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase
+        SELECT workload, worker, uuid, node, MAX(vram_resumed_cnt)-MIN(vram_resumed_cnt) as switch_increase
         FROM tf_worker_usage
         WHERE {{ .Conditions }}
-        GROUP BY workload, worker, uuid, node_name
+        GROUP BY workload, worker, uuid, node
         HAVING switch_increase > {{ .Threshold }}
       threshold: 0
       evaluationInterval: 2m
       consecutiveCount: 1
       severity: P1
       summary: "Worker VRAM Switch Count Increasing"
-      description: "Worker {{ .worker }} from Node {{ .node_name }} has switched VRAM {{ .switch_increase }} times in last 2 minutes, GPU may be too hot"
+      description: "Worker {{ .worker }} from Node {{ .node }} has switched VRAM {{ .switch_increase }} times in last 2 minutes, GPU may be too hot"
       alertTargetInstance: "{{ .worker }}-{{ .uuid }}"
       runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
 
@@ -284,92 +284,92 @@ dynamicConfig:
     - name: GPUVRAMFull
       query: |
         SELECT
-          node_name,
+          node,
           pool,
           uuid,
           avg(memory_percentage) AS memory_used
         FROM tf_gpu_usage
         WHERE memory_percentage > {{ .Threshold }} AND {{ .Conditions }}
-        GROUP BY node_name, pool, uuid
+        GROUP BY node, pool, uuid
       threshold: 97
       evaluationInterval: 30s
       consecutiveCount: 2
       severity: P1
-      summary: "GPU VRAM Full, used {{ .memory_used }}% on {{ .node_name }} {{ .uuid }}"
+      summary: "GPU VRAM Full, used {{ .memory_used }}% on {{ .node }} {{ .uuid }}"
       alertTargetInstance: "{{ .uuid }}"
-      description: "GPU {{ .uuid }} on Node {{ .node_name }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s, average usage: {{ .memory_used }}%"
+      description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has VRAM usage above {{ .Threshold }}% for 2 consecutive 30s, average usage: {{ .memory_used }}%"
 
     # GPU TFlops Full Alert
     - name: GPUTFlopsFull
       query: |
         SELECT
-          node_name,
+          node,
           pool,
           uuid,
           avg(compute_percentage) AS compute_used
         FROM tf_gpu_usage
         WHERE compute_percentage > {{ .Threshold }} AND {{ .Conditions }}
-        GROUP BY node_name, pool, uuid
+        GROUP BY node, pool, uuid
       threshold: 97
       evaluationInterval: 30s
       consecutiveCount: 4
       severity: P1
-      summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node_name }} {{ .uuid }}"
+      summary: "GPU TFlops Full, used {{ .compute_used }}% on {{ .node }} {{ .uuid }}"
       alertTargetInstance: "{{ .uuid }}"
-      description: "GPU {{ .uuid }} on Node {{ .node_name }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
+      description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has TFlops usage above {{ .Threshold }}% for 4 consecutive 30s, average usage: {{ .compute_used }}%"
 
     # GPU Temperature alert
     - name: GPUTemperatureHigh
       query: |
         SELECT
-          node_name,
+          node,
           pool,
           uuid,
           avg(temperature) AS avg_temperature
         FROM tf_gpu_usage
         WHERE temperature > {{ .Threshold }} AND {{ .Conditions }}
-        GROUP BY node_name, pool, uuid
+        GROUP BY node, pool, uuid
       threshold: 90
       evaluationInterval: 30s
       consecutiveCount: 3
       severity: P1
-      summary: "GPU Temperature High, {{ .avg_temperature }}°C on {{ .node_name }} {{ .uuid }}"
+      summary: "GPU Temperature High, {{ .avg_temperature }}°C on {{ .node }} {{ .uuid }}"
       alertTargetInstance: "{{ .uuid }}"
-      description: "GPU {{ .uuid }} from Node {{ .node_name }} has temperature above {{ .Threshold }}°C, Average temperature: {{ .avg_temperature }}, GPU Pool: {{ .pool }}"
+      description: "GPU {{ .uuid }} from Node {{ .node }} has temperature above {{ .Threshold }}°C, Average temperature: {{ .avg_temperature }}, GPU Pool: {{ .pool }}"
       runbookURL: "https://tensor-fusion.ai/guide/troubleshooting/handbook"
 
     # GPU Pool Alerts
 
     # Node TFlops allocation alert
     - name: NodeTFlopsAllocationCritical
       query: |
-        SELECT node_name, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+        SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
         FROM tf_node_resources
         WHERE {{ .Conditions }}
-        GROUP BY node_name, pool
+        GROUP BY node, pool
         HAVING tflops_available < {{ .Threshold }}
       threshold: 5
       evaluationInterval: 1m
       consecutiveCount: 2
       severity: P0
-      summary: "Available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node_name }}"
-      description: "Node {{ .node _name }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
-      alertTargetInstance: "{{ .node _name }}"
+      summary: "Available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
+      description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+      alertTargetInstance: "{{ .node }}"
 
     - name: NodeTFlopsAllocationWarning
       query: |
-        SELECT node_name, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
+        SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
         FROM tf_node_resources
         WHERE {{ .Conditions }}
-        GROUP BY node_name, pool
+        GROUP BY node, pool
         HAVING tflops_available < {{ .Threshold }}
       threshold: 10
       evaluationInterval: 1m
       consecutiveCount: 2
       severity: P1
-      summary: "Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node_name }}"
-      description: "Node {{ .node _name }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
-      alertTargetInstance: "{{ .node _name }}"
+      summary: "Node available TFlops below threshold, remaining {{ .tflops_available }}% for {{ .node }}"
+      description: "Node {{ .node }} in Pool {{ .pool }} has available TFlops below {{ .Threshold }}%"
+      alertTargetInstance: "{{ .node }}"
 
     # Pool TFlops allocation alert - Total
     - name: PoolTotalTFlopsAllocationCritical
@@ -405,33 +405,33 @@ dynamicConfig:
     # Node VRAM allocation alert
    - name: NodeVRAMAllocationCritical
       query: |
-        SELECT node_name, pool, (100 - avg(allocated_vram_percent)) as vram_available
+        SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
         FROM tf_node_resources
         WHERE {{ .Conditions }}
-        GROUP BY node_name, pool
+        GROUP BY node, pool
         HAVING vram_available < {{ .Threshold }}
       threshold: 5
       evaluationInterval: 1m
       consecutiveCount: 2
       severity: P1
-      summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node_name }}"
-      description: "Node {{ .node _name }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
-      alertTargetInstance: "{{ .node _name }}"
+      summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
+      description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+      alertTargetInstance: "{{ .node }}"
 
     - name: NodeVRAMAllocationWarning
       query: |
-        SELECT node_name, pool, (100 - avg(allocated_vram_percent)) as vram_available
+        SELECT node, pool, (100 - avg(allocated_vram_percent)) as vram_available
         FROM tf_node_resources
         WHERE {{ .Conditions }}
-        GROUP BY node_name, pool
+        GROUP BY node, pool
         HAVING vram_available < {{ .Threshold }}
       threshold: 10
       evaluationInterval: 1m
       consecutiveCount: 2
       severity: P1
-      summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node_name }}"
-      description: "Node {{ .node _name }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
-      alertTargetInstance: "{{ .node _name }}"
+      summary: "Node available VRAM below threshold, remaining {{ .vram_available }}% for {{ .node }}"
+      description: "Node {{ .node }} in Pool {{ .pool }} has available VRAM below {{ .Threshold }}%"
+      alertTargetInstance: "{{ .node }}"
 
     # Pool VRAM allocation alert
     - name: PoolVRAMAllocationWarning
@@ -452,32 +452,32 @@ dynamicConfig:
     # Empty or Idle GPU Alert
     - name: EmptyGPU
       query: |
-        SELECT DISTINCT node_name
+        SELECT DISTINCT node
         FROM tf_node_resources
-        WHERE {{ .Conditions }} AND node_name NOT IN (
-          SELECT DISTINCT node_name
+        WHERE {{ .Conditions }} AND node NOT IN (
+          SELECT DISTINCT node
           FROM tf_worker_usage
           WHERE {{ .Conditions }}
         )
       threshold: 0
       evaluationInterval: 5m
       consecutiveCount: 2
       severity: P2
-      summary: "Empty GPU without any workload, Node {{ .node_name }}"
-      description: "GPU Node {{ .node_name }} has no workload running, should be decommissioned"
-      alertTargetInstance: "{{ .node_name }}"
+      summary: "Empty GPU without any workload, Node {{ .node }}"
+      description: "GPU Node {{ .node }} has no workload running, should be decommissioned"
+      alertTargetInstance: "{{ .node }}"
 
     - name: IdleGPU
       query: |
-        SELECT node_name, pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) vram
+        SELECT node, pool, uuid, avg(compute_percentage) as compute, avg(memory_percentage) vram
         FROM tf_gpu_usage
         WHERE {{ .Conditions }}
-        GROUP BY node_name, pool, uuid
+        GROUP BY node, pool, uuid
         HAVING compute < 1 and vram < {{ .Threshold }};
       threshold: 5
       evaluationInterval: 10m
       consecutiveCount: 3
       severity: P2
-      summary: "Idle GPU found: {{ .uuid }} on Node {{ .node_name }}"
-      description: "GPU {{ .uuid }} on Node {{ .node_name }} in Pool {{ .pool }} has been idle for 3 consecutive 10m, compute: {{ .compute }}, vram: {{ .vram }}"
+      summary: "Idle GPU found: {{ .uuid }} on Node {{ .node }}"
+      description: "GPU {{ .uuid }} on Node {{ .node }} in Pool {{ .pool }} has been idle for 3 consecutive 10m, compute: {{ .compute }}, vram: {{ .vram }}"
       alertTargetInstance: "{{ .uuid }}"
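The alert rules are Go templates: {{ .Conditions }} and {{ .Threshold }} are filled in at evaluation time, and placeholders like {{ .node }} must match the column the query selects, which is what the node_name -> node rename keeps consistent. Below is a minimal sketch of rendering one of these query templates with text/template; the parameter map and the Conditions value shown are hypothetical, not the operator's actual evaluation code.

```go
// Sketch: rendering an alert query template in the style of the values.yaml
// rules above. The Conditions string is a made-up time filter for illustration.
package main

import (
	"fmt"
	"os"
	"text/template"
)

func main() {
	const query = `SELECT node, pool, (100 - avg(allocated_tflops_percent)) as tflops_available
FROM tf_node_resources
WHERE {{ .Conditions }}
GROUP BY node, pool
HAVING tflops_available < {{ .Threshold }}`

	tmpl := template.Must(template.New("query").Parse(query))
	params := map[string]any{
		"Conditions": "ts > now() - interval '1 minute'", // hypothetical filter
		"Threshold":  5,
	}
	if err := tmpl.Execute(os.Stdout, params); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}
```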
