Skip to content

Commit 8e03b24

Browse files
authored
feat: Compatible with XPU monitoring (#11088)
1 parent 18c65c3 commit 8e03b24

File tree

9 files changed

+109
-53
lines changed

9 files changed

+109
-53
lines changed

agent/app/dto/monitor.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ type MonitorGPUSearch struct {
4444
EndTime time.Time `json:"endTime"`
4545
}
4646
type MonitorGPUData struct {
47+
GPUType string `json:"gpuType"`
4748
ProductNames []string `json:"productNames"`
4849
Date []time.Time `json:"date"`
4950
GPUValue []float64 `json:"gpuValue"`
@@ -58,8 +59,8 @@ type GPUPowerUsageHelper struct {
5859
Percent float64 `json:"percent"`
5960
}
6061
type GPUMemoryUsageHelper struct {
61-
Total int `json:"total"`
62-
Used int `json:"used"`
62+
Total float64 `json:"total"`
63+
Used float64 `json:"used"`
6364
Percent float64 `json:"percent"`
6465

6566
GPUProcesses []GPUProcess `json:"gpuProcesses"`

agent/app/model/monitor.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ type MonitorGPU struct {
3939
Temperature int `json:"temperature"`
4040
PowerDraw float64 `json:"powerDraw"`
4141
MaxPowerLimit float64 `json:"maxPowerLimit"`
42-
MemUsed int `json:"memUsed"`
43-
MemTotal int `json:"memTotal"`
42+
MemUsed float64 `json:"memUsed"`
43+
MemTotal float64 `json:"memTotal"`
4444
FanSpeed int `json:"fanSpeed"`
4545
Processes string `json:"processes"`
4646
}

agent/app/service/monitor.go

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -131,17 +131,21 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
131131
}
132132
if len(req.ProductName) == 0 {
133133
if gpuExist {
134+
data.GPUType = "gpu"
134135
gpuInfo, err := gpuclient.LoadGpuInfo()
135136
if err != nil || len(gpuInfo.GPUs) == 0 {
137+
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
136138
return data, buserr.New("ErrRecordNotFound")
137139
}
138140
req.ProductName = gpuInfo.GPUs[0].ProductName
139141
for _, item := range gpuInfo.GPUs {
140142
data.ProductNames = append(data.ProductNames, item.ProductName)
141143
}
142144
} else {
145+
data.GPUType = "xpu"
143146
xpuInfo, err := xpuClient.LoadGpuInfo()
144147
if err != nil || len(xpuInfo.Xpu) == 0 {
148+
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
145149
return data, buserr.New("ErrRecordNotFound")
146150
}
147151
req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
@@ -159,15 +163,18 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
159163
data.Date = append(data.Date, gpu.CreatedAt)
160164
data.GPUValue = append(data.GPUValue, gpu.GPUUtil)
161165
data.TemperatureValue = append(data.TemperatureValue, gpu.Temperature)
162-
data.PowerValue = append(data.PowerValue, dto.GPUPowerUsageHelper{
163-
Total: gpu.MaxPowerLimit,
164-
Used: gpu.PowerDraw,
165-
Percent: gpu.PowerDraw / gpu.MaxPowerLimit * 100,
166-
})
166+
powerItem := dto.GPUPowerUsageHelper{
167+
Total: gpu.MaxPowerLimit,
168+
Used: gpu.PowerDraw,
169+
}
170+
if powerItem.Total != 0 {
171+
powerItem.Percent = powerItem.Used / powerItem.Total
172+
}
173+
data.PowerValue = append(data.PowerValue, powerItem)
167174
memItem := dto.GPUMemoryUsageHelper{
168175
Total: gpu.MemTotal,
169176
Used: gpu.MemUsed,
170-
Percent: float64(gpu.MemUsed) / float64(gpu.MemTotal) * 100,
177+
Percent: gpu.MemUsed / gpu.MemTotal * 100,
171178
}
172179
var process []dto.GPUProcess
173180
if err := json.Unmarshal([]byte(gpu.Processes), &process); err == nil {
@@ -564,14 +571,13 @@ func saveGPUDataToDB() {
564571
var list []model.MonitorGPU
565572
for _, gpuItem := range gpuInfo.GPUs {
566573
item := model.MonitorGPU{
567-
ProductName: gpuItem.ProductName,
568-
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
569-
Temperature: loadGPUInfoInt(gpuItem.Temperature),
570-
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
571-
MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit),
572-
MemUsed: loadGPUInfoInt(gpuItem.MemUsed),
573-
MemTotal: loadGPUInfoInt(gpuItem.MemTotal),
574-
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
574+
ProductName: gpuItem.ProductName,
575+
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
576+
Temperature: loadGPUInfoInt(gpuItem.Temperature),
577+
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
578+
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
579+
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
580+
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
575581
}
576582
process, _ := json.Marshal(gpuItem.Processes)
577583
if len(process) != 0 {
@@ -596,25 +602,28 @@ func saveXPUDataToDB() {
596602
var list []model.MonitorGPU
597603
for _, xpuItem := range xpuInfo.Xpu {
598604
item := model.MonitorGPU{
599-
ProductName: xpuItem.Basic.DeviceName,
600-
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
601-
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
602-
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
603-
MemUsed: loadGPUInfoInt(xpuItem.Stats.MemoryUsed),
604-
MemTotal: loadGPUInfoInt(xpuItem.Basic.Memory),
605-
}
606-
var processItem []dto.GPUProcess
607-
for _, ps := range xpuItem.Processes {
608-
processItem = append(processItem, dto.GPUProcess{
609-
Pid: fmt.Sprintf("%v", ps.PID),
610-
Type: ps.SHR,
611-
ProcessName: ps.Command,
612-
UsedMemory: ps.Memory,
613-
})
614-
}
615-
process, _ := json.Marshal(processItem)
616-
if len(process) != 0 {
617-
item.Processes = string(process)
605+
ProductName: xpuItem.Basic.DeviceName,
606+
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
607+
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
608+
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
609+
MaxPowerLimit: float64(xpuItem.Config.PowerLimit),
610+
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
611+
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
612+
}
613+
if len(xpuItem.Processes) != 0 {
614+
var processItem []dto.GPUProcess
615+
for _, ps := range xpuItem.Processes {
616+
processItem = append(processItem, dto.GPUProcess{
617+
Pid: fmt.Sprintf("%v", ps.PID),
618+
Type: ps.SHR,
619+
ProcessName: ps.Command,
620+
UsedMemory: ps.Memory,
621+
})
622+
}
623+
process, _ := json.Marshal(processItem)
624+
if len(process) != 0 {
625+
item.Processes = string(process)
626+
}
618627
}
619628
list = append(list, item)
620629
}
@@ -633,6 +642,7 @@ func loadGPUInfoInt(val string) int {
633642
}
634643
func loadGPUInfoFloat(val string) float64 {
635644
valItem := strings.ReplaceAll(val, "W", "")
645+
valItem = strings.ReplaceAll(valItem, "MB", "")
636646
valItem = strings.ReplaceAll(valItem, "%", "")
637647
valItem = strings.TrimSpace(valItem)
638648
data, _ := strconv.ParseFloat(valItem, 64)

agent/init/migration/migrations/init.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{
721721
}
722722

723723
var AddGPUMonitor = &gormigrate.Migration{
724-
ID: "20251119-add-gpu-monitor",
724+
ID: "20251127-add-gpu-monitor",
725725
Migrate: func(tx *gorm.DB) error {
726726
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
727727
},

agent/utils/ai_tools/xpu/xpu_info.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ type XpuInfo struct {
1010
type Xpu struct {
1111
Basic Basic `json:"basic"`
1212
Stats Stats `json:"stats"`
13+
Config Config `json:"config"`
1314
Processes []Process `json:"processes"`
1415
}
1516

@@ -23,6 +24,11 @@ type Basic struct {
2324
PciBdfAddress string `json:"pciBdfAddress"`
2425
}
2526

27+
// Config holds the values decoded from the "config" object of an XPU entry.
type Config struct {
	// PowerLimit is read from the "power_limit" key.
	PowerLimit int `json:"power_limit"`
	// PowerValidRange is read from the "power_vaild_range" key.
	// NOTE(review): the tag spelling ("vaild") is kept exactly as-is; it
	// presumably mirrors the key emitted by the XPU tooling — confirm
	// against real output before "correcting" it.
	PowerValidRange string `json:"power_vaild_range"`
}
31+
2632
type Stats struct {
2733
Power string `json:"power"`
2834
Frequency string `json:"frequency"`

frontend/src/api/interface/host.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ export namespace Host {
167167
endTime: Date;
168168
}
169169
export interface MonitorGPUData {
170+
gpuType: string;
170171
productNames: Array<string>;
171172
date: Array<Date>;
172173
gpuValue: Array<number>;

frontend/src/lang/modules/zh.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -656,7 +656,7 @@ const message = {
656656
gpu: {
657657
gpu: 'GPU 监控',
658658
base: '基础信息',
659-
gpuHelper: '当前系统未检测到 NVIDIA-SMI或者XPU-SMI 指令,请检查后重试!',
659+
gpuHelper: '当前系统未检测到 NVIDIA-SMI 或者 XPU-SMI 指令,请检查后重试!',
660660
driverVersion: '驱动版本',
661661
cudaVersion: 'CUDA 版本',
662662
process: '进程信息',

frontend/src/views/ai/gpu/index.vue

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@
3030
</div>
3131
</el-card>
3232
</div>
33-
<el-row :gutter="7" class="card-interval" v-if="options.length !== 0">
34-
<el-col :span="24">
35-
<el-card style="overflow: inherit">
33+
<el-row :gutter="7" v-if="options.length !== 0">
34+
<el-col v-bind="gpuType === 'gpu' ? fullWidthProps : halfWidthProps">
35+
<el-card class="card-interval" style="overflow: inherit">
3636
<template #header>
3737
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
3838
<span class="title">{{ $t('monitor.gpuUtil') }}</span>
@@ -51,7 +51,7 @@
5151
</el-card>
5252
</el-col>
5353
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
54-
<el-card style="overflow: inherit">
54+
<el-card class="card-interval" style="overflow: inherit">
5555
<template #header>
5656
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
5757
<span class="title">{{ $t('monitor.memoryUsage') }}</span>
@@ -70,7 +70,7 @@
7070
</el-card>
7171
</el-col>
7272
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
73-
<el-card style="overflow: inherit">
73+
<el-card class="card-interval" style="overflow: inherit">
7474
<template #header>
7575
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
7676
<span class="title">{{ $t('monitor.powerUsage') }}</span>
@@ -89,7 +89,7 @@
8989
</el-card>
9090
</el-col>
9191
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
92-
<el-card style="overflow: inherit">
92+
<el-card class="card-interval" style="overflow: inherit">
9393
<template #header>
9494
<div>
9595
{{ $t('monitor.temperature') }}
@@ -110,8 +110,8 @@
110110
</div>
111111
</el-card>
112112
</el-col>
113-
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
114-
<el-card style="overflow: inherit">
113+
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12" v-if="gpuType === 'gpu'">
114+
<el-card class="card-interval" style="overflow: inherit">
115115
<template #header>
116116
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
117117
<span class="title">{{ $t('monitor.fanSpeed') }}</span>
@@ -161,8 +161,12 @@ const mobile = computed(() => {
161161
return globalStore.isMobile();
162162
});
163163
164+
const fullWidthProps = { span: 24 };
165+
const halfWidthProps = { xs: 24, sm: 24, md: 12, lg: 12, xl: 12 };
166+
164167
const loading = ref(false);
165168
const options = ref([]);
169+
const gpuType = ref('gpu');
166170
const timeRangeGlobal = ref<[Date, Date]>([new Date(new Date().setHours(0, 0, 0, 0)), new Date()]);
167171
const chartsOption = ref({
168172
loadPowerChart: null,
@@ -189,14 +193,19 @@ const search = async () => {
189193
.then((res) => {
190194
loading.value = false;
191195
options.value = res.data.productNames || [];
196+
gpuType.value = res.data.gpuType || 'gpu';
192197
searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
193198
let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
194199
let date = baseDate.map(function (item: any) {
195200
return dateFormatWithoutYear(item);
196201
});
197202
initCPUCharts(date, res.data.gpuValue);
198203
initMemoryCharts(date, res.data.memoryValue);
199-
initPowerCharts(date, res.data.powerValue);
204+
if (gpuType.value === 'gpu') {
205+
initPowerCharts(date, res.data.powerValue);
206+
} else {
207+
initXpuPowerCharts(date, res.data.powerValue);
208+
}
200209
initSpeedCharts(date, res.data.speedValue);
201210
initTemperatureCharts(date, res.data.temperatureValue);
202211
})
@@ -270,6 +279,33 @@ function initPowerCharts(baseDate: any, items: any) {
270279
formatStr: '%',
271280
};
272281
}
282+
283+
/**
 * Builds the power chart option for XPU devices and stores it in
 * `chartsOption.value['loadPowerChart']`.
 *
 * Unlike a percentage-based chart, this one plots each sample's `used` value
 * (rounded to two decimals) and labels it with a "W" suffix.
 *
 * @param baseDate x-axis labels, passed through as `xData`.
 * @param items    power samples; each is expected to expose a numeric `used`.
 *                 A null/undefined or empty series falls back to
 *                 `loadEmptyData2()`.
 */
function initXpuPowerCharts(baseDate: any, items: any) {
    // Treat a missing series like an empty one instead of throwing on `.map`.
    let list = (items || []).map(function (item: any) {
        return { value: Number(item.used.toFixed(2)), data: item };
    });
    list = list.length === 0 ? loadEmptyData2() : list;
    chartsOption.value['loadPowerChart'] = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.powerUsage'),
                data: list,
            },
        ],
        tooltip: {
            trigger: 'axis',
            formatter: function (params: any) {
                let res = loadDate(params[0].name);
                for (const item of params) {
                    // A plain truthiness test would treat a legitimate 0 reading as
                    // "no value" and hand the whole wrapper object to loadSeries;
                    // accept 0 explicitly and keep the original fallback otherwise.
                    const hasValue = item.data.value || item.data.value === 0;
                    res += loadSeries(item, hasValue ? item.data.value : item.data, 'W');
                }
                return res;
            },
        },
        formatStr: 'W',
    };
}
273309
function initTemperatureCharts(baseDate: any, items: any) {
274310
let temperatures = items.map(function (item: any) {
275311
return Number(item);
@@ -334,14 +370,15 @@ function withMemoryProcess(list: any) {
334370
if (!process) {
335371
return res;
336372
}
373+
let title = gpuType.value === 'gpu' ? i18n.global.t('aiTools.gpu.type') : i18n.global.t('aiTools.gpu.shr');
337374
res += `
338375
<div style="margin-top: 10px; border-bottom: 1px dashed black;"></div>
339376
<table style="border-collapse: collapse; margin-top: 20px; font-size: 12px;">
340377
<thead>
341378
<tr>
342379
<th style="padding: 6px 8px;">PID</th>
343-
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.type')}</th>
344380
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processName')}</th>
381+
<th style="padding: 6px 8px;">${title}</th>
345382
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processMemoryUsage')}</th>
346383
</tr>
347384
</thead>
@@ -354,10 +391,10 @@ function withMemoryProcess(list: any) {
354391
${row.pid}
355392
</td>
356393
<td style="padding: 6px 8px; text-align: center;">
357-
${loadProcessType(row.type)}
394+
${row.processName}
358395
</td>
359396
<td style="padding: 6px 8px; text-align: center;">
360-
${row.processName}
397+
${loadProcessType(row.type)}
361398
</td>
362399
<td style="padding: 6px 8px; text-align: center;">
363400
${row.usedMemory}

frontend/src/views/home/status/index.vue

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,9 @@
6969
<el-button v-if="!cpuShowAll" @click="cpuShowAll = true" icon="More" link size="small" />
7070
<el-button v-if="cpuShowAll" @click="cpuShowAll = false" icon="ArrowUp" link size="small" />
7171
</div>
72+
<br />
7273

73-
<el-button link size="small" type="primary" class="mt-2 mb-2" @click="showTop = !showTop">
74+
<el-button link size="small" type="primary" class="mt-1 mb-2" @click="showTop = !showTop">
7475
{{ $t('home.cpuTop') }}
7576
<el-icon v-if="!showTop"><ArrowRight /></el-icon>
7677
<el-icon v-if="showTop"><ArrowDown /></el-icon>
@@ -315,7 +316,7 @@
315316
<span class="input-help" v-else>{{ item.deviceName }}</span>
316317
</el-col>
317318
</template>
318-
<el-col :xs="6" :sm="6" :md="6" :lg="6" :xl="6" align="center" v-if="totalCount > 5">
319+
<el-col :xs="6" :sm="6" :md="3" :lg="3" :xl="3" align="center" v-if="totalCount > 5">
319320
<el-button v-if="!showMore" link type="primary" @click="changeShowMore(true)" class="buttonClass">
320321
{{ $t('tabs.more') }}
321322
<el-icon><Bottom /></el-icon>

0 commit comments

Comments
 (0)