Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions agent/app/dto/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ type MonitorGPUSearch struct {
EndTime time.Time `json:"endTime"`
}
type MonitorGPUData struct {
GPUType string `json:"gpuType"`
ProductNames []string `json:"productNames"`
Date []time.Time `json:"date"`
GPUValue []float64 `json:"gpuValue"`
Expand All @@ -58,8 +59,8 @@ type GPUPowerUsageHelper struct {
Percent float64 `json:"percent"`
}
type GPUMemoryUsageHelper struct {
Total int `json:"total"`
Used int `json:"used"`
Total float64 `json:"total"`
Used float64 `json:"used"`
Percent float64 `json:"percent"`

GPUProcesses []GPUProcess `json:"gpuProcesses"`
Expand Down
4 changes: 2 additions & 2 deletions agent/app/model/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ type MonitorGPU struct {
Temperature int `json:"temperature"`
PowerDraw float64 `json:"powerDraw"`
MaxPowerLimit float64 `json:"maxPowerLimit"`
MemUsed int `json:"memUsed"`
MemTotal int `json:"memTotal"`
MemUsed float64 `json:"memUsed"`
MemTotal float64 `json:"memTotal"`
FanSpeed int `json:"fanSpeed"`
Processes string `json:"processes"`
}
76 changes: 43 additions & 33 deletions agent/app/service/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@
return data, nil
}

func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) {

Check failure on line 121 in agent/app/service/monitor.go

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this method to reduce its Cognitive Complexity from 26 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=1Panel-dev_1Panel&issues=AZrAiIi3zAF9f2sWs1A1&open=AZrAiIi3zAF9f2sWs1A1&pullRequest=11088
loc, _ := time.LoadLocation(common.LoadTimeZoneByCmd())
req.StartTime = req.StartTime.In(loc)
req.EndTime = req.EndTime.In(loc)
Expand All @@ -131,17 +131,21 @@
}
if len(req.ProductName) == 0 {
if gpuExist {
data.GPUType = "gpu"
gpuInfo, err := gpuclient.LoadGpuInfo()
if err != nil || len(gpuInfo.GPUs) == 0 {
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound")
}
req.ProductName = gpuInfo.GPUs[0].ProductName
for _, item := range gpuInfo.GPUs {
data.ProductNames = append(data.ProductNames, item.ProductName)
}
} else {
data.GPUType = "xpu"
xpuInfo, err := xpuClient.LoadGpuInfo()
if err != nil || len(xpuInfo.Xpu) == 0 {
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound")
}
req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
Expand All @@ -159,15 +163,18 @@
data.Date = append(data.Date, gpu.CreatedAt)
data.GPUValue = append(data.GPUValue, gpu.GPUUtil)
data.TemperatureValue = append(data.TemperatureValue, gpu.Temperature)
data.PowerValue = append(data.PowerValue, dto.GPUPowerUsageHelper{
Total: gpu.MaxPowerLimit,
Used: gpu.PowerDraw,
Percent: gpu.PowerDraw / gpu.MaxPowerLimit * 100,
})
powerItem := dto.GPUPowerUsageHelper{
Total: gpu.MaxPowerLimit,
Used: gpu.PowerDraw,
}
if powerItem.Total != 0 {
powerItem.Percent = powerItem.Used / powerItem.Total
}
data.PowerValue = append(data.PowerValue, powerItem)
memItem := dto.GPUMemoryUsageHelper{
Total: gpu.MemTotal,
Used: gpu.MemUsed,
Percent: float64(gpu.MemUsed) / float64(gpu.MemTotal) * 100,
Percent: gpu.MemUsed / gpu.MemTotal * 100,
}
var process []dto.GPUProcess
if err := json.Unmarshal([]byte(gpu.Processes), &process); err == nil {
Expand Down Expand Up @@ -564,14 +571,13 @@
var list []model.MonitorGPU
for _, gpuItem := range gpuInfo.GPUs {
item := model.MonitorGPU{
ProductName: gpuItem.ProductName,
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
Temperature: loadGPUInfoInt(gpuItem.Temperature),
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit),
MemUsed: loadGPUInfoInt(gpuItem.MemUsed),
MemTotal: loadGPUInfoInt(gpuItem.MemTotal),
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
ProductName: gpuItem.ProductName,
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
Temperature: loadGPUInfoInt(gpuItem.Temperature),
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
}
process, _ := json.Marshal(gpuItem.Processes)
if len(process) != 0 {
Expand All @@ -596,25 +602,28 @@
var list []model.MonitorGPU
for _, xpuItem := range xpuInfo.Xpu {
item := model.MonitorGPU{
ProductName: xpuItem.Basic.DeviceName,
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
MemUsed: loadGPUInfoInt(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoInt(xpuItem.Basic.Memory),
}
var processItem []dto.GPUProcess
for _, ps := range xpuItem.Processes {
processItem = append(processItem, dto.GPUProcess{
Pid: fmt.Sprintf("%v", ps.PID),
Type: ps.SHR,
ProcessName: ps.Command,
UsedMemory: ps.Memory,
})
}
process, _ := json.Marshal(processItem)
if len(process) != 0 {
item.Processes = string(process)
ProductName: xpuItem.Basic.DeviceName,
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
MaxPowerLimit: float64(xpuItem.Config.PowerLimit),
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
}
if len(xpuItem.Processes) != 0 {
var processItem []dto.GPUProcess
for _, ps := range xpuItem.Processes {
processItem = append(processItem, dto.GPUProcess{
Pid: fmt.Sprintf("%v", ps.PID),
Type: ps.SHR,
ProcessName: ps.Command,
UsedMemory: ps.Memory,
})
}
process, _ := json.Marshal(processItem)
if len(process) != 0 {
item.Processes = string(process)
}
}
list = append(list, item)
}
Expand All @@ -633,6 +642,7 @@
}
func loadGPUInfoFloat(val string) float64 {
valItem := strings.ReplaceAll(val, "W", "")
valItem = strings.ReplaceAll(valItem, "MB", "")
valItem = strings.ReplaceAll(valItem, "%", "")
valItem = strings.TrimSpace(valItem)
data, _ := strconv.ParseFloat(valItem, 64)
Expand Down
2 changes: 1 addition & 1 deletion agent/init/migration/migrations/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{
}

var AddGPUMonitor = &gormigrate.Migration{
ID: "20251119-add-gpu-monitor",
ID: "20251127-add-gpu-monitor",
Migrate: func(tx *gorm.DB) error {
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
},
Expand Down
6 changes: 6 additions & 0 deletions agent/utils/ai_tools/xpu/xpu_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ type XpuInfo struct {
// Xpu describes one XPU device entry, decoded from JSON into its
// basic, stats, config and process sections.
type Xpu struct {
// Basic is decoded from the "basic" JSON object.
Basic Basic `json:"basic"`
// Stats is decoded from the "stats" JSON object.
Stats Stats `json:"stats"`
// Config is decoded from the "config" JSON object.
Config Config `json:"config"`
// Processes is decoded from the "processes" JSON array.
// NOTE(review): presumably the processes currently using this device — confirm against the collector.
Processes []Process `json:"processes"`
}

Expand All @@ -23,6 +24,11 @@ type Basic struct {
PciBdfAddress string `json:"pciBdfAddress"`
}

// Config holds the power configuration section of a single XPU entry,
// decoded from the "config" JSON object of an Xpu.
type Config struct {
// PowerLimit is the configured power limit, decoded as an integer.
// NOTE(review): presumably expressed in watts — confirm against the tool output.
PowerLimit int `json:"power_limit"`
// PowerValidRange is kept as the raw string from the JSON payload.
// NOTE(review): the "vaild" spelling in the tag presumably mirrors the key
// emitted by the external XPU tool — verify before "correcting" it, since
// changing the tag would stop the field from being decoded.
PowerValidRange string `json:"power_vaild_range"`
}

type Stats struct {
Power string `json:"power"`
Frequency string `json:"frequency"`
Expand Down
1 change: 1 addition & 0 deletions frontend/src/api/interface/host.ts
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ export namespace Host {
endTime: Date;
}
export interface MonitorGPUData {
gpuType: string;
productNames: Array<string>;
date: Array<Date>;
gpuValue: Array<number>;
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/lang/modules/zh.ts
Original file line number Diff line number Diff line change
Expand Up @@ -656,7 +656,7 @@ const message = {
gpu: {
gpu: 'GPU 监控',
base: '基础信息',
gpuHelper: '当前系统未检测到 NVIDIA-SMI或者XPU-SMI 指令,请检查后重试!',
gpuHelper: '当前系统未检测到 NVIDIA-SMI 或者 XPU-SMI 指令,请检查后重试!',
driverVersion: '驱动版本',
cudaVersion: 'CUDA 版本',
process: '进程信息',
Expand Down
61 changes: 49 additions & 12 deletions frontend/src/views/ai/gpu/index.vue
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
</div>
</el-card>
</div>
<el-row :gutter="7" class="card-interval" v-if="options.length !== 0">
<el-col :span="24">
<el-card style="overflow: inherit">
<el-row :gutter="7" v-if="options.length !== 0">
<el-col v-bind="gpuType === 'gpu' ? fullWidthProps : halfWidthProps">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.gpuUtil') }}</span>
Expand All @@ -51,7 +51,7 @@
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.memoryUsage') }}</span>
Expand All @@ -70,7 +70,7 @@
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.powerUsage') }}</span>
Expand All @@ -89,7 +89,7 @@
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div>
{{ $t('monitor.temperature') }}
Expand All @@ -110,8 +110,8 @@
</div>
</el-card>
</el-col>
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12">
<el-card style="overflow: inherit">
<el-col :xs="24" :sm="24" :md="12" :lg="12" :xl="12" v-if="gpuType === 'gpu'">
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.fanSpeed') }}</span>
Expand Down Expand Up @@ -161,8 +161,12 @@ const mobile = computed(() => {
return globalStore.isMobile();
});

// Column props bound via v-bind onto an <el-col>: a fixed 24-unit span.
const fullWidthProps = { span: 24 };
// Column props bound via v-bind onto an <el-col>: 24 units on xs/sm, 12 units on md and larger.
const halfWidthProps = { xs: 24, sm: 24, md: 12, lg: 12, xl: 12 };

const loading = ref(false);
const options = ref([]);
const gpuType = ref('gpu');
const timeRangeGlobal = ref<[Date, Date]>([new Date(new Date().setHours(0, 0, 0, 0)), new Date()]);
const chartsOption = ref({
loadPowerChart: null,
Expand All @@ -189,14 +193,19 @@ const search = async () => {
.then((res) => {
loading.value = false;
options.value = res.data.productNames || [];
gpuType.value = res.data.gpuType || 'gpu';
searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
let date = baseDate.map(function (item: any) {
return dateFormatWithoutYear(item);
});
initCPUCharts(date, res.data.gpuValue);
initMemoryCharts(date, res.data.memoryValue);
initPowerCharts(date, res.data.powerValue);
if (gpuType.value === 'gpu') {
initPowerCharts(date, res.data.powerValue);
} else {
initXpuPowerCharts(date, res.data.powerValue);
}
initSpeedCharts(date, res.data.speedValue);
initTemperatureCharts(date, res.data.temperatureValue);
})
Expand Down Expand Up @@ -270,6 +279,33 @@ function initPowerCharts(baseDate: any, items: any) {
formatStr: '%',
};
}

/**
 * Builds the options for the power chart when the device is an XPU.
 *
 * Each point plots the absolute power draw (`item.used`, rounded to two
 * decimals) and keeps the original record on `data` for the tooltip.
 * The result is stored in `chartsOption.value.loadPowerChart`.
 *
 * @param baseDate values used as the chart's x-axis data
 * @param items    power records, each exposing a numeric `used` field;
 *                 a null/undefined list is treated as empty
 */
function initXpuPowerCharts(baseDate: any, items: any) {
    // Tolerate a missing list (e.g. the API serialising an empty slice as null).
    let list = (items || []).map(function (item: any) {
        return { value: Number(item.used.toFixed(2)), data: item };
    });
    // NOTE(review): loadEmptyData2 presumably supplies placeholder points for an empty range — confirm.
    list = list.length === 0 ? loadEmptyData2() : list;
    chartsOption.value['loadPowerChart'] = {
        xData: baseDate,
        yData: [
            {
                name: i18n.global.t('monitor.powerUsage'),
                data: list,
            },
        ],
        tooltip: {
            trigger: 'axis',
            formatter: function (list: any) {
                let res = loadDate(list[0].name);
                for (const item of list) {
                    const point = item.data;
                    // Test for a missing value explicitly instead of by truthiness:
                    // a legitimate reading of 0 must be shown as 0, not replaced by
                    // the whole point object.
                    const shown = point.value !== undefined && point.value !== null ? point.value : point;
                    res += loadSeries(item, shown, 'W');
                }
                return res;
            },
        },
        formatStr: 'W',
    };
}
function initTemperatureCharts(baseDate: any, items: any) {
let temperatures = items.map(function (item: any) {
return Number(item);
Expand Down Expand Up @@ -334,14 +370,15 @@ function withMemoryProcess(list: any) {
if (!process) {
return res;
}
let title = gpuType.value === 'gpu' ? i18n.global.t('aiTools.gpu.type') : i18n.global.t('aiTools.gpu.shr');
res += `
<div style="margin-top: 10px; border-bottom: 1px dashed black;"></div>
<table style="border-collapse: collapse; margin-top: 20px; font-size: 12px;">
<thead>
<tr>
<th style="padding: 6px 8px;">PID</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.type')}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processName')}</th>
<th style="padding: 6px 8px;">${title}</th>
<th style="padding: 6px 8px;">${i18n.global.t('aiTools.gpu.processMemoryUsage')}</th>
</tr>
</thead>
Expand All @@ -354,10 +391,10 @@ function withMemoryProcess(list: any) {
${row.pid}
</td>
<td style="padding: 6px 8px; text-align: center;">
${loadProcessType(row.type)}
${row.processName}
</td>
<td style="padding: 6px 8px; text-align: center;">
${row.processName}
${loadProcessType(row.type)}
</td>
<td style="padding: 6px 8px; text-align: center;">
${row.usedMemory}
Expand Down
5 changes: 3 additions & 2 deletions frontend/src/views/home/status/index.vue
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,9 @@
<el-button v-if="!cpuShowAll" @click="cpuShowAll = true" icon="More" link size="small" />
<el-button v-if="cpuShowAll" @click="cpuShowAll = false" icon="ArrowUp" link size="small" />
</div>
<br />

<el-button link size="small" type="primary" class="mt-2 mb-2" @click="showTop = !showTop">
<el-button link size="small" type="primary" class="mt-1 mb-2" @click="showTop = !showTop">
{{ $t('home.cpuTop') }}
<el-icon v-if="!showTop"><ArrowRight /></el-icon>
<el-icon v-if="showTop"><ArrowDown /></el-icon>
Expand Down Expand Up @@ -315,7 +316,7 @@
<span class="input-help" v-else>{{ item.deviceName }}</span>
</el-col>
</template>
<el-col :xs="6" :sm="6" :md="6" :lg="6" :xl="6" align="center" v-if="totalCount > 5">
<el-col :xs="6" :sm="6" :md="3" :lg="3" :xl="3" align="center" v-if="totalCount > 5">
<el-button v-if="!showMore" link type="primary" @click="changeShowMore(true)" class="buttonClass">
{{ $t('tabs.more') }}
<el-icon><Bottom /></el-icon>
Expand Down
Loading