Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions agent/app/api/v2/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,7 @@
sort.Strings(options)
helper.SuccessWithData(c, options)
}

// GetCPUOptions writes the result of monitorService.LoadGPUOptions() as the
// success payload of the request.
// NOTE(review): despite "CPU" in the name, this handler returns GPU/XPU options
// and is the one wired to GET /monitor/gpuoptions in the host router. The name
// looks like a typo for a GPU-options handler; confirm before renaming, because
// the router references this identifier.
func (b *BaseApi) GetCPUOptions(c *gin.Context) {

Check warning on line 130 in agent/app/api/v2/monitor.go

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Remove the 'Get' prefix from this function name.

See more on https://sonarcloud.io/project/issues?id=1Panel-dev_1Panel&issues=AZrD4F3EujXon5Il7xLb&open=AZrD4F3EujXon5Il7xLb&pullRequest=11099
helper.SuccessWithData(c, monitorService.LoadGPUOptions())
}
8 changes: 5 additions & 3 deletions agent/app/dto/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,19 @@ type MonitorSettingUpdate struct {
Value string `json:"value"`
}

// MonitorGPUOptions is the payload describing which accelerator type was
// detected and which device labels can be selected for it.
type MonitorGPUOptions struct {
// GPUType is set to "gpu" or "xpu" by MonitorService.LoadGPUOptions and is
// left empty when neither client reports a device.
GPUType string `json:"gpuType"`
// Options lists the selectable device labels, built as "<index/id> - <name>".
Options []string `json:"options"`
}
// MonitorGPUSearch is the request used to query stored GPU monitor records.
type MonitorGPUSearch struct {
// ProductName is the device label used to filter records by product name.
ProductName string `json:"productName"`
// StartTime and EndTime bound the record creation time; the service converts
// both to the configured time zone before querying.
StartTime time.Time `json:"startTime"`
EndTime time.Time `json:"endTime"`
}
type MonitorGPUData struct {
GPUType string `json:"gpuType"`
ProductNames []string `json:"productNames"`
Date []time.Time `json:"date"`
GPUValue []float64 `json:"gpuValue"`
TemperatureValue []int `json:"temperatureValue"`
TemperatureValue []float64 `json:"temperatureValue"`
PowerValue []GPUPowerUsageHelper `json:"powerValue"`
MemoryValue []GPUMemoryUsageHelper `json:"memoryValue"`
SpeedValue []int `json:"speedValue"`
Expand Down
2 changes: 1 addition & 1 deletion agent/app/model/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ type MonitorGPU struct {
BaseModel
ProductName string `json:"productName"`
GPUUtil float64 `json:"gpuUtil"`
Temperature int `json:"temperature"`
Temperature float64 `json:"temperature"`
PowerDraw float64 `json:"powerDraw"`
MaxPowerLimit float64 `json:"maxPowerLimit"`
MemUsed float64 `json:"memUsed"`
Expand Down
95 changes: 50 additions & 45 deletions agent/app/service/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
type IMonitorService interface {
Run()
LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorData, error)
LoadGPUOptions() dto.MonitorGPUOptions
LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error)
LoadSetting() (*dto.MonitorSetting, error)
UpdateSetting(key, value string) error
Expand Down Expand Up @@ -118,42 +119,43 @@
return data, nil
}

func (m *MonitorService) LoadGPUOptions() dto.MonitorGPUOptions {
var data dto.MonitorGPUOptions
gpuExist, gpuClient := gpu.New()
xpuExist, xpuClient := xpu.New()
if !gpuExist && !xpuExist {
return data
}
if gpuExist {
data.GPUType = "gpu"
gpuInfo, err := gpuClient.LoadGpuInfo()
if err != nil || len(gpuInfo.GPUs) == 0 {
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
return data
}
sort.Slice(gpuInfo.GPUs, func(i, j int) bool {
return gpuInfo.GPUs[i].Index < gpuInfo.GPUs[j].Index
})
for _, item := range gpuInfo.GPUs {
data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Index, item.ProductName))

Check failure on line 140 in agent/app/service/monitor.go

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Define a constant instead of duplicating this literal "%d - %s" 3 times.

See more on https://sonarcloud.io/project/issues?id=1Panel-dev_1Panel&issues=AZrD4Fx7ujXon5Il7xLZ&open=AZrD4Fx7ujXon5Il7xLZ&pullRequest=11099
}
return data
} else {

Check failure on line 143 in agent/app/service/monitor.go

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Remove this 'else' clause; the code should continue after the error check.

See more on https://sonarcloud.io/project/issues?id=1Panel-dev_1Panel&issues=AZrD4Fx7ujXon5Il7xLa&open=AZrD4Fx7ujXon5Il7xLa&pullRequest=11099
data.GPUType = "xpu"
var err error
data.Options, err = xpuClient.LoadDeviceList()
if err != nil || len(data.Options) == 0 {
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
}
return data
}
}

func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) {
loc, _ := time.LoadLocation(common.LoadTimeZoneByCmd())
req.StartTime = req.StartTime.In(loc)
req.EndTime = req.EndTime.In(loc)

var data dto.MonitorGPUData
gpuExist, gpuclient := gpu.New()
xpuExist, xpuClient := xpu.New()
if !gpuExist && !xpuExist {
return data, nil
}
if len(req.ProductName) == 0 {
if gpuExist {
data.GPUType = "gpu"
gpuInfo, err := gpuclient.LoadGpuInfo()
if err != nil || len(gpuInfo.GPUs) == 0 {
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound")
}
req.ProductName = gpuInfo.GPUs[0].ProductName
for _, item := range gpuInfo.GPUs {
data.ProductNames = append(data.ProductNames, item.ProductName)
}
} else {
data.GPUType = "xpu"
xpuInfo, err := xpuClient.LoadGpuInfo()
if err != nil || len(xpuInfo.Xpu) == 0 {
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
return data, buserr.New("ErrRecordNotFound")
}
req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
for _, item := range xpuInfo.Xpu {
data.ProductNames = append(data.ProductNames, item.Basic.DeviceName)
}
}
}
gpuList, err := monitorRepo.GetGPU(repo.WithByCreatedAt(req.StartTime, req.EndTime), monitorRepo.WithByProductName(req.ProductName))
if err != nil {
return data, err
Expand Down Expand Up @@ -571,13 +573,14 @@
var list []model.MonitorGPU
for _, gpuItem := range gpuInfo.GPUs {
item := model.MonitorGPU{
ProductName: gpuItem.ProductName,
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
Temperature: loadGPUInfoInt(gpuItem.Temperature),
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
ProductName: fmt.Sprintf("%d - %s", gpuItem.Index, gpuItem.ProductName),
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
Temperature: loadGPUInfoFloat(gpuItem.Temperature),
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit),
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
}
process, _ := json.Marshal(gpuItem.Processes)
if len(process) != 0 {
Expand All @@ -602,13 +605,12 @@
var list []model.MonitorGPU
for _, xpuItem := range xpuInfo.Xpu {
item := model.MonitorGPU{
ProductName: xpuItem.Basic.DeviceName,
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
MaxPowerLimit: float64(xpuItem.Config.PowerLimit),
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
ProductName: fmt.Sprintf("%d - %s", xpuItem.Basic.DeviceID, xpuItem.Basic.DeviceName),
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
Temperature: loadGPUInfoFloat(xpuItem.Stats.Temperature),
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
}
if len(xpuItem.Processes) != 0 {
var processItem []dto.GPUProcess
Expand Down Expand Up @@ -643,6 +645,9 @@
func loadGPUInfoFloat(val string) float64 {
valItem := strings.ReplaceAll(val, "W", "")
valItem = strings.ReplaceAll(valItem, "MB", "")
valItem = strings.ReplaceAll(valItem, "MiB", "")
valItem = strings.ReplaceAll(valItem, "C", "")
valItem = strings.ReplaceAll(valItem, "°C", "")
valItem = strings.ReplaceAll(valItem, "%", "")
valItem = strings.TrimSpace(valItem)
data, _ := strconv.ParseFloat(valItem, 64)
Expand Down
2 changes: 1 addition & 1 deletion agent/init/migration/migrations/init.go
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{
}

var AddGPUMonitor = &gormigrate.Migration{
ID: "20251127-add-gpu-monitor",
ID: "20251122-add-gpu-monitor",
Migrate: func(tx *gorm.DB) error {
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
},
Expand Down
1 change: 1 addition & 0 deletions agent/router/ro_host.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ func (s *HostRouter) InitRouter(Router *gin.RouterGroup) {
hostRouter.POST("/monitor/search", baseApi.LoadMonitor)
hostRouter.POST("/monitor/gpu/search", baseApi.LoadGPUMonitor)
hostRouter.POST("/monitor/clean", baseApi.CleanMonitor)
hostRouter.GET("/monitor/gpuoptions", baseApi.GetCPUOptions)
hostRouter.GET("/monitor/netoptions", baseApi.GetNetworkOptions)
hostRouter.GET("/monitor/iooptions", baseApi.GetIOOptions)
hostRouter.GET("/monitor/setting", baseApi.LoadMonitorSetting)
Expand Down
17 changes: 17 additions & 0 deletions agent/utils/ai_tools/xpu/xpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,14 @@

func (x XpuSMI) LoadDashData() ([]XPUSimpleInfo, error) {
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")

Check failure on line 96 in agent/utils/ai_tools/xpu/xpu.go

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Define a constant instead of duplicating this literal "xpu-smi discovery -j" 3 times.

See more on https://sonarcloud.io/project/issues?id=1Panel-dev_1Panel&issues=AZrD4F3QujXon5Il7xLd&open=AZrD4F3QujXon5Il7xLd&pullRequest=11099
if err != nil {
return nil, fmt.Errorf("calling xpu-smi failed, %v", err)
}

var deviceInfo DeviceInfo
if err := json.Unmarshal([]byte(data), &deviceInfo); err != nil {
return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err)

Check failure on line 103 in agent/utils/ai_tools/xpu/xpu.go

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Define a constant instead of duplicating this literal "deviceInfo json unmarshal failed, err: %w" 3 times.

See more on https://sonarcloud.io/project/issues?id=1Panel-dev_1Panel&issues=AZrD4F3QujXon5Il7xLc&open=AZrD4F3QujXon5Il7xLc&pullRequest=11099
}

var res []XPUSimpleInfo
Expand All @@ -120,6 +120,23 @@
return res, nil
}

// LoadDeviceList runs "xpu-smi discovery -j" with a 5-second timeout and
// returns one "<device id> - <device name>" label per entry in the decoded
// device list.
//
// An error is returned when the command fails or when its output cannot be
// decoded as JSON. Both errors wrap the underlying cause, so callers can
// inspect it with errors.Is / errors.As. When no device is listed the returned
// slice is nil.
func (x XpuSMI) LoadDeviceList() ([]string, error) {
	cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
	output, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")
	if err != nil {
		// %w (instead of %v) keeps the command error in the chain, consistent
		// with the unmarshal error below; the rendered message is unchanged.
		return nil, fmt.Errorf("calling xpu-smi failed, %w", err)
	}

	var deviceInfo DeviceInfo
	if err := json.Unmarshal([]byte(output), &deviceInfo); err != nil {
		return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err)
	}

	// Deliberately left nil when there are no devices, as before.
	var deviceNames []string
	for _, device := range deviceInfo.DeviceList {
		deviceNames = append(deviceNames, fmt.Sprintf("%d - %s", device.DeviceID, device.DeviceName))
	}
	return deviceNames, nil
}

func (x XpuSMI) LoadGpuInfo() (*XpuInfo, error) {
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")
Expand Down
6 changes: 0 additions & 6 deletions agent/utils/ai_tools/xpu/xpu_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ type XpuInfo struct {
// Xpu groups the JSON sections decoded for a single entry of XpuInfo.Xpu:
// basic identity, runtime statistics, configuration and running processes.
type Xpu struct {
// Basic carries identity fields such as the device name and memory size.
Basic Basic `json:"basic"`
// Stats carries the sampled readings (power, temperature, memory usage, ...).
Stats Stats `json:"stats"`
// Config carries the device configuration, e.g. the power limit.
Config Config `json:"config"`
// Processes lists the processes reported for the device.
Processes []Process `json:"processes"`
}

Expand All @@ -24,11 +23,6 @@ type Basic struct {
PciBdfAddress string `json:"pciBdfAddress"`
}

// Config holds the power-related configuration decoded for an XPU device.
type Config struct {
// PowerLimit is the configured power limit (unit not visible here; presumably watts, TODO confirm).
PowerLimit int `json:"power_limit"`
// NOTE(review): the JSON key is spelled "power_vaild_range" ("vaild"); confirm
// it matches the key in the decoded input before correcting the spelling.
PowerValidRange string `json:"power_vaild_range"`
}

type Stats struct {
Power string `json:"power"`
Frequency string `json:"frequency"`
Expand Down
6 changes: 4 additions & 2 deletions frontend/src/api/interface/host.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,11 @@ export namespace Host {
startTime: Date;
endTime: Date;
}
export interface MonitorGPUData {
export interface MonitorGPUOptions {
gpuType: string;
productNames: Array<string>;
options: Array<string>;
}
export interface MonitorGPUData {
date: Array<Date>;
gpuValue: Array<number>;
temperatureValue: Array<number>;
Expand Down
3 changes: 3 additions & 0 deletions frontend/src/api/modules/host.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ export const operateFilterChain = (name: string, op: string) => {
/** POSTs the search parameters to /hosts/monitor/search; the response is typed as an array of Host.MonitorData. */
export const loadMonitor = (param: Host.MonitorSearch) =>
    http.post<Array<Host.MonitorData>>(`/hosts/monitor/search`, param);
/** GETs /hosts/monitor/gpuoptions; the response is typed as Host.MonitorGPUOptions. */
export const getGPUOptions = () => http.get<Host.MonitorGPUOptions>(`/hosts/monitor/gpuoptions`);
/** POSTs the GPU search parameters to /hosts/monitor/gpu/search; the response is typed as Host.MonitorGPUData. */
export const loadGPUMonitor = (param: Host.MonitorGPUSearch) =>
    http.post<Host.MonitorGPUData>(`/hosts/monitor/gpu/search`, param);
Expand Down
44 changes: 28 additions & 16 deletions frontend/src/views/ai/gpu/index.vue
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,16 @@
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.gpuUtil') }}</span>
<span class="title">{{ $t('monitor.memoryUsage') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadGPUChart"
id="loadMemoryChart"
type="line"
:option="chartsOption['loadGPUChart']"
v-if="chartsOption['loadGPUChart']"
:option="chartsOption['loadMemoryChart']"
v-if="chartsOption['loadMemoryChart']"
:dataZoom="true"
/>
</div>
Expand All @@ -54,16 +54,16 @@
<el-card class="card-interval" style="overflow: inherit">
<template #header>
<div :class="mobile ? 'flx-wrap' : 'flex justify-between'">
<span class="title">{{ $t('monitor.memoryUsage') }}</span>
<span class="title">{{ $t('monitor.gpuUtil') }}</span>
</div>
</template>
<div class="chart">
<v-charts
height="400px"
id="loadMemoryChart"
id="loadGPUChart"
type="line"
:option="chartsOption['loadMemoryChart']"
v-if="chartsOption['loadMemoryChart']"
:option="chartsOption['loadGPUChart']"
v-if="chartsOption['loadGPUChart']"
:dataZoom="true"
/>
</div>
Expand Down Expand Up @@ -148,7 +148,7 @@

<script setup lang="ts">
import { ref, reactive, onMounted, computed } from 'vue';
import { loadGPUMonitor } from '@/api/modules/host';
import { loadGPUMonitor, getGPUOptions } from '@/api/modules/host';
import { dateFormatWithoutYear } from '@/utils/util';
import { GlobalStore } from '@/store';
import { shortcuts } from '@/utils/shortcuts';
Expand Down Expand Up @@ -183,6 +183,21 @@ const searchInfo = reactive<Host.MonitorGPUSearch>({
endTime: new Date(),
});

// Fetches the accelerator type and its device options, preselects the first
// option ('' when there is none) and then starts the initial search.
// On failure the loading flag is cleared here and the option list is emptied;
// on success the loading flag is left for search() to clear.
const loadOptions = async () => {
    loading.value = true;
    try {
        const res = await getGPUOptions();
        gpuType.value = res.data.gpuType || 'gpu';
        options.value = res.data.options || [];
        if (options.value.length > 0) {
            searchInfo.productName = options.value[0];
        } else {
            searchInfo.productName = '';
        }
        // Intentionally not awaited, matching the fire-and-forget call it replaces.
        search();
    } catch {
        loading.value = false;
        options.value = [];
    }
};

const search = async () => {
if (searchTime.value && searchTime.value.length === 2) {
searchInfo.startTime = searchTime.value[0];
Expand All @@ -192,9 +207,6 @@ const search = async () => {
await loadGPUMonitor(searchInfo)
.then((res) => {
loading.value = false;
options.value = res.data.productNames || [];
gpuType.value = res.data.gpuType || 'gpu';
searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
let date = baseDate.map(function (item: any) {
return dateFormatWithoutYear(item);
Expand Down Expand Up @@ -270,7 +282,7 @@ function initPowerCharts(baseDate: any, items: any) {
formatter: function (list: any) {
let res = loadDate(list[0].name);
for (const item of list) {
res += loadSeries(item, item.data.value ? item.data.value : item.data, '%');
res += loadSeries(item, item.data.value, '%');
res += `( ${item.data?.data.used} W / ${item.data?.data.total} W)<br/>`;
}
return res;
Expand Down Expand Up @@ -298,7 +310,7 @@ function initXpuPowerCharts(baseDate: any, items: any) {
formatter: function (list: any) {
let res = loadDate(list[0].name);
for (const item of list) {
res += loadSeries(item, item.data.value ? item.data.value : item.data, 'W');
res += loadSeries(item, item.data.value, 'W');
}
return res;
},
Expand Down Expand Up @@ -364,7 +376,7 @@ function withMemoryProcess(list: any) {
if (item.data?.data) {
process = item.data?.data.gpuProcesses || [];
}
res += loadSeries(item, item.data.value ? item.data.value : item.data, '%');
res += loadSeries(item, item.data.value, '%');
res += `( ${item.data?.data.used} MiB / ${item.data?.data.total} MiB)<br/>`;
}
if (!process) {
Expand Down Expand Up @@ -425,7 +437,7 @@ const loadProcessType = (val: string) => {
};

// On mount, load the device options first; loadOptions() triggers the initial
// search itself once a device has been selected.
onMounted(() => {
loadOptions();
});
});
</script>

Expand Down
Loading
Loading