Skip to content

Commit 3d20238

Browse files
authored
feat: Compatible with XPU monitoring (#11099)
1 parent 74b48a1 commit 3d20238

File tree

13 files changed

+126
-86
lines changed

13 files changed

+126
-86
lines changed

agent/app/api/v2/monitor.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,3 +126,7 @@ func (b *BaseApi) GetIOOptions(c *gin.Context) {
126126
sort.Strings(options)
127127
helper.SuccessWithData(c, options)
128128
}
129+
130+
func (b *BaseApi) GetCPUOptions(c *gin.Context) {
131+
helper.SuccessWithData(c, monitorService.LoadGPUOptions())
132+
}

agent/app/dto/monitor.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,19 @@ type MonitorSettingUpdate struct {
3838
Value string `json:"value"`
3939
}
4040

41+
// MonitorGPUOptions is the payload returned by LoadGPUOptions: which
// accelerator family was detected and the selectable device labels for it.
type MonitorGPUOptions struct {
42+
// GPUType is set to "gpu" or "xpu" by LoadGPUOptions; it stays empty when
// neither client reports a device.
GPUType string `json:"gpuType"`
43+
// Options holds one label per device, formatted as "<index> - <name>".
Options []string `json:"options"`
44+
}
4145
// MonitorGPUSearch is the request consumed by LoadGPUMonitorData to select
// stored GPU samples.
type MonitorGPUSearch struct {
4246
// ProductName is passed to the repository's product-name filter.
ProductName string `json:"productName"`
4347
// StartTime and EndTime bound the creation-time filter; LoadGPUMonitorData
// converts both to the host's configured time zone before querying.
StartTime time.Time `json:"startTime"`
4448
EndTime time.Time `json:"endTime"`
4549
}
4650
type MonitorGPUData struct {
47-
GPUType string `json:"gpuType"`
48-
ProductNames []string `json:"productNames"`
4951
Date []time.Time `json:"date"`
5052
GPUValue []float64 `json:"gpuValue"`
51-
TemperatureValue []int `json:"temperatureValue"`
53+
TemperatureValue []float64 `json:"temperatureValue"`
5254
PowerValue []GPUPowerUsageHelper `json:"powerValue"`
5355
MemoryValue []GPUMemoryUsageHelper `json:"memoryValue"`
5456
SpeedValue []int `json:"speedValue"`

agent/app/model/monitor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ type MonitorGPU struct {
3636
BaseModel
3737
ProductName string `json:"productName"`
3838
GPUUtil float64 `json:"gpuUtil"`
39-
Temperature int `json:"temperature"`
39+
Temperature float64 `json:"temperature"`
4040
PowerDraw float64 `json:"powerDraw"`
4141
MaxPowerLimit float64 `json:"maxPowerLimit"`
4242
MemUsed float64 `json:"memUsed"`

agent/app/service/monitor.go

Lines changed: 50 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ var monitorCancel context.CancelFunc
3939
type IMonitorService interface {
4040
Run()
4141
LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorData, error)
42+
LoadGPUOptions() dto.MonitorGPUOptions
4243
LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error)
4344
LoadSetting() (*dto.MonitorSetting, error)
4445
UpdateSetting(key, value string) error
@@ -118,42 +119,43 @@ func (m *MonitorService) LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorDa
118119
return data, nil
119120
}
120121

122+
func (m *MonitorService) LoadGPUOptions() dto.MonitorGPUOptions {
123+
var data dto.MonitorGPUOptions
124+
gpuExist, gpuClient := gpu.New()
125+
xpuExist, xpuClient := xpu.New()
126+
if !gpuExist && !xpuExist {
127+
return data
128+
}
129+
if gpuExist {
130+
data.GPUType = "gpu"
131+
gpuInfo, err := gpuClient.LoadGpuInfo()
132+
if err != nil || len(gpuInfo.GPUs) == 0 {
133+
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
134+
return data
135+
}
136+
sort.Slice(gpuInfo.GPUs, func(i, j int) bool {
137+
return gpuInfo.GPUs[i].Index < gpuInfo.GPUs[j].Index
138+
})
139+
for _, item := range gpuInfo.GPUs {
140+
data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Index, item.ProductName))
141+
}
142+
return data
143+
} else {
144+
data.GPUType = "xpu"
145+
var err error
146+
data.Options, err = xpuClient.LoadDeviceList()
147+
if err != nil || len(data.Options) == 0 {
148+
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
149+
}
150+
return data
151+
}
152+
}
153+
121154
func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) {
122155
loc, _ := time.LoadLocation(common.LoadTimeZoneByCmd())
123156
req.StartTime = req.StartTime.In(loc)
124157
req.EndTime = req.EndTime.In(loc)
125-
126158
var data dto.MonitorGPUData
127-
gpuExist, gpuclient := gpu.New()
128-
xpuExist, xpuClient := xpu.New()
129-
if !gpuExist && !xpuExist {
130-
return data, nil
131-
}
132-
if len(req.ProductName) == 0 {
133-
if gpuExist {
134-
data.GPUType = "gpu"
135-
gpuInfo, err := gpuclient.LoadGpuInfo()
136-
if err != nil || len(gpuInfo.GPUs) == 0 {
137-
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
138-
return data, buserr.New("ErrRecordNotFound")
139-
}
140-
req.ProductName = gpuInfo.GPUs[0].ProductName
141-
for _, item := range gpuInfo.GPUs {
142-
data.ProductNames = append(data.ProductNames, item.ProductName)
143-
}
144-
} else {
145-
data.GPUType = "xpu"
146-
xpuInfo, err := xpuClient.LoadGpuInfo()
147-
if err != nil || len(xpuInfo.Xpu) == 0 {
148-
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
149-
return data, buserr.New("ErrRecordNotFound")
150-
}
151-
req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
152-
for _, item := range xpuInfo.Xpu {
153-
data.ProductNames = append(data.ProductNames, item.Basic.DeviceName)
154-
}
155-
}
156-
}
157159
gpuList, err := monitorRepo.GetGPU(repo.WithByCreatedAt(req.StartTime, req.EndTime), monitorRepo.WithByProductName(req.ProductName))
158160
if err != nil {
159161
return data, err
@@ -571,13 +573,14 @@ func saveGPUDataToDB() {
571573
var list []model.MonitorGPU
572574
for _, gpuItem := range gpuInfo.GPUs {
573575
item := model.MonitorGPU{
574-
ProductName: gpuItem.ProductName,
575-
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
576-
Temperature: loadGPUInfoInt(gpuItem.Temperature),
577-
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
578-
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
579-
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
580-
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
576+
ProductName: fmt.Sprintf("%d - %s", gpuItem.Index, gpuItem.ProductName),
577+
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
578+
Temperature: loadGPUInfoFloat(gpuItem.Temperature),
579+
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
580+
MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit),
581+
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
582+
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
583+
FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed),
581584
}
582585
process, _ := json.Marshal(gpuItem.Processes)
583586
if len(process) != 0 {
@@ -602,13 +605,12 @@ func saveXPUDataToDB() {
602605
var list []model.MonitorGPU
603606
for _, xpuItem := range xpuInfo.Xpu {
604607
item := model.MonitorGPU{
605-
ProductName: xpuItem.Basic.DeviceName,
606-
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
607-
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
608-
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
609-
MaxPowerLimit: float64(xpuItem.Config.PowerLimit),
610-
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
611-
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
608+
ProductName: fmt.Sprintf("%d - %s", xpuItem.Basic.DeviceID, xpuItem.Basic.DeviceName),
609+
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
610+
Temperature: loadGPUInfoFloat(xpuItem.Stats.Temperature),
611+
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
612+
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
613+
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
612614
}
613615
if len(xpuItem.Processes) != 0 {
614616
var processItem []dto.GPUProcess
@@ -643,6 +645,9 @@ func loadGPUInfoInt(val string) int {
643645
func loadGPUInfoFloat(val string) float64 {
644646
valItem := strings.ReplaceAll(val, "W", "")
645647
valItem = strings.ReplaceAll(valItem, "MB", "")
648+
valItem = strings.ReplaceAll(valItem, "MiB", "")
649+
valItem = strings.ReplaceAll(valItem, "C", "")
650+
valItem = strings.ReplaceAll(valItem, "°C", "")
646651
valItem = strings.ReplaceAll(valItem, "%", "")
647652
valItem = strings.TrimSpace(valItem)
648653
data, _ := strconv.ParseFloat(valItem, 64)

agent/init/migration/migrations/init.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{
721721
}
722722

723723
var AddGPUMonitor = &gormigrate.Migration{
724-
ID: "20251127-add-gpu-monitor",
724+
ID: "20251122-add-gpu-monitor",
725725
Migrate: func(tx *gorm.DB) error {
726726
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
727727
},

agent/router/ro_host.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ func (s *HostRouter) InitRouter(Router *gin.RouterGroup) {
3131
hostRouter.POST("/monitor/search", baseApi.LoadMonitor)
3232
hostRouter.POST("/monitor/gpu/search", baseApi.LoadGPUMonitor)
3333
hostRouter.POST("/monitor/clean", baseApi.CleanMonitor)
34+
hostRouter.GET("/monitor/gpuoptions", baseApi.GetCPUOptions)
3435
hostRouter.GET("/monitor/netoptions", baseApi.GetNetworkOptions)
3536
hostRouter.GET("/monitor/iooptions", baseApi.GetIOOptions)
3637
hostRouter.GET("/monitor/setting", baseApi.LoadMonitorSetting)

agent/utils/ai_tools/xpu/xpu.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,23 @@ func (x XpuSMI) LoadDashData() ([]XPUSimpleInfo, error) {
120120
return res, nil
121121
}
122122

123+
func (x XpuSMI) LoadDeviceList() ([]string, error) {
124+
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
125+
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")
126+
if err != nil {
127+
return nil, fmt.Errorf("calling xpu-smi failed, %v", err)
128+
}
129+
var deviceInfo DeviceInfo
130+
if err := json.Unmarshal([]byte(data), &deviceInfo); err != nil {
131+
return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err)
132+
}
133+
var deviceNames []string
134+
for _, device := range deviceInfo.DeviceList {
135+
deviceNames = append(deviceNames, fmt.Sprintf("%d - %s", device.DeviceID, device.DeviceName))
136+
}
137+
return deviceNames, nil
138+
}
139+
123140
func (x XpuSMI) LoadGpuInfo() (*XpuInfo, error) {
124141
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
125142
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")

agent/utils/ai_tools/xpu/xpu_info.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ type XpuInfo struct {
1010
type Xpu struct {
1111
Basic Basic `json:"basic"`
1212
Stats Stats `json:"stats"`
13-
Config Config `json:"config"`
1413
Processes []Process `json:"processes"`
1514
}
1615

@@ -24,11 +23,6 @@ type Basic struct {
2423
PciBdfAddress string `json:"pciBdfAddress"`
2524
}
2625

27-
type Config struct {
28-
PowerLimit int `json:"power_limit"`
29-
PowerValidRange string `json:"power_vaild_range"`
30-
}
31-
3226
type Stats struct {
3327
Power string `json:"power"`
3428
Frequency string `json:"frequency"`

frontend/src/api/interface/host.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,11 @@ export namespace Host {
166166
startTime: Date;
167167
endTime: Date;
168168
}
169-
export interface MonitorGPUData {
169+
export interface MonitorGPUOptions {
170170
gpuType: string;
171-
productNames: Array<string>;
171+
options: Array<string>;
172+
}
173+
export interface MonitorGPUData {
172174
date: Array<Date>;
173175
gpuValue: Array<number>;
174176
temperatureValue: Array<number>;

frontend/src/api/modules/host.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ export const operateFilterChain = (name: string, op: string) => {
6565
/**
 * POSTs the search parameters to `/hosts/monitor/search` and resolves with
 * the matching monitor data series.
 */
export const loadMonitor = (param: Host.MonitorSearch) =>
    http.post<Array<Host.MonitorData>>(`/hosts/monitor/search`, param);
68+
/**
 * GETs `/hosts/monitor/gpuoptions` and resolves with the accelerator type
 * and the list of selectable device options.
 */
export const getGPUOptions = () => http.get<Host.MonitorGPUOptions>(`/hosts/monitor/gpuoptions`);
6871
/**
 * POSTs the GPU search parameters to `/hosts/monitor/gpu/search` and resolves
 * with the GPU monitor data for the requested device and time range.
 */
export const loadGPUMonitor = (param: Host.MonitorGPUSearch) =>
    http.post<Host.MonitorGPUData>(`/hosts/monitor/gpu/search`, param);

0 commit comments

Comments
 (0)