Skip to content

Commit d7891bc

Browse files
committed
perf: Optimize GPU monitoring style
1 parent 3d3d22d commit d7891bc

File tree

3 files changed: 47 additions and 22 deletions.

agent/app/dto/monitor.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,18 @@ type MonitorSettingUpdate struct {
3939
}
4040

4141
type MonitorGPUOptions struct {
42-
GPUType string `json:"gpuType"`
43-
Options []string `json:"options"`
42+
GPUType string `json:"gpuType"`
43+
ChartHide []GPUChartHide `json:"chartHide"`
44+
Options []string `json:"options"`
45+
}
46+
type GPUChartHide struct {
47+
ProductName string `json:"productName"`
48+
Process bool `json:"process"`
49+
GPU bool `json:"gpu"`
50+
Memory bool `json:"memory"`
51+
Power bool `json:"power"`
52+
Temperature bool `json:"temperature"`
53+
Speed bool `json:"speed"`
4454
}
4555
type MonitorGPUSearch struct {
4656
ProductName string `json:"productName"`
@@ -59,6 +69,7 @@ type MonitorGPUData struct {
5969
MemoryPercent []float64 `json:"memoryPercent"`
6070
SpeedValue []int `json:"speedValue"`
6171

72+
ProcessCount []int `json:"processCount"`
6273
GPUProcesses [][]GPUProcess `json:"gpuProcesses"`
6374
}
6475

agent/app/service/monitor.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,16 +137,45 @@ func (m *MonitorService) LoadGPUOptions() dto.MonitorGPUOptions {
137137
return gpuInfo.GPUs[i].Index < gpuInfo.GPUs[j].Index
138138
})
139139
for _, item := range gpuInfo.GPUs {
140+
var chartHide dto.GPUChartHide
141+
chartHide.ProductName = fmt.Sprintf("%d - %s", item.Index, item.ProductName)
142+
chartHide.GPU = item.GPUUtil == "" || item.GPUUtil == "N/A"
143+
if (item.MemTotal == "" || item.MemTotal == "N/A") && (item.MemUsed == "" || item.MemUsed == "N/A") {
144+
chartHide.Memory = true
145+
}
146+
if (item.MaxPowerLimit == "" || item.MaxPowerLimit == "N/A") && (item.PowerDraw == "" || item.PowerDraw == "N/A") {
147+
chartHide.Power = true
148+
}
149+
chartHide.Temperature = item.Temperature == "" || item.Temperature == "N/A"
150+
chartHide.Speed = item.FanSpeed == "" || item.FanSpeed == "N/A"
151+
data.ChartHide = append(data.ChartHide, chartHide)
140152
data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Index, item.ProductName))
141153
}
142154
return data
143155
} else {
144156
data.GPUType = "xpu"
145-
var err error
146-
data.Options, err = xpuClient.LoadDeviceList()
147-
if err != nil || len(data.Options) == 0 {
157+
xpu, err := xpuClient.LoadGpuInfo()
158+
if err != nil || len(xpu.Xpu) == 0 {
148159
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
149160
}
161+
sort.Slice(xpu.Xpu, func(i, j int) bool {
162+
return xpu.Xpu[i].Basic.DeviceID < xpu.Xpu[j].Basic.DeviceID
163+
})
164+
for _, item := range xpu.Xpu {
165+
var chartHide dto.GPUChartHide
166+
chartHide.GPU = true
167+
chartHide.Speed = true
168+
chartHide.ProductName = fmt.Sprintf("%d - %s", item.Basic.DeviceID, item.Basic.DeviceName)
169+
if (item.Stats.MemoryUsed == "" || item.Stats.MemoryUsed == "N/A") && (item.Basic.Memory == "" || item.Basic.FreeMemory == "N/A") {
170+
chartHide.Memory = true
171+
}
172+
if item.Stats.Power == "" || item.Stats.Power == "N/A" {
173+
chartHide.Power = true
174+
}
175+
chartHide.Temperature = item.Stats.Temperature == "" || item.Stats.Temperature == "N/A"
176+
data.ChartHide = append(data.ChartHide, chartHide)
177+
data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Basic.DeviceID, item.Basic.DeviceName))
178+
}
150179
return data
151180
}
152181
}
@@ -182,8 +211,10 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
182211
}
183212
var process []dto.GPUProcess
184213
if err := json.Unmarshal([]byte(gpu.Processes), &process); err == nil {
214+
data.ProcessCount = append(data.ProcessCount, len(process))
185215
data.GPUProcesses = append(data.GPUProcesses, process)
186216
} else {
217+
data.ProcessCount = append(data.ProcessCount, 0)
187218
data.GPUProcesses = append(data.GPUProcesses, []dto.GPUProcess{})
188219
}
189220
data.SpeedValue = append(data.SpeedValue, gpu.FanSpeed)

agent/utils/ai_tools/xpu/xpu.go

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -120,23 +120,6 @@ func (x XpuSMI) LoadDashData() ([]XPUSimpleInfo, error) {
120120
return res, nil
121121
}
122122

123-
func (x XpuSMI) LoadDeviceList() ([]string, error) {
124-
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
125-
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")
126-
if err != nil {
127-
return nil, fmt.Errorf("calling xpu-smi failed, %v", err)
128-
}
129-
var deviceInfo DeviceInfo
130-
if err := json.Unmarshal([]byte(data), &deviceInfo); err != nil {
131-
return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err)
132-
}
133-
var deviceNames []string
134-
for _, device := range deviceInfo.DeviceList {
135-
deviceNames = append(deviceNames, fmt.Sprintf("%d - %s", device.DeviceID, device.DeviceName))
136-
}
137-
return deviceNames, nil
138-
}
139-
140123
func (x XpuSMI) LoadGpuInfo() (*XpuInfo, error) {
141124
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
142125
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")

0 commit comments

Comments
 (0)