Skip to content

Commit 2930943

Browse files
committed
feat: Compatible with XPU monitoring
1 parent 747a0c4 commit 2930943

File tree

11 files changed

+89
-65
lines changed

11 files changed

+89
-65
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ build_core_on_darwin:
4242

4343
build_agent_on_darwin:
4444
cd $(AGENT_PATH) \
45-
&& CGO_ENABLED=0 GOOS=linux GOARCH=amd64 $(GOBUILD) -trimpath -ldflags '-s -w' -o $(BUILD_PATH)/$(AGENT_NAME) $(AGENT_MAIN)
45+
&& CGO_ENABLED=0 GOOS=linux GOARCH=amd64 $(GOBUILD) -tags=xpack -trimpath -ldflags '-s -w' -o $(BUILD_PATH)/$(AGENT_NAME) $(AGENT_MAIN)
4646

4747
build_all: build_frontend build_core_on_linux build_agent_on_linux
4848

agent/app/api/v2/monitor.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,3 +126,7 @@ func (b *BaseApi) GetIOOptions(c *gin.Context) {
126126
sort.Strings(options)
127127
helper.SuccessWithData(c, options)
128128
}
129+
130+
// GetCPUOptions responds with the result of monitorService.LoadGPUOptions,
// i.e. the detected accelerator type and its selectable device labels.
// NOTE(review): the name says "CPU" but the handler serves GPU/XPU options and
// is registered on the /monitor/gpuoptions route; this looks like a typo for
// GetGPUOptions. Rename it together with the router registration.
func (b *BaseApi) GetCPUOptions(c *gin.Context) {
131+
helper.SuccessWithData(c, monitorService.LoadGPUOptions())
132+
}

agent/app/dto/monitor.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,17 +38,20 @@ type MonitorSettingUpdate struct {
3838
Value string `json:"value"`
3939
}
4040

41+
// MonitorGPUOptions is the payload returned by MonitorService.LoadGPUOptions:
// the accelerator family that was detected plus one label per device.
type MonitorGPUOptions struct {
42+
// GPUType is set to "gpu" or "xpu" by LoadGPUOptions; it stays empty when
// neither kind of device is found.
GPUType string `json:"gpuType"`
43+
// Options holds one "<index> - <name>" label per detected device.
Options []string `json:"options"`
44+
}
4145
type MonitorGPUSearch struct {
4246
ProductName string `json:"productName"`
4347
StartTime time.Time `json:"startTime"`
4448
EndTime time.Time `json:"endTime"`
4549
}
4650
type MonitorGPUData struct {
47-
GPUType string `json:"gpuType"`
4851
ProductNames []string `json:"productNames"`
4952
Date []time.Time `json:"date"`
5053
GPUValue []float64 `json:"gpuValue"`
51-
TemperatureValue []int `json:"temperatureValue"`
54+
TemperatureValue []float64 `json:"temperatureValue"`
5255
PowerValue []GPUPowerUsageHelper `json:"powerValue"`
5356
MemoryValue []GPUMemoryUsageHelper `json:"memoryValue"`
5457
SpeedValue []int `json:"speedValue"`

agent/app/model/monitor.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ type MonitorGPU struct {
3636
BaseModel
3737
ProductName string `json:"productName"`
3838
GPUUtil float64 `json:"gpuUtil"`
39-
Temperature int `json:"temperature"`
39+
Temperature float64 `json:"temperature"`
4040
PowerDraw float64 `json:"powerDraw"`
4141
MaxPowerLimit float64 `json:"maxPowerLimit"`
4242
MemUsed float64 `json:"memUsed"`

agent/app/service/monitor.go

Lines changed: 48 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ var monitorCancel context.CancelFunc
3939
type IMonitorService interface {
4040
Run()
4141
LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorData, error)
42+
LoadGPUOptions() dto.MonitorGPUOptions
4243
LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error)
4344
LoadSetting() (*dto.MonitorSetting, error)
4445
UpdateSetting(key, value string) error
@@ -118,42 +119,49 @@ func (m *MonitorService) LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorDa
118119
return data, nil
119120
}
120121

122+
func (m *MonitorService) LoadGPUOptions() dto.MonitorGPUOptions {
123+
var data dto.MonitorGPUOptions
124+
gpuExist, gpuClient := gpu.New()
125+
xpuExist, xpuClient := xpu.New()
126+
if !gpuExist && !xpuExist {
127+
return data
128+
}
129+
if gpuExist {
130+
data.GPUType = "gpu"
131+
gpuInfo, err := gpuClient.LoadGpuInfo()
132+
if err != nil || len(gpuInfo.GPUs) == 0 {
133+
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
134+
return data
135+
}
136+
sort.Slice(gpuInfo.GPUs, func(i, j int) bool {
137+
return gpuInfo.GPUs[i].Index < gpuInfo.GPUs[j].Index
138+
})
139+
for _, item := range gpuInfo.GPUs {
140+
data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Index, item.ProductName))
141+
}
142+
return data
143+
} else {
144+
data.GPUType = "xpu"
145+
xpuInfo, err := xpuClient.LoadGpuInfo()
146+
if err != nil || len(xpuInfo.Xpu) == 0 {
147+
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
148+
return data
149+
}
150+
sort.Slice(xpuInfo.Xpu, func(i, j int) bool {
151+
return xpuInfo.Xpu[i].Basic.DeviceID < xpuInfo.Xpu[j].Basic.DeviceID
152+
})
153+
for _, item := range xpuInfo.Xpu {
154+
data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Basic.DeviceID, item.Basic.DeviceName))
155+
}
156+
return data
157+
}
158+
}
159+
121160
func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) {
122161
loc, _ := time.LoadLocation(common.LoadTimeZoneByCmd())
123162
req.StartTime = req.StartTime.In(loc)
124163
req.EndTime = req.EndTime.In(loc)
125-
126164
var data dto.MonitorGPUData
127-
gpuExist, gpuclient := gpu.New()
128-
xpuExist, xpuClient := xpu.New()
129-
if !gpuExist && !xpuExist {
130-
return data, nil
131-
}
132-
if len(req.ProductName) == 0 {
133-
if gpuExist {
134-
data.GPUType = "gpu"
135-
gpuInfo, err := gpuclient.LoadGpuInfo()
136-
if err != nil || len(gpuInfo.GPUs) == 0 {
137-
global.LOG.Error("Load GPU info failed or no GPU found, err: ", err)
138-
return data, buserr.New("ErrRecordNotFound")
139-
}
140-
req.ProductName = gpuInfo.GPUs[0].ProductName
141-
for _, item := range gpuInfo.GPUs {
142-
data.ProductNames = append(data.ProductNames, item.ProductName)
143-
}
144-
} else {
145-
data.GPUType = "xpu"
146-
xpuInfo, err := xpuClient.LoadGpuInfo()
147-
if err != nil || len(xpuInfo.Xpu) == 0 {
148-
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
149-
return data, buserr.New("ErrRecordNotFound")
150-
}
151-
req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName
152-
for _, item := range xpuInfo.Xpu {
153-
data.ProductNames = append(data.ProductNames, item.Basic.DeviceName)
154-
}
155-
}
156-
}
157165
gpuList, err := monitorRepo.GetGPU(repo.WithByCreatedAt(req.StartTime, req.EndTime), monitorRepo.WithByProductName(req.ProductName))
158166
if err != nil {
159167
return data, err
@@ -571,9 +579,9 @@ func saveGPUDataToDB() {
571579
var list []model.MonitorGPU
572580
for _, gpuItem := range gpuInfo.GPUs {
573581
item := model.MonitorGPU{
574-
ProductName: gpuItem.ProductName,
582+
ProductName: fmt.Sprintf("%d - %s", gpuItem.Index, gpuItem.ProductName),
575583
GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil),
576-
Temperature: loadGPUInfoInt(gpuItem.Temperature),
584+
Temperature: loadGPUInfoFloat(gpuItem.Temperature),
577585
PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw),
578586
MemUsed: loadGPUInfoFloat(gpuItem.MemUsed),
579587
MemTotal: loadGPUInfoFloat(gpuItem.MemTotal),
@@ -602,13 +610,12 @@ func saveXPUDataToDB() {
602610
var list []model.MonitorGPU
603611
for _, xpuItem := range xpuInfo.Xpu {
604612
item := model.MonitorGPU{
605-
ProductName: xpuItem.Basic.DeviceName,
606-
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
607-
Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature),
608-
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
609-
MaxPowerLimit: float64(xpuItem.Config.PowerLimit),
610-
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
611-
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
613+
ProductName: fmt.Sprintf("%d - %s", xpuItem.Basic.DeviceID, xpuItem.Basic.DeviceName),
614+
GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil),
615+
Temperature: loadGPUInfoFloat(xpuItem.Stats.Temperature),
616+
PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power),
617+
MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed),
618+
MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory),
612619
}
613620
if len(xpuItem.Processes) != 0 {
614621
var processItem []dto.GPUProcess
@@ -643,6 +650,7 @@ func loadGPUInfoInt(val string) int {
643650
func loadGPUInfoFloat(val string) float64 {
644651
valItem := strings.ReplaceAll(val, "W", "")
645652
valItem = strings.ReplaceAll(valItem, "MB", "")
653+
valItem = strings.ReplaceAll(valItem, "°C", "")
646654
valItem = strings.ReplaceAll(valItem, "%", "")
647655
valItem = strings.TrimSpace(valItem)
648656
data, _ := strconv.ParseFloat(valItem, 64)

agent/init/migration/migrations/init.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{
721721
}
722722

723723
var AddGPUMonitor = &gormigrate.Migration{
724-
ID: "20251127-add-gpu-monitor",
724+
ID: "20251122-add-gpu-monitor",
725725
Migrate: func(tx *gorm.DB) error {
726726
return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{})
727727
},

agent/router/ro_host.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ func (s *HostRouter) InitRouter(Router *gin.RouterGroup) {
3131
hostRouter.POST("/monitor/search", baseApi.LoadMonitor)
3232
hostRouter.POST("/monitor/gpu/search", baseApi.LoadGPUMonitor)
3333
hostRouter.POST("/monitor/clean", baseApi.CleanMonitor)
34+
hostRouter.GET("/monitor/gpuoptions", baseApi.GetCPUOptions)
3435
hostRouter.GET("/monitor/netoptions", baseApi.GetNetworkOptions)
3536
hostRouter.GET("/monitor/iooptions", baseApi.GetIOOptions)
3637
hostRouter.GET("/monitor/setting", baseApi.LoadMonitorSetting)

agent/utils/ai_tools/xpu/xpu_info.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ type XpuInfo struct {
1010
type Xpu struct {
1111
Basic Basic `json:"basic"`
1212
Stats Stats `json:"stats"`
13-
Config Config `json:"config"`
1413
Processes []Process `json:"processes"`
1514
}
1615

@@ -24,11 +23,6 @@ type Basic struct {
2423
PciBdfAddress string `json:"pciBdfAddress"`
2524
}
2625

27-
type Config struct {
28-
PowerLimit int `json:"power_limit"`
29-
PowerValidRange string `json:"power_vaild_range"`
30-
}
31-
3226
type Stats struct {
3327
Power string `json:"power"`
3428
Frequency string `json:"frequency"`

frontend/src/api/modules/host.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ export const operateFilterChain = (name: string, op: string) => {
6565
export const loadMonitor = (param: Host.MonitorSearch) => {
6666
return http.post<Array<Host.MonitorData>>(`/hosts/monitor/search`, param);
6767
};
68+
// Fetches the selectable GPU/XPU device options from the agent.
// NOTE(review): the agent handler behind /monitor/gpuoptions returns an object
// with `gpuType` and `options` fields (dto.MonitorGPUOptions), not a bare
// string array. Confirm and align the declared response type with callers.
export const getGPUOptions = () => {
69+
return http.get<Array<string>>(`/hosts/monitor/gpuoptions`);
70+
};
6871
export const loadGPUMonitor = (param: Host.MonitorGPUSearch) => {
6972
return http.post<Host.MonitorGPUData>(`/hosts/monitor/gpu/search`, param);
7073
};

frontend/src/views/ai/gpu/index.vue

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@
148148

149149
<script setup lang="ts">
150150
import { ref, reactive, onMounted, computed } from 'vue';
151-
import { loadGPUMonitor } from '@/api/modules/host';
151+
import { loadGPUMonitor, getGPUOptions } from '@/api/modules/host';
152152
import { dateFormatWithoutYear } from '@/utils/util';
153153
import { GlobalStore } from '@/store';
154154
import { shortcuts } from '@/utils/shortcuts';
@@ -183,6 +183,18 @@ const searchInfo = reactive<Host.MonitorGPUSearch>({
183183
endTime: new Date(),
184184
});
185185
186+
// Loads the device selector options, selects the first device as the default
// product name and then runs the initial search. On request failure the
// option list is cleared and no search is started.
const loadOptions = async () => {
    await getGPUOptions()
        .then((res) => {
            // The agent answers with an object { gpuType, options }
            // (dto.MonitorGPUOptions). A bare array, which is the type
            // getGPUOptions declares, is still accepted so either shape works.
            const payload = res.data as unknown;
            let names: string[] = [];
            if (Array.isArray(payload)) {
                names = payload as string[];
            } else if (payload && Array.isArray((payload as { options?: unknown }).options)) {
                names = (payload as { options: string[] }).options;
            }
            options.value = names;
            searchInfo.productName = names.length > 0 ? names[0] : '';
            search();
        })
        .catch(() => {
            options.value = [];
        });
};
197+
186198
const search = async () => {
187199
if (searchTime.value && searchTime.value.length === 2) {
188200
searchInfo.startTime = searchTime.value[0];
@@ -192,7 +204,6 @@ const search = async () => {
192204
await loadGPUMonitor(searchInfo)
193205
.then((res) => {
194206
loading.value = false;
195-
options.value = res.data.productNames || [];
196207
gpuType.value = res.data.gpuType || 'gpu';
197208
searchInfo.productName = searchInfo.productName || (options.value.length > 0 ? options.value[0] : '');
198209
let baseDate = res.data.date.length === 0 ? loadEmptyDate(timeRangeGlobal.value) : res.data.date;
@@ -425,7 +436,7 @@ const loadProcessType = (val: string) => {
425436
};
426437
427438
onMounted(() => {
428-
search();
439+
loadOptions();
429440
});
430441
</script>
431442

0 commit comments

Comments
 (0)