diff --git a/agent/app/api/v2/monitor.go b/agent/app/api/v2/monitor.go index bc57bd019d47..da122dd13558 100644 --- a/agent/app/api/v2/monitor.go +++ b/agent/app/api/v2/monitor.go @@ -126,3 +126,7 @@ func (b *BaseApi) GetIOOptions(c *gin.Context) { sort.Strings(options) helper.SuccessWithData(c, options) } + +func (b *BaseApi) GetCPUOptions(c *gin.Context) { + helper.SuccessWithData(c, monitorService.LoadGPUOptions()) +} diff --git a/agent/app/dto/monitor.go b/agent/app/dto/monitor.go index e8837feaf65e..e68bdacba39b 100644 --- a/agent/app/dto/monitor.go +++ b/agent/app/dto/monitor.go @@ -38,17 +38,19 @@ type MonitorSettingUpdate struct { Value string `json:"value"` } +type MonitorGPUOptions struct { + GPUType string `json:"gpuType"` + Options []string `json:"options"` +} type MonitorGPUSearch struct { ProductName string `json:"productName"` StartTime time.Time `json:"startTime"` EndTime time.Time `json:"endTime"` } type MonitorGPUData struct { - GPUType string `json:"gpuType"` - ProductNames []string `json:"productNames"` Date []time.Time `json:"date"` GPUValue []float64 `json:"gpuValue"` - TemperatureValue []int `json:"temperatureValue"` + TemperatureValue []float64 `json:"temperatureValue"` PowerValue []GPUPowerUsageHelper `json:"powerValue"` MemoryValue []GPUMemoryUsageHelper `json:"memoryValue"` SpeedValue []int `json:"speedValue"` diff --git a/agent/app/model/monitor.go b/agent/app/model/monitor.go index 628f5ab67812..6ec42ab9c0b2 100644 --- a/agent/app/model/monitor.go +++ b/agent/app/model/monitor.go @@ -36,7 +36,7 @@ type MonitorGPU struct { BaseModel ProductName string `json:"productName"` GPUUtil float64 `json:"gpuUtil"` - Temperature int `json:"temperature"` + Temperature float64 `json:"temperature"` PowerDraw float64 `json:"powerDraw"` MaxPowerLimit float64 `json:"maxPowerLimit"` MemUsed float64 `json:"memUsed"` diff --git a/agent/app/service/monitor.go b/agent/app/service/monitor.go index 2e88cd6be3c4..985fbb9f03c5 100644 --- a/agent/app/service/monitor.go +++ b/agent/app/service/monitor.go @@ -39,6 +39,7 @@ var monitorCancel context.CancelFunc type IMonitorService interface { Run() LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorData, error) + LoadGPUOptions() dto.MonitorGPUOptions LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) LoadSetting() (*dto.MonitorSetting, error) UpdateSetting(key, value string) error @@ -118,42 +119,43 @@ func (m *MonitorService) LoadMonitorData(req dto.MonitorSearch) ([]dto.MonitorDa return data, nil } +func (m *MonitorService) LoadGPUOptions() dto.MonitorGPUOptions { + var data dto.MonitorGPUOptions + gpuExist, gpuClient := gpu.New() + xpuExist, xpuClient := xpu.New() + if !gpuExist && !xpuExist { + return data + } + if gpuExist { + data.GPUType = "gpu" + gpuInfo, err := gpuClient.LoadGpuInfo() + if err != nil || len(gpuInfo.GPUs) == 0 { + global.LOG.Error("Load GPU info failed or no GPU found, err: ", err) + return data + } + sort.Slice(gpuInfo.GPUs, func(i, j int) bool { + return gpuInfo.GPUs[i].Index < gpuInfo.GPUs[j].Index + }) + for _, item := range gpuInfo.GPUs { + data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Index, item.ProductName)) + } + return data + } else { + data.GPUType = "xpu" + var err error + data.Options, err = xpuClient.LoadDeviceList() + if err != nil || len(data.Options) == 0 { + global.LOG.Error("Load XPU info failed or no XPU found, err: ", err) + } + return data + } +} + func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.MonitorGPUData, error) { loc, _ := time.LoadLocation(common.LoadTimeZoneByCmd()) req.StartTime = req.StartTime.In(loc) req.EndTime = req.EndTime.In(loc) - var data dto.MonitorGPUData - gpuExist, gpuclient := gpu.New() - xpuExist, xpuClient := xpu.New() - if !gpuExist && !xpuExist { - return data, nil - } - if len(req.ProductName) == 0 { - if gpuExist { - data.GPUType = "gpu" - gpuInfo, err := gpuclient.LoadGpuInfo() - if err != nil || len(gpuInfo.GPUs) == 0 { - global.LOG.Error("Load GPU info failed or no GPU found, err: ", err) - return data, buserr.New("ErrRecordNotFound") - } - req.ProductName = gpuInfo.GPUs[0].ProductName - for _, item := range gpuInfo.GPUs { - data.ProductNames = append(data.ProductNames, item.ProductName) - } - } else { - data.GPUType = "xpu" - xpuInfo, err := xpuClient.LoadGpuInfo() - if err != nil || len(xpuInfo.Xpu) == 0 { - global.LOG.Error("Load XPU info failed or no XPU found, err: ", err) - return data, buserr.New("ErrRecordNotFound") - } - req.ProductName = xpuInfo.Xpu[0].Basic.DeviceName - for _, item := range xpuInfo.Xpu { - data.ProductNames = append(data.ProductNames, item.Basic.DeviceName) - } - } - } gpuList, err := monitorRepo.GetGPU(repo.WithByCreatedAt(req.StartTime, req.EndTime), monitorRepo.WithByProductName(req.ProductName)) if err != nil { return data, err @@ -571,13 +573,14 @@ func saveGPUDataToDB() { var list []model.MonitorGPU for _, gpuItem := range gpuInfo.GPUs { item := model.MonitorGPU{ - ProductName: gpuItem.ProductName, - GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil), - Temperature: loadGPUInfoInt(gpuItem.Temperature), - PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw), - MemUsed: loadGPUInfoFloat(gpuItem.MemUsed), - MemTotal: loadGPUInfoFloat(gpuItem.MemTotal), - FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed), + ProductName: fmt.Sprintf("%d - %s", gpuItem.Index, gpuItem.ProductName), + GPUUtil: loadGPUInfoFloat(gpuItem.GPUUtil), + Temperature: loadGPUInfoFloat(gpuItem.Temperature), + PowerDraw: loadGPUInfoFloat(gpuItem.PowerDraw), + MaxPowerLimit: loadGPUInfoFloat(gpuItem.MaxPowerLimit), + MemUsed: loadGPUInfoFloat(gpuItem.MemUsed), + MemTotal: loadGPUInfoFloat(gpuItem.MemTotal), + FanSpeed: loadGPUInfoInt(gpuItem.FanSpeed), } process, _ := json.Marshal(gpuItem.Processes) if len(process) != 0 { @@ -602,13 +605,12 @@ func saveXPUDataToDB() { var list []model.MonitorGPU for _, xpuItem := range xpuInfo.Xpu { item := model.MonitorGPU{ - ProductName: xpuItem.Basic.DeviceName, - GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil), - Temperature: loadGPUInfoInt(xpuItem.Stats.Temperature), - PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power), - MaxPowerLimit: float64(xpuItem.Config.PowerLimit), - MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed), - MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory), + ProductName: fmt.Sprintf("%d - %s", xpuItem.Basic.DeviceID, xpuItem.Basic.DeviceName), + GPUUtil: loadGPUInfoFloat(xpuItem.Stats.MemoryUtil), + Temperature: loadGPUInfoFloat(xpuItem.Stats.Temperature), + PowerDraw: loadGPUInfoFloat(xpuItem.Stats.Power), + MemUsed: loadGPUInfoFloat(xpuItem.Stats.MemoryUsed), + MemTotal: loadGPUInfoFloat(xpuItem.Basic.Memory), } if len(xpuItem.Processes) != 0 { var processItem []dto.GPUProcess @@ -643,6 +645,9 @@ func loadGPUInfoInt(val string) int { func loadGPUInfoFloat(val string) float64 { valItem := strings.ReplaceAll(val, "W", "") valItem = strings.ReplaceAll(valItem, "MB", "") + valItem = strings.ReplaceAll(valItem, "MiB", "") + valItem = strings.ReplaceAll(valItem, "C", "") + valItem = strings.ReplaceAll(valItem, "°C", "") valItem = strings.ReplaceAll(valItem, "%", "") valItem = strings.TrimSpace(valItem) data, _ := strconv.ParseFloat(valItem, 64) diff --git a/agent/init/migration/migrations/init.go b/agent/init/migration/migrations/init.go index 4eec39de6d5e..84d908c97665 100644 --- a/agent/init/migration/migrations/init.go +++ b/agent/init/migration/migrations/init.go @@ -721,7 +721,7 @@ var UpdateDatabase = &gormigrate.Migration{ } var AddGPUMonitor = &gormigrate.Migration{ - ID: "20251127-add-gpu-monitor", + ID: "20251122-add-gpu-monitor", Migrate: func(tx *gorm.DB) error { return global.GPUMonitorDB.AutoMigrate(&model.MonitorGPU{}) }, diff --git a/agent/router/ro_host.go b/agent/router/ro_host.go index 5c00efc85565..2b70bf6c6cfb 100644 --- a/agent/router/ro_host.go +++ b/agent/router/ro_host.go @@ -31,6 +31,7 @@ func (s *HostRouter) InitRouter(Router *gin.RouterGroup) { hostRouter.POST("/monitor/search", baseApi.LoadMonitor) hostRouter.POST("/monitor/gpu/search", baseApi.LoadGPUMonitor) hostRouter.POST("/monitor/clean", baseApi.CleanMonitor) + hostRouter.GET("/monitor/gpuoptions", baseApi.GetCPUOptions) hostRouter.GET("/monitor/netoptions", baseApi.GetNetworkOptions) hostRouter.GET("/monitor/iooptions", baseApi.GetIOOptions) hostRouter.GET("/monitor/setting", baseApi.LoadMonitorSetting) diff --git a/agent/utils/ai_tools/xpu/xpu.go b/agent/utils/ai_tools/xpu/xpu.go index 66702ab4d6d8..5608bb6d17d9 100644 --- a/agent/utils/ai_tools/xpu/xpu.go +++ b/agent/utils/ai_tools/xpu/xpu.go @@ -120,6 +120,23 @@ func (x XpuSMI) LoadDashData() ([]XPUSimpleInfo, error) { return res, nil } +func (x XpuSMI) LoadDeviceList() ([]string, error) { + cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second)) + data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j") + if err != nil { + return nil, fmt.Errorf("calling xpu-smi failed, %v", err) + } + var deviceInfo DeviceInfo + if err := json.Unmarshal([]byte(data), &deviceInfo); err != nil { + return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err) + } + var deviceNames []string + for _, device := range deviceInfo.DeviceList { + deviceNames = append(deviceNames, fmt.Sprintf("%d - %s", device.DeviceID, device.DeviceName)) + } + return deviceNames, nil +} + func (x XpuSMI) LoadGpuInfo() (*XpuInfo, error) { cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second)) data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j") diff --git a/agent/utils/ai_tools/xpu/xpu_info.go b/agent/utils/ai_tools/xpu/xpu_info.go index e7a558e6c4c7..9c7d45656df0 100644 --- a/agent/utils/ai_tools/xpu/xpu_info.go +++ b/agent/utils/ai_tools/xpu/xpu_info.go @@ -10,7 +10,6 @@ type XpuInfo struct { type Xpu struct { Basic Basic `json:"basic"` Stats Stats `json:"stats"` - Config Config `json:"config"` Processes []Process `json:"processes"` } @@ -24,11 +23,6 @@ type Basic struct { PciBdfAddress string `json:"pciBdfAddress"` } -type Config struct { - PowerLimit int `json:"power_limit"` - PowerValidRange string `json:"power_vaild_range"` -} - type Stats struct { Power string `json:"power"` Frequency string `json:"frequency"` diff --git a/frontend/src/api/interface/host.ts b/frontend/src/api/interface/host.ts index f6b6a8fa39bb..5e38013e6e0b 100644 --- a/frontend/src/api/interface/host.ts +++ b/frontend/src/api/interface/host.ts @@ -166,9 +166,11 @@ export namespace Host { startTime: Date; endTime: Date; } - export interface MonitorGPUData { + export interface MonitorGPUOptions { gpuType: string; - productNames: Array; + options: Array; + } + export interface MonitorGPUData { date: Array; gpuValue: Array; temperatureValue: Array; diff --git a/frontend/src/api/modules/host.ts b/frontend/src/api/modules/host.ts index 2ae3b6f7355d..977a46c4f9ac 100644 --- a/frontend/src/api/modules/host.ts +++ b/frontend/src/api/modules/host.ts @@ -65,6 +65,9 @@ export const operateFilterChain = (name: string, op: string) => { export const loadMonitor = (param: Host.MonitorSearch) => { return http.post>(`/hosts/monitor/search`, param); }; +export const getGPUOptions = () => { + return http.get(`/hosts/monitor/gpuoptions`); +}; export const loadGPUMonitor = (param: Host.MonitorGPUSearch) => { return http.post(`/hosts/monitor/gpu/search`, param); }; diff --git a/frontend/src/views/ai/gpu/index.vue b/frontend/src/views/ai/gpu/index.vue index d7bf4ef54caa..6f1c803854ed 100644 --- a/frontend/src/views/ai/gpu/index.vue +++ b/frontend/src/views/ai/gpu/index.vue @@ -35,16 +35,16 @@
@@ -54,16 +54,16 @@
@@ -148,7 +148,7 @@ diff --git a/frontend/src/views/container/compose/index.vue b/frontend/src/views/container/compose/index.vue index 27295f6797d8..e92f89334009 100644 --- a/frontend/src/views/container/compose/index.vue +++ b/frontend/src/views/container/compose/index.vue @@ -20,7 +20,7 @@