Skip to content

Commit 5d9ae7b

Browse files
committed
perf: Optimize GPU monitoring style
1 parent 3d3d22d commit 5d9ae7b

File tree

19 files changed

+1292
-491
lines changed

19 files changed

+1292
-491
lines changed

agent/app/dto/monitor.go

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,18 @@ type MonitorSettingUpdate struct {
3939
}
4040

4141
type MonitorGPUOptions struct {
42-
GPUType string `json:"gpuType"`
43-
Options []string `json:"options"`
42+
GPUType string `json:"gpuType"`
43+
ChartHide []GPUChartHide `json:"chartHide"`
44+
Options []string `json:"options"`
45+
}
46+
type GPUChartHide struct {
47+
ProductName string `json:"productName"`
48+
Process bool `json:"process"`
49+
GPU bool `json:"gpu"`
50+
Memory bool `json:"memory"`
51+
Power bool `json:"power"`
52+
Temperature bool `json:"temperature"`
53+
Speed bool `json:"speed"`
4454
}
4555
type MonitorGPUSearch struct {
4656
ProductName string `json:"productName"`
@@ -59,6 +69,7 @@ type MonitorGPUData struct {
5969
MemoryPercent []float64 `json:"memoryPercent"`
6070
SpeedValue []int `json:"speedValue"`
6171

72+
ProcessCount []int `json:"processCount"`
6273
GPUProcesses [][]GPUProcess `json:"gpuProcesses"`
6374
}
6475

agent/app/service/monitor.go

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,16 +137,45 @@ func (m *MonitorService) LoadGPUOptions() dto.MonitorGPUOptions {
137137
return gpuInfo.GPUs[i].Index < gpuInfo.GPUs[j].Index
138138
})
139139
for _, item := range gpuInfo.GPUs {
140+
var chartHide dto.GPUChartHide
141+
chartHide.ProductName = fmt.Sprintf("%d - %s", item.Index, item.ProductName)
142+
chartHide.GPU = item.GPUUtil == "" || item.GPUUtil == "N/A"
143+
if (item.MemTotal == "" || item.MemTotal == "N/A") && (item.MemUsed == "" || item.MemUsed == "N/A") {
144+
chartHide.Memory = true
145+
}
146+
if (item.MaxPowerLimit == "" || item.MaxPowerLimit == "N/A") && (item.PowerDraw == "" || item.PowerDraw == "N/A") {
147+
chartHide.Power = true
148+
}
149+
chartHide.Temperature = item.Temperature == "" || item.Temperature == "N/A"
150+
chartHide.Speed = item.FanSpeed == "" || item.FanSpeed == "N/A"
151+
data.ChartHide = append(data.ChartHide, chartHide)
140152
data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Index, item.ProductName))
141153
}
142154
return data
143155
} else {
144156
data.GPUType = "xpu"
145-
var err error
146-
data.Options, err = xpuClient.LoadDeviceList()
147-
if err != nil || len(data.Options) == 0 {
157+
xpu, err := xpuClient.LoadGpuInfo()
158+
if err != nil || len(xpu.Xpu) == 0 {
148159
global.LOG.Error("Load XPU info failed or no XPU found, err: ", err)
return data
149160
}
161+
sort.Slice(xpu.Xpu, func(i, j int) bool {
162+
return xpu.Xpu[i].Basic.DeviceID < xpu.Xpu[j].Basic.DeviceID
163+
})
164+
for _, item := range xpu.Xpu {
165+
var chartHide dto.GPUChartHide
166+
chartHide.GPU = true
167+
chartHide.Speed = true
168+
chartHide.ProductName = fmt.Sprintf("%d - %s", item.Basic.DeviceID, item.Basic.DeviceName)
169+
if (item.Stats.MemoryUsed == "" || item.Stats.MemoryUsed == "N/A") && (item.Basic.Memory == "" || item.Basic.Memory == "N/A") {
170+
chartHide.Memory = true
171+
}
172+
if item.Stats.Power == "" || item.Stats.Power == "N/A" {
173+
chartHide.Power = true
174+
}
175+
chartHide.Temperature = item.Stats.Temperature == "" || item.Stats.Temperature == "N/A"
176+
data.ChartHide = append(data.ChartHide, chartHide)
177+
data.Options = append(data.Options, fmt.Sprintf("%d - %s", item.Basic.DeviceID, item.Basic.DeviceName))
178+
}
150179
return data
151180
}
152181
}
@@ -182,8 +211,10 @@ func (m *MonitorService) LoadGPUMonitorData(req dto.MonitorGPUSearch) (dto.Monit
182211
}
183212
var process []dto.GPUProcess
184213
if err := json.Unmarshal([]byte(gpu.Processes), &process); err == nil {
214+
data.ProcessCount = append(data.ProcessCount, len(process))
185215
data.GPUProcesses = append(data.GPUProcesses, process)
186216
} else {
217+
data.ProcessCount = append(data.ProcessCount, 0)
187218
data.GPUProcesses = append(data.GPUProcesses, []dto.GPUProcess{})
188219
}
189220
data.SpeedValue = append(data.SpeedValue, gpu.FanSpeed)

agent/utils/ai_tools/xpu/xpu.go

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -120,23 +120,6 @@ func (x XpuSMI) LoadDashData() ([]XPUSimpleInfo, error) {
120120
return res, nil
121121
}
122122

123-
func (x XpuSMI) LoadDeviceList() ([]string, error) {
124-
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
125-
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")
126-
if err != nil {
127-
return nil, fmt.Errorf("calling xpu-smi failed, %v", err)
128-
}
129-
var deviceInfo DeviceInfo
130-
if err := json.Unmarshal([]byte(data), &deviceInfo); err != nil {
131-
return nil, fmt.Errorf("deviceInfo json unmarshal failed, err: %w", err)
132-
}
133-
var deviceNames []string
134-
for _, device := range deviceInfo.DeviceList {
135-
deviceNames = append(deviceNames, fmt.Sprintf("%d - %s", device.DeviceID, device.DeviceName))
136-
}
137-
return deviceNames, nil
138-
}
139-
140123
func (x XpuSMI) LoadGpuInfo() (*XpuInfo, error) {
141124
cmdMgr := cmd.NewCommandMgr(cmd.WithTimeout(5 * time.Second))
142125
data, err := cmdMgr.RunWithStdoutBashC("xpu-smi discovery -j")

frontend/src/api/interface/host.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,16 @@ export namespace Host {
169169
export interface MonitorGPUOptions {
170170
gpuType: string;
171171
options: Array<string>;
172+
chartHide: Array<ChartHide>;
173+
}
174+
export interface ChartHide {
175+
productName: string;
176+
process: boolean;
177+
gpu: boolean;
178+
memory: boolean;
179+
power: boolean;
180+
temperature: boolean;
181+
speed: boolean;
172182
}
173183
export interface MonitorGPUData {
174184
date: Array<Date>;

frontend/src/lang/modules/en.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,37 @@ const message = {
711711
memoryUsed: 'Memory Used',
712712
memoryTotal: 'Total Memory',
713713
percent: 'Utilization',
714+
715+
base: 'Basic Information',
716+
driverVersion: 'Driver Version',
717+
cudaVersion: 'CUDA Version',
718+
processMemoryUsage: 'Memory Usage',
719+
performanceStateHelper: 'From P0 (maximum performance) to P12 (minimum performance)',
720+
busID: 'Bus Address',
721+
persistenceMode: 'Persistence Mode',
722+
enabled: 'Enabled',
723+
disabled: 'Disabled',
724+
persistenceModeHelper:
725+
'Persistence mode responds to tasks more quickly, but standby power consumption will increase accordingly',
726+
displayActive: 'GPU Initialization',
727+
displayActiveT: 'Yes',
728+
displayActiveF: 'No',
729+
ecc: 'Error Checking and Correcting Technology',
730+
computeMode: 'Compute Mode',
731+
default: 'Default',
732+
exclusiveProcess: 'Exclusive Process',
733+
exclusiveThread: 'Exclusive Thread',
734+
prohibited: 'Prohibited',
735+
defaultHelper: 'Default: Processes can execute concurrently',
736+
exclusiveProcessHelper:
737+
'Exclusive Process: Only one CUDA context can use the GPU, but it can be shared by multiple threads',
738+
exclusiveThreadHelper: 'Exclusive Thread: Only one thread in a CUDA context can use the GPU',
739+
prohibitedHelper: 'Prohibited: Concurrent process execution is not allowed',
740+
migModeHelper: 'Used to create MIG instances, implementing physical GPU isolation at the user layer.',
741+
migModeNA: 'Not Supported',
742+
current: 'Real-time Monitoring',
743+
history: 'Historical Records',
744+
notSupport: 'The current version or driver does not support displaying this parameter.',
714745
},
715746
mcp: {
716747
server: 'MCP Server',

frontend/src/lang/modules/es-es.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -710,6 +710,38 @@ const message = {
710710
memoryUsed: 'Memoria Utilizada',
711711
memoryTotal: 'Memoria Total',
712712
percent: 'Utilización',
713+
714+
base: 'Información Básica',
715+
driverVersion: 'Versión del Controlador',
716+
cudaVersion: 'Versión de CUDA',
717+
processMemoryUsage: 'Uso de Memoria',
718+
performanceStateHelper: 'Desde P0 (rendimiento máximo) hasta P12 (rendimiento mínimo)',
719+
busID: 'Dirección del Bus',
720+
persistenceMode: 'Modo de Persistencia',
721+
enabled: 'Habilitado',
722+
disabled: 'Deshabilitado',
723+
persistenceModeHelper:
724+
'El modo de persistencia responde a las tareas más rápidamente, pero el consumo de energía en espera aumentará en consecuencia',
725+
displayActive: 'Inicialización de GPU',
726+
displayActiveT: 'Sí',
727+
displayActiveF: 'No',
728+
ecc: 'Tecnología de Corrección de Errores',
729+
computeMode: 'Modo de Computación',
730+
default: 'Predeterminado',
731+
exclusiveProcess: 'Proceso Exclusivo',
732+
exclusiveThread: 'Hilo Exclusivo',
733+
prohibited: 'Prohibido',
734+
defaultHelper: 'Predeterminado: Los procesos pueden ejecutarse concurrentemente',
735+
exclusiveProcessHelper:
736+
'Proceso Exclusivo: Solo un contexto CUDA puede usar la GPU, pero puede ser compartido por múltiples hilos',
737+
exclusiveThreadHelper: 'Hilo Exclusivo: Solo un hilo en un contexto CUDA puede usar la GPU',
738+
prohibitedHelper: 'Prohibido: No se permite la ejecución concurrente de procesos',
739+
migModeHelper:
740+
'Se utiliza para crear instancias MIG, implementando aislamiento físico de GPU en la capa de usuario.',
741+
migModeNA: 'No Compatible',
742+
current: 'Monitoreo en Tiempo Real',
743+
history: 'Registros Históricos',
744+
notSupport: 'La versión actual o el controlador no admiten mostrar este parámetro.',
713745
},
714746
mcp: {
715747
server: 'Servidor MCP',

frontend/src/lang/modules/ja.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -699,6 +699,36 @@ const message = {
699699
memoryUsed: '使用メモリ',
700700
memoryTotal: '総メモリ',
701701
percent: '使用率',
702+
703+
base: '基本情報',
704+
driverVersion: 'ドライバーバージョン',
705+
cudaVersion: 'CUDA バージョン',
706+
processMemoryUsage: 'メモリ使用量',
707+
performanceStateHelper: 'P0(最大パフォーマンス)から P12(最小パフォーマンス)まで',
708+
busID: 'バスアドレス',
709+
persistenceMode: '永続モード',
710+
enabled: '有効',
711+
disabled: '無効',
712+
persistenceModeHelper: '永続モードはタスクへの応答がより迅速ですが、それに応じて待機電力消費も増加します',
713+
displayActive: 'GPU 初期化',
714+
displayActiveT: 'はい',
715+
displayActiveF: 'いいえ',
716+
ecc: 'エラー修正技術',
717+
computeMode: '計算モード',
718+
default: 'デフォルト',
719+
exclusiveProcess: '排他プロセス',
720+
exclusiveThread: '排他スレッド',
721+
prohibited: '禁止',
722+
defaultHelper: 'デフォルト: プロセスは同時実行可能',
723+
exclusiveProcessHelper:
724+
'排他プロセス: 1つのCUDAコンテキストのみがGPUを使用可能、ただし複数スレッドで共有可能',
725+
exclusiveThreadHelper: '排他スレッド: CUDAコンテキスト内の1つのスレッドのみがGPUを使用可能',
726+
prohibitedHelper: '禁止: プロセスの同時実行は許可されません',
727+
migModeHelper: 'MIGインスタンスを作成するために使用され、ユーザーレイヤーでGPUの物理的隔離を実装します。',
728+
migModeNA: 'サポートされていません',
729+
current: 'リアルタイム監視',
730+
history: '履歴記録',
731+
notSupport: '現在のバージョンまたはドライバーはこのパラメータの表示をサポートしていません。',
702732
},
703733
mcp: {
704734
server: 'MCP サーバー',

frontend/src/lang/modules/ko.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,36 @@ const message = {
695695
memoryUsed: '사용된 메모리',
696696
memoryTotal: '전체 메모리',
697697
percent: '사용률',
698+
699+
base: '기본 정보',
700+
driverVersion: '드라이버 버전',
701+
cudaVersion: 'CUDA 버전',
702+
processMemoryUsage: '메모리 사용량',
703+
performanceStateHelper: 'P0(최대 성능)부터 P12(최소 성능)까지',
704+
busID: '버스 주소',
705+
persistenceMode: '지속성 모드',
706+
enabled: '활성화',
707+
disabled: '비활성화',
708+
persistenceModeHelper: '지속성 모드는 작업에 더 빠르게 응답하지만 대기 전력 소비도 그에 따라 증가합니다',
709+
displayActive: 'GPU 초기화',
710+
displayActiveT: '예',
711+
displayActiveF: '아니오',
712+
ecc: '오류 검사 및 수정 기술',
713+
computeMode: '계산 모드',
714+
default: '기본값',
715+
exclusiveProcess: '배타적 프로세스',
716+
exclusiveThread: '배타적 스레드',
717+
prohibited: '금지됨',
718+
defaultHelper: '기본값: 프로세스가 동시에 실행될 수 있음',
719+
exclusiveProcessHelper:
720+
'배타적 프로세스: 하나의 CUDA 컨텍스트만 GPU를 사용할 수 있지만 여러 스레드에서 공유 가능',
721+
exclusiveThreadHelper: '배타적 스레드: CUDA 컨텍스트의 하나의 스레드만 GPU를 사용할 수 있음',
722+
prohibitedHelper: '금지됨: 프로세스 동시 실행이 허용되지 않음',
723+
migModeHelper: 'MIG 인스턴스를 생성하는 데 사용되며 사용자 레이어에서 GPU의 물리적 격리를 구현합니다.',
724+
migModeNA: '지원되지 않음',
725+
current: '실시간 모니터링',
726+
history: '기록',
727+
notSupport: '현재 버전 또는 드라이버는 이 매개변수 표시를 지원하지 않습니다.',
698728
},
699729
mcp: {
700730
server: 'MCP 서버',

frontend/src/lang/modules/ms.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -711,6 +711,38 @@ const message = {
711711
memoryUsed: 'Memori Digunakan',
712712
memoryTotal: 'Jumlah Memori',
713713
percent: 'Penggunaan',
714+
base: 'Maklumat Asas',
715+
716+
driverVersion: 'Versi Pemacu',
717+
cudaVersion: 'Versi CUDA',
718+
processMemoryUsage: 'Penggunaan Memori',
719+
performanceStateHelper: 'Dari P0 (prestasi maksimum) hingga P12 (prestasi minimum)',
720+
busID: 'Alamat Bas',
721+
persistenceMode: 'Mod Kegigihan',
722+
enabled: 'Diaktifkan',
723+
disabled: 'Dilumpuhkan',
724+
persistenceModeHelper:
725+
'Mod kegigihan bertindak balas kepada tugas dengan lebih cepat, tetapi penggunaan kuasa siap sedia akan meningkat dengan sewajarnya',
726+
displayActive: 'Permulaan GPU',
727+
displayActiveT: 'Ya',
728+
displayActiveF: 'Tidak',
729+
ecc: 'Teknologi Pemeriksaan dan Pembetulan Ralat',
730+
computeMode: 'Mod Pengiraan',
731+
default: 'Lalai',
732+
exclusiveProcess: 'Proses Eksklusif',
733+
exclusiveThread: 'Benang Eksklusif',
734+
prohibited: 'Dilarang',
735+
defaultHelper: 'Lalai: Proses boleh dilaksanakan serentak',
736+
exclusiveProcessHelper:
737+
'Proses Eksklusif: Hanya satu konteks CUDA boleh menggunakan GPU, tetapi boleh dikongsi oleh berbilang benang',
738+
exclusiveThreadHelper: 'Benang Eksklusif: Hanya satu benang dalam konteks CUDA boleh menggunakan GPU',
739+
prohibitedHelper: 'Dilarang: Pelaksanaan proses serentak tidak dibenarkan',
740+
migModeHelper:
741+
'Digunakan untuk mencipta contoh MIG, melaksanakan pengasingan fizikal GPU pada lapisan pengguna.',
742+
migModeNA: 'Tidak Disokong',
743+
current: 'Pemantauan Masa Nyata',
744+
history: 'Rekod Sejarah',
745+
notSupport: 'Versi atau pemacu semasa tidak menyokong paparan parameter ini.',
714746
},
715747
mcp: {
716748
server: 'Pelayan MCP',

frontend/src/lang/modules/pt-br.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -706,6 +706,38 @@ const message = {
706706
memoryUsed: 'Memória Usada',
707707
memoryTotal: 'Memória Total',
708708
percent: 'Utilização',
709+
710+
base: 'Informações Básicas',
711+
driverVersion: 'Versão do Driver',
712+
cudaVersion: 'Versão do CUDA',
713+
processMemoryUsage: 'Uso de Memória',
714+
performanceStateHelper: 'De P0 (desempenho máximo) a P12 (desempenho mínimo)',
715+
busID: 'Endereço do Barramento',
716+
persistenceMode: 'Modo de Persistência',
717+
enabled: 'Habilitado',
718+
disabled: 'Desabilitado',
719+
persistenceModeHelper:
720+
'O modo de persistência responde às tarefas mais rapidamente, mas o consumo de energia em espera aumentará correspondentemente',
721+
displayActive: 'Inicialização da GPU',
722+
displayActiveT: 'Sim',
723+
displayActiveF: 'Não',
724+
ecc: 'Tecnologia de Verificação e Correção de Erros',
725+
computeMode: 'Modo de Computação',
726+
default: 'Padrão',
727+
exclusiveProcess: 'Processo Exclusivo',
728+
exclusiveThread: 'Thread Exclusiva',
729+
prohibited: 'Proibido',
730+
defaultHelper: 'Padrão: Os processos podem executar simultaneamente',
731+
exclusiveProcessHelper:
732+
'Processo Exclusivo: Apenas um contexto CUDA pode usar a GPU, mas pode ser compartilhado por múltiplas threads',
733+
exclusiveThreadHelper: 'Thread Exclusiva: Apenas uma thread em um contexto CUDA pode usar a GPU',
734+
prohibitedHelper: 'Proibido: A execução simultânea de processos não é permitida',
735+
migModeHelper:
736+
'Usado para criar instâncias MIG, implementando isolamento físico da GPU na camada do usuário.',
737+
migModeNA: 'Não Suportado',
738+
current: 'Monitoramento em Tempo Real',
739+
history: 'Registros Históricos',
740+
notSupport: 'A versão atual ou o driver não suportam exibir este parâmetro.',
709741
},
710742
mcp: {
711743
server: 'Servidor MCP',

0 commit comments

Comments
 (0)