class GPUMonitor:
    """Monitor NVIDIA GPUs using NVML"""

    def __init__(self):
        self.running = False
        self.gpu_data = {}
        self.collector = MetricsCollector()
        self.use_smi = {}  # Track which GPUs use nvidia-smi (decided at boot)

        try:
            pynvml.nvmlInit()
            self.initialized = True
            version = pynvml.nvmlSystemGetDriverVersion()
            if isinstance(version, bytes):
                version = version.decode('utf-8')
            logger.info(f"NVML initialized - Driver: {version}")

            # Detect which GPUs need nvidia-smi (once at boot)
            self._detect_smi_gpus()

        except Exception as e:
            logger.error(f"Failed to initialize NVML: {e}")
            self.initialized = False

    def _detect_smi_gpus(self):
        """Detect which GPUs need nvidia-smi fallback (called once at boot)"""
        try:
            device_count = pynvml.nvmlDeviceGetCount()
            logger.info(f"Detected {device_count} GPU(s)")

            if NVIDIA_SMI:
                logger.warning("NVIDIA_SMI=True - Forcing nvidia-smi for all GPUs")
                for i in range(device_count):
                    self.use_smi[str(i)] = True
                return

            # Auto-detect per GPU
            for i in range(device_count):
                gpu_id = str(i)
                try:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                    data = self.collector.collect_all(handle, gpu_id)
                    gpu_name = data.get('name', 'Unknown')

                    if 'utilization' not in data or data.get('utilization') is None:
                        self.use_smi[gpu_id] = True
                        logger.warning(f"GPU {i} ({gpu_name}): Utilization metric not available via NVML")
                        logger.warning(f"GPU {i} ({gpu_name}): Switching to nvidia-smi mode")
                    else:
                        self.use_smi[gpu_id] = False
                        logger.info(f"GPU {i} ({gpu_name}): Using NVML (utilization: {data.get('utilization')}%)")

                except Exception as e:
                    self.use_smi[gpu_id] = True
                    logger.error(f"GPU {i}: NVML detection failed - {e}")
                    logger.warning(f"GPU {i}: Falling back to nvidia-smi")

            # Summary
            nvml_count = sum(1 for use_smi in self.use_smi.values() if not use_smi)
            smi_count = sum(1 for use_smi in self.use_smi.values() if use_smi)
            if smi_count > 0:
                logger.info(f"Boot detection complete: {nvml_count} GPU(s) using NVML, {smi_count} GPU(s) using nvidia-smi")
            else:
                logger.info(f"Boot detection complete: All {nvml_count} GPU(s) using NVML")

        except Exception as e:
            logger.error(f"Failed to detect GPUs: {e}")
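
    # Note on the nvidia-smi fallback: when a GPU is flagged in self.use_smi, its
    # metrics are read by shelling out to nvidia-smi instead of NVML. As an
    # illustrative sketch (the exact query this project runs may differ), a CSV
    # query such as the following provides the equivalent fields:
    #
    #   nvidia-smi --query-gpu=index,name,utilization.gpu,memory.used,memory.total \
    #              --format=csv,noheader,nounits
    #
    # whose output is parsed line by line and keyed by GPU index.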
    async def get_gpu_data(self):
        """Async collect metrics from all detected GPUs"""
        if not self.initialized:
            logger.error("Cannot get GPU data - NVML not initialized")
            return {}

        try:
            device_count = pynvml.nvmlDeviceGetCount()
            gpu_data = {}

            # Get nvidia-smi data once if any GPU needs it
            smi_data = None
            if any(self.use_smi.values()):
@@ -99,7 +99,7 @@ async def get_gpu_data(self):
                    )
                except Exception as e:
                    logger.error(f"nvidia-smi failed: {e}")

            # Collect GPU data concurrently
            tasks = []
            for i in range(device_count):
@@ -116,7 +116,7 @@ async def get_gpu_data(self):
                        None, self._collect_single_gpu, i
                    )
                    tasks.append((gpu_id, task))

            # Wait for all NVML tasks to complete
            if tasks:
                results = await asyncio.gather(*[task for _, task in tasks], return_exceptions=True)
@@ -125,17 +125,17 @@ async def get_gpu_data(self):
                        logger.error(f"GPU {gpu_id}: Error - {result}")
                    else:
                        gpu_data[gpu_id] = result

            if not gpu_data:
                logger.error("No GPU data collected from any source")

            self.gpu_data = gpu_data
            return gpu_data

        except Exception as e:
            logger.error(f"Failed to get GPU data: {e}")
            return {}

    def _collect_single_gpu(self, gpu_index):
        """Collect data for a single GPU (runs in thread pool)"""
        try:
@@ -144,12 +144,12 @@ def _collect_single_gpu(self, gpu_index):
        except Exception as e:
            logger.error(f"GPU {gpu_index}: Error - {e}")
            return {}

    async def get_processes(self):
        """Async get GPU process information"""
        if not self.initialized:
            return []

        try:
            # Run process collection in thread pool
            return await asyncio.get_event_loop().run_in_executor(
@@ -158,75 +158,106 @@ async def get_processes(self):
        except Exception as e:
            logger.error(f"Error getting processes: {e}")
            return []

    def _get_processes_sync(self):
        """Synchronous process collection (runs in thread pool)"""
        try:
            device_count = pynvml.nvmlDeviceGetCount()
            all_processes = []
            gpu_process_counts = {}

            for i in range(device_count):
                try:
                    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                    uuid = pynvml.nvmlDeviceGetUUID(handle)
                    if isinstance(uuid, bytes):
                        uuid = uuid.decode('utf-8')

                    gpu_id = str(i)
                    gpu_process_counts[gpu_id] = {'compute': 0, 'graphics': 0}

                    try:
                        procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                        gpu_process_counts[gpu_id]['compute'] = len(procs)

                        for proc in procs:
                            all_processes.append({
                                'pid': str(proc.pid),
                                'name': self._get_process_name(proc.pid),
                                'gpu_uuid': uuid,
                                'gpu_id': gpu_id,
                                'memory': float(proc.usedGpuMemory / (1024 ** 2))
                            })
                    except pynvml.NVMLError:
                        pass

                except pynvml.NVMLError:
                    continue

            for gpu_id, counts in gpu_process_counts.items():
                if gpu_id in self.gpu_data:
                    self.gpu_data[gpu_id]['compute_processes_count'] = counts['compute']
                    self.gpu_data[gpu_id]['graphics_processes_count'] = counts['graphics']

            return all_processes

        except Exception as e:
            logger.error(f"Error getting processes: {e}")
            return []

    def _get_process_name(self, pid):
        """Extract readable process name from PID with improved logic"""
        try:
            p = psutil.Process(pid)

            # First try to get the process name
            try:
                process_name = p.name()
                if process_name and process_name not in ['python', 'python3', 'sh', 'bash']:
                    return process_name
            except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess):
                pass

            # Try to get command line for better name extraction
            try:
                cmdline = p.cmdline()
                if cmdline:
                    # Look for the actual executable or script name
                    for arg in cmdline:
                        if not arg or arg.startswith('-'):
                            continue

                        # Skip common interpreters and shells
                        if arg in ['python', 'python3', 'node', 'java', 'sh', 'bash', 'zsh']:
                            continue

                        # Extract filename from path
                        filename = arg.split('/')[-1].split('\\')[-1]

                        # Skip if it's still a generic name
                        if filename in ['python', 'python3', 'node', 'java', 'sh', 'bash']:
                            continue

                        # Found a meaningful name
                        if filename:
                            return filename

                    # Fallback to first argument if nothing else worked
                    if cmdline[0]:
                        return cmdline[0].split('/')[-1].split('\\')[-1]

            except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess):
                pass

            # Final fallback
            return f'PID:{pid}'

        except (psutil.NoSuchProcess, psutil.ZombieProcess):
            return f'PID:{pid}'
        except Exception as e:
            logger.debug(f"Error getting process name for PID {pid}: {e}")
            return f'PID:{pid}'
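
    # For example (illustrative values): a worker launched as
    #   python3 /opt/app/train.py --epochs 50
    # resolves to 'train.py': 'python3' is skipped as a generic interpreter,
    # the first non-flag argument is taken, and only its basename is kept.
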
    async def shutdown(self):
        """Async shutdown"""
        if self.initialized:
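
# Minimal usage sketch (illustrative, not part of the original class): poll the
# monitor once from an asyncio entry point. Assumes the same module-level
# imports the class already relies on (asyncio, pynvml, psutil).
if __name__ == "__main__":
    async def _demo():
        monitor = GPUMonitor()
        gpus = await monitor.get_gpu_data()
        procs = await monitor.get_processes()
        print(f"Collected metrics for {len(gpus)} GPU(s); {len(procs)} GPU process(es) found")
        await monitor.shutdown()

    asyncio.run(_demo())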