
Commit be93abc

pid names fix (#42)
Co-authored-by: Panos <>
1 parent 36faa9e commit be93abc

3 files changed: 82 additions, 47 deletions

README.md

Lines changed: 2 additions & 0 deletions
@@ -37,6 +37,8 @@ Open `http://localhost:1312`
 
 **Older GPUs:** Add `-e NVIDIA_SMI=true` if metrics don't appear.
 
+**Process monitoring:** Add `--init --pid=host` to see process names. Note: This allows the container to access host process information.
+
 **From source:**
 ```bash
 git clone https://github.com/psalias2006/gpu-hot
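
As a usage sketch, the flags from this README addition would be passed to `docker run` roughly as follows; the image tag, `--gpus all`, and the port mapping are assumptions for illustration, and only `--init --pid=host` comes from this commit:

```bash
# Hypothetical invocation: the image tag, --gpus all, and the port mapping are assumed;
# only --init and --pid=host are introduced by this commit's README note.
docker run -d \
  --gpus all \
  --init \
  --pid=host \
  -p 1312:1312 \
  gpu-hot
```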

core/monitor.py

Lines changed: 78 additions & 47 deletions
@@ -13,82 +13,82 @@
 
 class GPUMonitor:
     """Monitor NVIDIA GPUs using NVML"""
-
+
     def __init__(self):
         self.running = False
         self.gpu_data = {}
         self.collector = MetricsCollector()
         self.use_smi = {} # Track which GPUs use nvidia-smi (decided at boot)
-
+
         try:
             pynvml.nvmlInit()
             self.initialized = True
             version = pynvml.nvmlSystemGetDriverVersion()
             if isinstance(version, bytes):
                 version = version.decode('utf-8')
             logger.info(f"NVML initialized - Driver: {version}")
-
+
             # Detect which GPUs need nvidia-smi (once at boot)
             self._detect_smi_gpus()
-
+
         except Exception as e:
             logger.error(f"Failed to initialize NVML: {e}")
             self.initialized = False
-
+
     def _detect_smi_gpus(self):
         """Detect which GPUs need nvidia-smi fallback (called once at boot)"""
         try:
             device_count = pynvml.nvmlDeviceGetCount()
             logger.info(f"Detected {device_count} GPU(s)")
-
+
             if NVIDIA_SMI:
                 logger.warning("NVIDIA_SMI=True - Forcing nvidia-smi for all GPUs")
                 for i in range(device_count):
                     self.use_smi[str(i)] = True
                 return
-
+
             # Auto-detect per GPU
             for i in range(device_count):
                 gpu_id = str(i)
                 try:
                     handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                     data = self.collector.collect_all(handle, gpu_id)
                     gpu_name = data.get('name', 'Unknown')
-
+
                     if 'utilization' not in data or data.get('utilization') is None:
                         self.use_smi[gpu_id] = True
                         logger.warning(f"GPU {i} ({gpu_name}): Utilization metric not available via NVML")
                         logger.warning(f"GPU {i} ({gpu_name}): Switching to nvidia-smi mode")
                     else:
                         self.use_smi[gpu_id] = False
                         logger.info(f"GPU {i} ({gpu_name}): Using NVML (utilization: {data.get('utilization')}%)")
-
+
                 except Exception as e:
                     self.use_smi[gpu_id] = True
                     logger.error(f"GPU {i}: NVML detection failed - {e}")
                     logger.warning(f"GPU {i}: Falling back to nvidia-smi")
-
+
             # Summary
             nvml_count = sum(1 for use_smi in self.use_smi.values() if not use_smi)
             smi_count = sum(1 for use_smi in self.use_smi.values() if use_smi)
             if smi_count > 0:
                 logger.info(f"Boot detection complete: {nvml_count} GPU(s) using NVML, {smi_count} GPU(s) using nvidia-smi")
             else:
                 logger.info(f"Boot detection complete: All {nvml_count} GPU(s) using NVML")
-
+
         except Exception as e:
             logger.error(f"Failed to detect GPUs: {e}")
-
+
     async def get_gpu_data(self):
         """Async collect metrics from all detected GPUs"""
         if not self.initialized:
             logger.error("Cannot get GPU data - NVML not initialized")
             return {}
-
+
         try:
             device_count = pynvml.nvmlDeviceGetCount()
             gpu_data = {}
-
+
             # Get nvidia-smi data once if any GPU needs it
             smi_data = None
             if any(self.use_smi.values()):
@@ -99,7 +99,7 @@ async def get_gpu_data(self):
                     )
                 except Exception as e:
                     logger.error(f"nvidia-smi failed: {e}")
-
+
             # Collect GPU data concurrently
             tasks = []
             for i in range(device_count):
@@ -116,7 +116,7 @@ async def get_gpu_data(self):
                         None, self._collect_single_gpu, i
                     )
                     tasks.append((gpu_id, task))
-
+
             # Wait for all NVML tasks to complete
             if tasks:
                 results = await asyncio.gather(*[task for _, task in tasks], return_exceptions=True)
@@ -125,17 +125,17 @@ async def get_gpu_data(self):
                         logger.error(f"GPU {gpu_id}: Error - {result}")
                     else:
                         gpu_data[gpu_id] = result
-
+
             if not gpu_data:
                 logger.error("No GPU data collected from any source")
-
+
             self.gpu_data = gpu_data
             return gpu_data
-
+
         except Exception as e:
             logger.error(f"Failed to get GPU data: {e}")
             return {}
-
+
     def _collect_single_gpu(self, gpu_index):
         """Collect data for a single GPU (runs in thread pool)"""
         try:
@@ -144,12 +144,12 @@ def _collect_single_gpu(self, gpu_index):
         except Exception as e:
             logger.error(f"GPU {gpu_index}: Error - {e}")
             return {}
-
+
     async def get_processes(self):
         """Async get GPU process information"""
         if not self.initialized:
             return []
-
+
         try:
             # Run process collection in thread pool
             return await asyncio.get_event_loop().run_in_executor(
@@ -158,75 +158,106 @@ async def get_processes(self):
         except Exception as e:
             logger.error(f"Error getting processes: {e}")
             return []
-
+
     def _get_processes_sync(self):
         """Synchronous process collection (runs in thread pool)"""
         try:
             device_count = pynvml.nvmlDeviceGetCount()
             all_processes = []
             gpu_process_counts = {}
-
+
             for i in range(device_count):
                 try:
                     handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                     uuid = pynvml.nvmlDeviceGetUUID(handle)
                     if isinstance(uuid, bytes):
                         uuid = uuid.decode('utf-8')
-
+
                     gpu_id = str(i)
                     gpu_process_counts[gpu_id] = {'compute': 0, 'graphics': 0}
-
+
                     try:
                         procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
                         gpu_process_counts[gpu_id]['compute'] = len(procs)
-
+
                         for proc in procs:
                             all_processes.append({
                                 'pid': str(proc.pid),
                                 'name': self._get_process_name(proc.pid),
                                 'gpu_uuid': uuid,
+                                'gpu_id': gpu_id,
                                 'memory': float(proc.usedGpuMemory / (1024 ** 2))
                             })
                     except pynvml.NVMLError:
                         pass
-
+
                 except pynvml.NVMLError:
                     continue
-
+
             for gpu_id, counts in gpu_process_counts.items():
                 if gpu_id in self.gpu_data:
                     self.gpu_data[gpu_id]['compute_processes_count'] = counts['compute']
                     self.gpu_data[gpu_id]['graphics_processes_count'] = counts['graphics']
-
+
             return all_processes
-
+
         except Exception as e:
             logger.error(f"Error getting processes: {e}")
             return []
-
+
     def _get_process_name(self, pid):
-        """Extract readable process name from PID"""
+        """Extract readable process name from PID with improved logic"""
         try:
             p = psutil.Process(pid)
+
+            # First try to get the process name
+            try:
+                process_name = p.name()
+                if process_name and process_name not in ['python', 'python3', 'sh', 'bash']:
+                    return process_name
+            except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess):
+                pass
+
+            # Try to get command line for better name extraction
             try:
                 cmdline = p.cmdline()
-            except (psutil.AccessDenied, psutil.NoSuchProcess):
-                return p.name() if hasattr(p, 'name') else f'PID:{pid}'
-
-            if not cmdline:
-                return p.name() if hasattr(p, 'name') else f'PID:{pid}'
-
-            if len(cmdline) > 1:
-                for arg in cmdline[1:]:
-                    if arg and not arg.startswith('-'):
+                if cmdline:
+                    # Look for the actual executable or script name
+                    for i, arg in enumerate(cmdline):
+                        if not arg or arg.startswith('-'):
+                            continue
+
+                        # Skip common interpreters and shells
+                        if arg in ['python', 'python3', 'node', 'java', 'sh', 'bash', 'zsh']:
+                            continue
+
+                        # Extract filename from path
                         filename = arg.split('/')[-1].split('\\')[-1]
-                        if filename and filename not in ['python', 'python3', 'node', 'java']:
+
+                        # Skip if it's still a generic name
+                        if filename in ['python', 'python3', 'node', 'java', 'sh', 'bash']:
+                            continue
+
+                        # Found a meaningful name
+                        if filename:
                             return filename
-
-            return cmdline[0].split('/')[-1].split('\\')[-1]
-        except Exception:
+
+                    # Fallback to first argument if nothing else worked
+                    if cmdline[0]:
+                        return cmdline[0].split('/')[-1].split('\\')[-1]
+
+            except (psutil.AccessDenied, psutil.NoSuchProcess, psutil.ZombieProcess):
+                pass
+
+            # Final fallback
             return f'PID:{pid}'
-
+
+        except (psutil.NoSuchProcess, psutil.ZombieProcess):
+            return f'PID:{pid}'
+        except Exception as e:
+            logger.debug(f"Error getting process name for PID {pid}: {e}")
+            return f'PID:{pid}'
+
     async def shutdown(self):
         """Async shutdown"""
         if self.initialized:
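
For a sense of how the rewritten `_get_process_name` resolves names, here is a minimal standalone sketch of its cmdline-scanning stage (the real method first tries `psutil.Process.name()` and wraps everything in psutil error handling); `pick_name` and the sample cmdlines are hypothetical, and the commit's two skip lists are condensed into one here:

```python
# Condensed sketch of the cmdline-scanning fallback added in this commit.
# pick_name and the sample cmdlines are hypothetical illustrations.
GENERIC = ['python', 'python3', 'node', 'java', 'sh', 'bash', 'zsh']

def pick_name(cmdline, pid):
    for arg in cmdline:
        if not arg or arg.startswith('-'):
            continue  # skip flags and empty args
        if arg in GENERIC:
            continue  # skip bare interpreter/shell names
        filename = arg.split('/')[-1].split('\\')[-1]
        if filename in GENERIC:
            continue  # a path that still resolves to an interpreter
        if filename:
            return filename  # first meaningful token wins
    if cmdline and cmdline[0]:
        return cmdline[0].split('/')[-1].split('\\')[-1]
    return f'PID:{pid}'

print(pick_name(['python3', 'train.py', '--epochs', '10'], 4242))   # train.py
print(pick_name(['/usr/bin/python3', '-m', 'http.server'], 4243))   # http.server
```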

docker-compose.yml

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,8 @@ services:
             - driver: nvidia
               count: all
               capabilities: [gpu]
+    init: true
+    pid: "host"
     restart: unless-stopped
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:1312/api/gpu-data"]
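
Put together, the service definition after this change would look roughly like the sketch below; the service name, image, and port mapping are assumptions (the port is taken from the README), and only `init: true` and `pid: "host"` are added by this commit:

```yaml
# Rough sketch of the resulting service block; service name, image, and ports
# are assumptions. Only init: true and pid: "host" come from this commit.
services:
  gpu-hot:
    image: gpu-hot            # assumed image name
    ports:
      - "1312:1312"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    init: true
    pid: "host"
    restart: unless-stopped
```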
