Skip to content

Commit 31389b5

Browse files
authored
fix: use with_global_system() in Jetson, Tenstorrent, and NVIDIA readers (#121)
Replace per-call System::new() instances with with_global_system() to prevent file descriptor leaks in API mode (long-running metrics loop). Changes: - NVIDIA Jetson (nvidia_jetson.rs): - Line 176: get_process_info() now uses with_global_system() - Line 261: get_gpu_processes() helper now uses with_global_system() - Tenstorrent (tenstorrent.rs): - Line 201: get_process_info() now uses with_global_system() - NVIDIA (nvidia.rs): - Removed system: Mutex<System> field from struct - Migrated get_process_info() to use with_global_system() for consistency This follows the standard pattern already used by AMD reader, Apple Silicon reader, and local collector. Fixes #120
1 parent 2e7193c commit 31389b5

File tree

3 files changed

+73
-74
lines changed

3 files changed

+73
-74
lines changed

src/device/readers/nvidia.rs

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,13 @@ use crate::device::process_list::{get_all_processes, merge_gpu_processes};
1818
use crate::device::readers::common_cache::{DetailBuilder, DeviceStaticInfo, MAX_DEVICES};
1919
use crate::device::types::{GpuInfo, ProcessInfo};
2020
use crate::device::GpuReader;
21-
use crate::utils::get_hostname;
21+
use crate::utils::{get_hostname, with_global_system};
2222
use chrono::Local;
2323
use nvml_wrapper::enums::device::UsedGpuMemory;
2424
use nvml_wrapper::error::NvmlError;
2525
use nvml_wrapper::{cuda_driver_version_major, cuda_driver_version_minor, Nvml};
2626
use std::collections::{HashMap, HashSet};
2727
use std::sync::{Mutex, OnceLock};
28-
use sysinfo::System;
2928

3029
// Global status for NVML error messages
3130
static NVML_STATUS: Mutex<Option<String>> = Mutex::new(None);
@@ -39,8 +38,6 @@ pub struct NvidiaGpuReader {
3938
device_static_info: OnceLock<HashMap<u32, DeviceStaticInfo>>,
4039
/// Cached NVML handle (initialized once, reused across calls)
4140
nvml: Mutex<Option<Nvml>>,
42-
/// Cached System instance for process info (reused across calls)
43-
system: Mutex<System>,
4441
}
4542

4643
impl Default for NvidiaGpuReader {
@@ -56,7 +53,6 @@ impl NvidiaGpuReader {
5653
cuda_version: OnceLock::new(),
5754
device_static_info: OnceLock::new(),
5855
nvml: Mutex::new(Nvml::init().ok()),
59-
system: Mutex::new(System::new()),
6056
}
6157
}
6258

@@ -231,20 +227,21 @@ impl GpuReader for NvidiaGpuReader {
231227
fn get_process_info(&self) -> Vec<ProcessInfo> {
232228
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind};
233229

234-
// Reuse the cached System instance
235-
let mut system = self.system.lock().unwrap_or_else(|e| e.into_inner());
236-
system.refresh_processes_specifics(
237-
ProcessesToUpdate::All,
238-
true,
239-
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
240-
);
241-
system.refresh_memory();
242-
243230
// Get GPU processes and PIDs using cached NVML handle
244231
let (gpu_processes, gpu_pids) = self.get_gpu_processes_cached();
245232

246-
// Get all system processes
247-
let mut all_processes = get_all_processes(&system, &gpu_pids);
233+
// Use global system instance to avoid file descriptor leak
234+
let mut all_processes = with_global_system(|system| {
235+
system.refresh_processes_specifics(
236+
ProcessesToUpdate::All,
237+
true,
238+
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
239+
);
240+
system.refresh_memory();
241+
242+
// Get all system processes
243+
get_all_processes(system, &gpu_pids)
244+
});
248245

249246
// Merge GPU information into the process list
250247
merge_gpu_processes(&mut all_processes, gpu_processes);

src/device/readers/nvidia_jetson.rs

Lines changed: 47 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,11 @@ use crate::device::process_list::{get_all_processes, merge_gpu_processes};
1717
use crate::device::readers::common_cache::{DetailBuilder, DeviceStaticInfo};
1818
use crate::device::types::{GpuInfo, ProcessInfo};
1919
use crate::device::GpuReader;
20-
use crate::utils::{get_hostname, hz_to_mhz, millicelsius_to_celsius};
20+
use crate::utils::{get_hostname, hz_to_mhz, millicelsius_to_celsius, with_global_system};
2121
use chrono::Local;
2222
use std::collections::HashSet;
2323
use std::fs;
2424
use std::sync::OnceLock;
25-
use sysinfo::System;
2625

2726
pub struct NvidiaJetsonGpuReader {
2827
/// Cached static device information (fetched only once)
@@ -171,22 +170,23 @@ impl GpuReader for NvidiaJetsonGpuReader {
171170
}
172171

173172
fn get_process_info(&self) -> Vec<ProcessInfo> {
174-
// Create a lightweight system instance and only refresh what we need
175173
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind};
176-
let mut system = System::new();
177-
// Refresh processes with user information
178-
system.refresh_processes_specifics(
179-
ProcessesToUpdate::All,
180-
true,
181-
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
182-
);
183-
system.refresh_memory();
184174

185175
// Get GPU processes and PIDs
186176
let (gpu_processes, gpu_pids) = get_gpu_processes();
187177

188-
// Get all system processes
189-
let mut all_processes = get_all_processes(&system, &gpu_pids);
178+
// Use global system instance to avoid file descriptor leak
179+
let mut all_processes = with_global_system(|system| {
180+
system.refresh_processes_specifics(
181+
ProcessesToUpdate::All,
182+
true,
183+
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
184+
);
185+
system.refresh_memory();
186+
187+
// Get all system processes
188+
get_all_processes(system, &gpu_pids)
189+
});
190190

191191
// Merge GPU information into the process list
192192
merge_gpu_processes(&mut all_processes, gpu_processes);
@@ -258,41 +258,42 @@ fn get_gpu_processes() -> (Vec<ProcessInfo>, HashSet<u32>) {
258258
"cuda",
259259
];
260260

261-
let mut system = System::new();
262-
system.refresh_memory();
263-
for (pid, process) in system.processes() {
264-
let process_name = process.name().to_string_lossy().to_lowercase();
265-
for gpu_name in &gpu_process_names {
266-
if process_name.contains(gpu_name) {
267-
let pid_u32 = pid.as_u32();
268-
gpu_pids.insert(pid_u32);
269-
270-
gpu_processes.push(ProcessInfo {
271-
device_id: 0,
272-
device_uuid: "JetsonGPU".to_string(),
273-
pid: pid_u32,
274-
process_name: String::new(), // Will be filled by sysinfo
275-
used_memory: 0, // Can't determine GPU memory usage without nvidia-smi
276-
cpu_percent: 0.0, // Will be filled by sysinfo
277-
memory_percent: 0.0, // Will be filled by sysinfo
278-
memory_rss: 0, // Will be filled by sysinfo
279-
memory_vms: 0, // Will be filled by sysinfo
280-
user: String::new(), // Will be filled by sysinfo
281-
state: String::new(), // Will be filled by sysinfo
282-
start_time: String::new(), // Will be filled by sysinfo
283-
cpu_time: 0, // Will be filled by sysinfo
284-
command: String::new(), // Will be filled by sysinfo
285-
ppid: 0, // Will be filled by sysinfo
286-
threads: 0, // Will be filled by sysinfo
287-
uses_gpu: true,
288-
priority: 0, // Will be filled by sysinfo
289-
nice_value: 0, // Will be filled by sysinfo
290-
gpu_utilization: 0.0, // Can't determine per-process GPU utilization
291-
});
292-
break;
261+
with_global_system(|system| {
262+
system.refresh_memory();
263+
for (pid, process) in system.processes() {
264+
let process_name = process.name().to_string_lossy().to_lowercase();
265+
for gpu_name in &gpu_process_names {
266+
if process_name.contains(gpu_name) {
267+
let pid_u32 = pid.as_u32();
268+
gpu_pids.insert(pid_u32);
269+
270+
gpu_processes.push(ProcessInfo {
271+
device_id: 0,
272+
device_uuid: "JetsonGPU".to_string(),
273+
pid: pid_u32,
274+
process_name: String::new(), // Will be filled by sysinfo
275+
used_memory: 0, // Can't determine GPU memory usage without nvidia-smi
276+
cpu_percent: 0.0, // Will be filled by sysinfo
277+
memory_percent: 0.0, // Will be filled by sysinfo
278+
memory_rss: 0, // Will be filled by sysinfo
279+
memory_vms: 0, // Will be filled by sysinfo
280+
user: String::new(), // Will be filled by sysinfo
281+
state: String::new(), // Will be filled by sysinfo
282+
start_time: String::new(), // Will be filled by sysinfo
283+
cpu_time: 0, // Will be filled by sysinfo
284+
command: String::new(), // Will be filled by sysinfo
285+
ppid: 0, // Will be filled by sysinfo
286+
threads: 0, // Will be filled by sysinfo
287+
uses_gpu: true,
288+
priority: 0, // Will be filled by sysinfo
289+
nice_value: 0, // Will be filled by sysinfo
290+
gpu_utilization: 0.0, // Can't determine per-process GPU utilization
291+
});
292+
break;
293+
}
293294
}
294295
}
295-
}
296+
});
296297
}
297298

298299
(gpu_processes, gpu_pids)

src/device/readers/tenstorrent.rs

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use crate::device::process_list::{get_all_processes, merge_gpu_processes};
1616
use crate::device::readers::common_cache::{DetailBuilder, DeviceStaticInfo};
1717
use crate::device::types::{GpuInfo, ProcessInfo};
1818
use crate::device::GpuReader;
19-
use crate::utils::get_hostname;
19+
use crate::utils::{get_hostname, with_global_system};
2020
use all_smi_luwen_core;
2121
use all_smi_luwen_if::chip::{Chip, ChipImpl, Telemetry};
2222
use all_smi_luwen_if::ChipDetectOptions;
@@ -25,7 +25,6 @@ use chrono::Local;
2525
use once_cell::sync::Lazy;
2626
use std::collections::{HashMap, HashSet};
2727
use std::sync::Mutex;
28-
use sysinfo::System;
2928

3029
/// Collection method for Tenstorrent NPU metrics
3130
#[derive(Debug, Clone, Copy)]
@@ -196,21 +195,23 @@ impl GpuReader for TenstorrentReader {
196195
}
197196

198197
fn get_process_info(&self) -> Vec<ProcessInfo> {
199-
// Create system instance and refresh processes
200198
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind};
201-
let mut system = System::new();
202-
system.refresh_processes_specifics(
203-
ProcessesToUpdate::All,
204-
true,
205-
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
206-
);
207-
system.refresh_memory();
208199

209200
// Get NPU processes (currently empty for Tenstorrent)
210201
let (npu_processes, npu_pids) = self.get_npu_processes();
211202

212-
// Get all system processes
213-
let mut all_processes = get_all_processes(&system, &npu_pids);
203+
// Use global system instance to avoid file descriptor leak
204+
let mut all_processes = with_global_system(|system| {
205+
system.refresh_processes_specifics(
206+
ProcessesToUpdate::All,
207+
true,
208+
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
209+
);
210+
system.refresh_memory();
211+
212+
// Get all system processes
213+
get_all_processes(system, &npu_pids)
214+
});
214215

215216
// Merge NPU information
216217
merge_gpu_processes(&mut all_processes, npu_processes);

0 commit comments

Comments
 (0)