Skip to content

Commit c48d058

Browse files
Andrew1326Syllo
authored andcommitted
Fix assertion failure when duplicate client_id encountered
The assertion "We should not be processing a client id twice per update" can fail when a process has multiple file descriptors referencing the same DRM client (e.g., via dup(), fork(), or DRM master operations). The kcmp syscall filters duplicate file descriptions but not distinct file descriptions that report the same underlying DRM client_id. This change converts the debug assertion into a runtime check that gracefully skips duplicate entries and frees any newly allocated cache entries to prevent memory leaks. Fixes the crash: nvtop: ./src/extract_gpuinfo_amdgpu.c:964: parse_drm_fdinfo_amd: Assertion `!cache_entry_check && "We should not be processing a client id twice per update"' failed. Applied to all affected drivers: - AMDGPU - Intel i915 - Intel Xe - Qualcomm MSM (also fixed incorrect hash key usage) - ARM Mali
1 parent c757823 commit c48d058

File tree

5 files changed

+46
-21
lines changed

5 files changed

+46
-21
lines changed

src/extract_gpuinfo_amdgpu.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -980,12 +980,17 @@ static bool parse_drm_fdinfo_amd(struct gpu_info *info, FILE *fdinfo_file, struc
980980
if (static_info->encode_decode_shared)
981981
SET_GPUINFO_PROCESS(process_info, decode_usage, process_info->decode_usage + process_info->encode_usage);
982982

983-
#ifndef NDEBUG
984-
// We should only process one fdinfo entry per client id per update
983+
// Check if we already processed this client_id in the current update cycle.
984+
// This can happen when a process has multiple file descriptors referencing
985+
// the same DRM client (e.g., via DRM master operations).
985986
struct amdgpu_process_info_cache *cache_entry_check;
986987
HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
987-
assert(!cache_entry_check && "We should not be processing a client id twice per update");
988-
#endif
988+
if (cache_entry_check) {
989+
// Already processed this client_id, free the entry if we allocated it
990+
if (cache_entry != cache_entry_check)
991+
free(cache_entry);
992+
goto parse_fdinfo_exit;
993+
}
989994

990995
// Store this measurement data
991996
RESET_ALL(cache_entry->valid);

src/extract_gpuinfo_intel_i915.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -303,12 +303,17 @@ bool parse_drm_fdinfo_intel_i915(struct gpu_info *info, FILE *fdinfo_file, struc
303303
cache_entry->client_id.pdev = gpu_info->base.pdev;
304304
}
305305

306-
#ifndef NDEBUG
307-
// We should only process one fdinfo entry per client id per update
306+
// Check if we already processed this client_id in the current update cycle.
307+
// This can happen when a process has multiple file descriptors referencing
308+
// the same DRM client (e.g., via DRM master operations).
308309
struct intel_process_info_cache *cache_entry_check;
309310
HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
310-
assert(!cache_entry_check && "We should not be processing a client id twice per update");
311-
#endif
311+
if (cache_entry_check) {
312+
// Already processed this client_id, free the entry if we allocated it
313+
if (cache_entry != cache_entry_check)
314+
free(cache_entry);
315+
goto parse_fdinfo_exit;
316+
}
312317

313318
RESET_ALL(cache_entry->valid);
314319
if (GPUINFO_PROCESS_FIELD_VALID(process_info, gfx_engine_used))

src/extract_gpuinfo_intel_xe.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -254,12 +254,17 @@ bool parse_drm_fdinfo_intel_xe(struct gpu_info *info, FILE *fdinfo_file, struct
254254
cache_entry->client_id.pdev = gpu_info->base.pdev;
255255
}
256256

257-
#ifndef NDEBUG
258-
// We should only process one fdinfo entry per client id per update
257+
// Check if we already processed this client_id in the current update cycle.
258+
// This can happen when a process has multiple file descriptors referencing
259+
// the same DRM client (e.g., via DRM master operations).
259260
struct intel_process_info_cache *cache_entry_check;
260261
HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
261-
assert(!cache_entry_check && "We should not be processing a client id twice per update");
262-
#endif
262+
if (cache_entry_check) {
263+
// Already processed this client_id, free the entry if we allocated it
264+
if (cache_entry != cache_entry_check)
265+
free(cache_entry);
266+
goto parse_fdinfo_exit;
267+
}
263268

264269
RESET_ALL(cache_entry->valid);
265270
SET_INTEL_CACHE(cache_entry, gpu_cycles, gpu_cycles);

src/extract_gpuinfo_mali_common.c

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -435,12 +435,17 @@ void mali_common_parse_fdinfo_handle_cache(struct gpu_info_mali *gpu_info,
435435
cache_entry->last_cycles = total_cycles;
436436
}
437437

438-
#ifndef NDEBUG
439-
// We should only process one fdinfo entry per client id per update
438+
// Check if we already processed this client_id in the current update cycle.
439+
// This can happen when a process has multiple file descriptors referencing
440+
// the same DRM client (e.g., via DRM master operations).
440441
struct mali_process_info_cache *cache_entry_check;
441442
HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
442-
assert(!cache_entry_check && "We should not be processing a client id twice per update");
443-
#endif
443+
if (cache_entry_check) {
444+
// Already processed this client_id, free the entry if we allocated it
445+
if (cache_entry != cache_entry_check)
446+
free(cache_entry);
447+
return;
448+
}
444449

445450
RESET_ALL(cache_entry->valid);
446451
if (GPUINFO_PROCESS_FIELD_VALID(process_info, gfx_engine_used))

src/extract_gpuinfo_msm.c

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -357,12 +357,17 @@ static bool parse_drm_fdinfo_msm(struct gpu_info *info, FILE *fdinfo_file, struc
357357
cache_entry->client_id.pid = process_info->pid;
358358
}
359359

360-
#ifndef NDEBUG
361-
// We should only process one fdinfo entry per client id per update
360+
// Check if we already processed this client_id in the current update cycle.
361+
// This can happen when a process has multiple file descriptors referencing
362+
// the same DRM client (e.g., via DRM master operations).
362363
struct msm_process_info_cache *cache_entry_check;
363-
HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cid, cache_entry_check);
364-
assert(!cache_entry_check && "We should not be processing a client id twice per update");
365-
#endif
364+
HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
365+
if (cache_entry_check) {
366+
// Already processed this client_id, free the entry if we allocated it
367+
if (cache_entry != cache_entry_check)
368+
free(cache_entry);
369+
goto parse_fdinfo_exit;
370+
}
366371

367372
RESET_ALL(cache_entry->valid);
368373
if (GPUINFO_PROCESS_FIELD_VALID(process_info, gfx_engine_used))

0 commit comments

Comments
 (0)