Fix assertion failure when duplicate client_id encountered

Andrew1326 · Syllo · commit c48d0586bc5b · 2026-01-16T10:57:21.000+01:00
The assertion "We should not be processing a client id twice per update"
can fail when a process has multiple file descriptors referencing the
same DRM client (e.g., via dup(), fork(), or DRM master operations).

The kcmp syscall filters duplicate file descriptions but not distinct
file descriptions that report the same underlying DRM client_id.

This change converts the debug assertion into a runtime check that
gracefully skips duplicate entries and frees any newly allocated
cache entries to prevent memory leaks.

Fixes the crash:
  nvtop: ./src/extract_gpuinfo_amdgpu.c:964: parse_drm_fdinfo_amd:
  Assertion `!cache_entry_check &amp;&amp; "We should not be processing a
  client id twice per update"' failed.

Applied to all affected drivers:
- AMDGPU
- Intel i915
- Intel Xe
- Qualcomm MSM (also fixed incorrect hash key usage)
- ARM Mali
diff --git a/src/extract_gpuinfo_amdgpu.c b/src/extract_gpuinfo_amdgpu.c
@@ -980,12 +980,17 @@ static bool parse_drm_fdinfo_amd(struct gpu_info *info, FILE *fdinfo_file, struc
     if (static_info->encode_decode_shared)
       SET_GPUINFO_PROCESS(process_info, decode_usage, process_info->decode_usage + process_info->encode_usage);
 
-#ifndef NDEBUG
-    // We should only process one fdinfo entry per client id per update
+    // Check if we already processed this client_id in the current update cycle.
+    // This can happen when a process has multiple file descriptors referencing
+    // the same DRM client (e.g., via DRM master operations).
     struct amdgpu_process_info_cache *cache_entry_check;
     HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
-    assert(!cache_entry_check && "We should not be processing a client id twice per update");
-#endif
+    if (cache_entry_check) {
+      // Already processed this client_id, free the entry if we allocated it
+      if (cache_entry != cache_entry_check)
+        free(cache_entry);
+      goto parse_fdinfo_exit;
+    }
 
     // Store this measurement data
     RESET_ALL(cache_entry->valid);
diff --git a/src/extract_gpuinfo_intel_i915.c b/src/extract_gpuinfo_intel_i915.c
@@ -303,12 +303,17 @@ bool parse_drm_fdinfo_intel_i915(struct gpu_info *info, FILE *fdinfo_file, struc
     cache_entry->client_id.pdev = gpu_info->base.pdev;
   }
 
-#ifndef NDEBUG
-  // We should only process one fdinfo entry per client id per update
+  // Check if we already processed this client_id in the current update cycle.
+  // This can happen when a process has multiple file descriptors referencing
+  // the same DRM client (e.g., via DRM master operations).
   struct intel_process_info_cache *cache_entry_check;
   HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
-  assert(!cache_entry_check && "We should not be processing a client id twice per update");
-#endif
+  if (cache_entry_check) {
+    // Already processed this client_id, free the entry if we allocated it
+    if (cache_entry != cache_entry_check)
+      free(cache_entry);
+    goto parse_fdinfo_exit;
+  }
 
   RESET_ALL(cache_entry->valid);
   if (GPUINFO_PROCESS_FIELD_VALID(process_info, gfx_engine_used))
diff --git a/src/extract_gpuinfo_intel_xe.c b/src/extract_gpuinfo_intel_xe.c
@@ -254,12 +254,17 @@ bool parse_drm_fdinfo_intel_xe(struct gpu_info *info, FILE *fdinfo_file, struct
     cache_entry->client_id.pdev = gpu_info->base.pdev;
   }
 
-#ifndef NDEBUG
-  // We should only process one fdinfo entry per client id per update
+  // Check if we already processed this client_id in the current update cycle.
+  // This can happen when a process has multiple file descriptors referencing
+  // the same DRM client (e.g., via DRM master operations).
   struct intel_process_info_cache *cache_entry_check;
   HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
-  assert(!cache_entry_check && "We should not be processing a client id twice per update");
-#endif
+  if (cache_entry_check) {
+    // Already processed this client_id, free the entry if we allocated it
+    if (cache_entry != cache_entry_check)
+      free(cache_entry);
+    goto parse_fdinfo_exit;
+  }
 
   RESET_ALL(cache_entry->valid);
   SET_INTEL_CACHE(cache_entry, gpu_cycles, gpu_cycles);
diff --git a/src/extract_gpuinfo_mali_common.c b/src/extract_gpuinfo_mali_common.c
@@ -435,12 +435,17 @@ void mali_common_parse_fdinfo_handle_cache(struct gpu_info_mali *gpu_info,
     cache_entry->last_cycles = total_cycles;
   }
 
-#ifndef NDEBUG
-  // We should only process one fdinfo entry per client id per update
+  // Check if we already processed this client_id in the current update cycle.
+  // This can happen when a process has multiple file descriptors referencing
+  // the same DRM client (e.g., via DRM master operations).
   struct mali_process_info_cache *cache_entry_check;
   HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
-  assert(!cache_entry_check && "We should not be processing a client id twice per update");
-#endif
+  if (cache_entry_check) {
+    // Already processed this client_id, free the entry if we allocated it
+    if (cache_entry != cache_entry_check)
+      free(cache_entry);
+    return;
+  }
 
   RESET_ALL(cache_entry->valid);
   if (GPUINFO_PROCESS_FIELD_VALID(process_info, gfx_engine_used))
diff --git a/src/extract_gpuinfo_msm.c b/src/extract_gpuinfo_msm.c
@@ -357,12 +357,17 @@ static bool parse_drm_fdinfo_msm(struct gpu_info *info, FILE *fdinfo_file, struc
     cache_entry->client_id.pid = process_info->pid;
   }
 
-#ifndef NDEBUG
-  // We should only process one fdinfo entry per client id per update
+  // Check if we already processed this client_id in the current update cycle.
+  // This can happen when a process has multiple file descriptors referencing
+  // the same DRM client (e.g., via DRM master operations).
   struct msm_process_info_cache *cache_entry_check;
-  HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cid, cache_entry_check);
-  assert(!cache_entry_check && "We should not be processing a client id twice per update");
-#endif
+  HASH_FIND_CLIENT(gpu_info->current_update_process_cache, &cache_entry->client_id, cache_entry_check);
+  if (cache_entry_check) {
+    // Already processed this client_id, free the entry if we allocated it
+    if (cache_entry != cache_entry_check)
+      free(cache_entry);
+    goto parse_fdinfo_exit;
+  }
 
   RESET_ALL(cache_entry->valid);
   if (GPUINFO_PROCESS_FIELD_VALID(process_info, gfx_engine_used))