Skip to content

Commit 8fa9696

Browse files
committed
Fix: filter out nvidia utilization samples
The function nvmlDeviceGetProcessUtilization might return more samples than there are processes running on the GPU. Furthermore, most of the returned samples are filled with values that either do not make sense (e.g. >100% utilization rate) or carry a timestamp older than the one we asked for. Fixes #110.
1 parent e4c38da commit 8fa9696

File tree

1 file changed

+20
-4
lines changed

1 file changed

+20
-4
lines changed

src/extract_gpuinfo_nvidia.c

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -608,14 +608,29 @@ static void gpuinfo_nvidia_get_process_utilization(
608608
free(samples);
609609
return;
610610
}
611-
if (samples_count) {
612-
internal->last_utilization_timestamp = samples[0].timeStamp;
613-
}
611+
unsigned long long newest_timestamp_candidate =
612+
internal->last_utilization_timestamp;
614613
for (unsigned i = 0; i < samples_count; ++i) {
615614
bool process_matched = false;
616615
for (unsigned j = 0; !process_matched && j < num_processes_recovered;
617616
++j) {
618-
if ((pid_t)samples[i].pid == processes[j].pid) {
617+
// Filter out samples due to inconsistency in the results returned by
618+
// the function nvmlDeviceGetProcessUtilization (see bug #110 on
619+
// GitHub). Check for a valid running process returned by
620+
// nvmlDeviceGetComputeRunningProcesses or
621+
// nvmlDeviceGetGraphicsRunningProcesses, filter out inconsistent
622+
// utilization value greater than 100% and filter out timestamp results
623+
// that are less recent than what we were asking for
624+
if ((pid_t)samples[i].pid == processes[j].pid &&
625+
samples[i].smUtil <= 100 && samples[i].encUtil <= 100 &&
626+
samples[i].decUtil <= 100 &&
627+
samples[i].timeStamp > internal->last_utilization_timestamp) {
628+
// Collect the largest valid timestamp for this device to filter out
629+
// the samples during the next call to the function
630+
// nvmlDeviceGetProcessUtilization
631+
if (samples[i].timeStamp > newest_timestamp_candidate)
632+
newest_timestamp_candidate = samples[i].timeStamp;
633+
619634
processes[j].gpu_usage = samples[i].smUtil;
620635
SET_VALID(gpuinfo_process_gpu_usage_valid, processes[j].valid);
621636
processes[j].encode_usage = samples[i].encUtil;
@@ -626,6 +641,7 @@ static void gpuinfo_nvidia_get_process_utilization(
626641
}
627642
}
628643
}
644+
internal->last_utilization_timestamp = newest_timestamp_candidate;
629645
free(samples);
630646
}
631647
}

0 commit comments

Comments
 (0)