Skip to content

Commit 24c9bfb

Browse files
authored
[SWDEV-488303] Updated CU occupancy for per-process retrieval (#243)
Change-Id: I2990597c6dd4b2e8cf3e11ce60f72049ebdd9a8c Signed-off-by: Maisam Arif <Maisam.Arif@amd.com> [ROCm/amdsmi commit: 0fdaebd]
1 parent ccdf4f0 commit 24c9bfb

File tree

12 files changed

+258
-285
lines changed

12 files changed

+258
-285
lines changed

projects/amdsmi/CHANGELOG.md

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
1212

1313
### Changed
1414

15+
- **Added Compute Unit Occupancy information per process**
16+
Measuring compute units is currently the best way to determine gfx usage on a per-process basis.
17+
- Added `CU_OCCUPANCY` to `amd-smi process` output.
18+
- Added `CU%` to `amd-smi monitor -q`
19+
1520
- **Expanded Violation Status tracking for GPU metrics 1.8.**
1621
- The driver no longer supports the existing single-value GFX Clk Below Host Limit fields (`acc_gfx_clk_below_host_limit`, `per_gfx_clk_below_host_limit`, `active_gfx_clk_below_host_limit`); they have been replaced by new per-XCP/XCC arrays.
1722
- Added new fields to `amdsmi_violation_status_t` and related interfaces for enhanced violation breakdown:
@@ -54,11 +59,8 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
5459

5560
### Resolved issues
5661

57-
- N/A
58-
59-
### Upcoming changes
60-
61-
- N/A
62+
- **Corrected VRAM memory calculation in `amdsmi_get_gpu_process_list`.**
63+
- Previously, the VRAM memory usage reported by `amdsmi_get_gpu_process_list` was inaccurate because it was calculated in KB instead of KiB.
6264

6365
### Known issues
6466

projects/amdsmi/amdsmi_cli/amdsmi_commands.py

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3300,8 +3300,21 @@ def process(self, args, multiple_devices=False, watching_output=False,
33003300

33013301
filtered_process_values = []
33023302
for process_info in process_list:
3303-
process_info['mem_usage'] = process_info.pop('mem')
3304-
process_info['usage'] = process_info.pop('engine_usage')
3303+
process_info = {
3304+
"name": process_info["name"],
3305+
"pid": process_info["pid"],
3306+
"memory_usage": {
3307+
"gtt_mem": process_info["memory_usage"]["gtt_mem"],
3308+
"cpu_mem": process_info["memory_usage"]["cpu_mem"],
3309+
"vram_mem": process_info["memory_usage"]["vram_mem"],
3310+
},
3311+
"mem_usage": process_info["mem"],
3312+
"usage": {
3313+
"gfx": process_info["engine_usage"]["gfx"],
3314+
"enc": process_info["engine_usage"]["enc"],
3315+
},
3316+
"cu_occupancy": process_info["cu_occupancy"]
3317+
}
33053318

33063319
engine_usage_unit = "ns"
33073320
memory_usage_unit = "B"
@@ -5714,35 +5727,43 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
57145727
logging.debug("Failed to get process list for gpu %s | %s", gpu_id, e.get_error_info())
57155728
raise e
57165729

5730+
try:
5731+
num_compute_units = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu)['num_compute_units']
5732+
except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e:
5733+
num_compute_units = "N/A"
5734+
logging.debug("Failed to get num compute units for gpu %s | %s", gpu_id, e.get_error_info())
5735+
57175736
# Clean processes dictionary
57185737
filtered_process_values = []
57195738
for process_info in process_list:
5720-
process_info['mem_usage'] = process_info.pop('mem')
5721-
process_info['usage'] = process_info.pop('engine_usage')
5739+
process_info.pop('mem') # Remove 'mem' value
5740+
process_info.pop('engine_usage') # Remove 'engine_usage' value
57225741

5723-
engine_usage_unit = "ns"
57245742
memory_usage_unit = "B"
5725-
57265743
if self.logger.is_human_readable_format():
5727-
process_info['mem_usage'] = self.helpers.convert_bytes_to_readable(process_info['mem_usage'])
57285744
for usage_metric in process_info['memory_usage']:
57295745
process_info["memory_usage"][usage_metric] = self.helpers.convert_bytes_to_readable(process_info["memory_usage"][usage_metric])
57305746
memory_usage_unit = ""
57315747

5732-
process_info['mem_usage'] = self.helpers.unit_format(self.logger,
5733-
process_info['mem_usage'],
5734-
memory_usage_unit)
5735-
5736-
for usage_metric in process_info['usage']:
5737-
process_info['usage'][usage_metric] = self.helpers.unit_format(self.logger,
5738-
process_info['usage'][usage_metric],
5739-
engine_usage_unit)
5740-
57415748
for usage_metric in process_info['memory_usage']:
57425749
process_info['memory_usage'][usage_metric] = self.helpers.unit_format(self.logger,
57435750
process_info['memory_usage'][usage_metric],
57445751
memory_usage_unit)
57455752

5753+
if 'cu_occupancy' in process_info:
5754+
try:
5755+
cu_occupancy = process_info['cu_occupancy']
5756+
if num_compute_units != "N/A" and num_compute_units > 0:
5757+
cu_percentage = round((cu_occupancy / num_compute_units) * 100, 1)
5758+
process_info['cu_occupancy'] = self.helpers.unit_format(self.logger,
5759+
cu_percentage,
5760+
'%')
5761+
else:
5762+
process_info['cu_occupancy'] = "N/A"
5763+
except Exception as e:
5764+
process_info['cu_occupancy'] = "N/A"
5765+
logging.debug("Failed to calculate cu_occupancy percentage for GPU %s | %s", gpu_id, str(e))
5766+
57465767
filtered_process_values.append({'process_info': process_info})
57475768

57485769
# If no processes are populated then we populate an N/A placeholder
@@ -5757,8 +5778,7 @@ def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
57575778
# Build the process table's title and header
57585779
self.logger.secondary_table_title = "PROCESS INFO"
57595780
self.logger.secondary_table_header = 'GPU'.rjust(3) + "NAME".rjust(22) + "PID".rjust(9) + "GTT_MEM".rjust(10) + \
5760-
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "MEM_USAGE".rjust(11) + \
5761-
"GFX".rjust(8) + "ENC".rjust(8)
5781+
"CPU_MEM".rjust(10) + "VRAM_MEM".rjust(10) + "CU%".rjust(9)
57625782

57635783
if watching_output:
57645784
self.logger.secondary_table_header = 'TIMESTAMP'.rjust(10) + ' ' + self.logger.secondary_table_header

projects/amdsmi/amdsmi_cli/amdsmi_logger.py

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -214,9 +214,9 @@ def _convert_json_to_tabular(self, json_object: Dict[str, any], dynamic=False):
214214
if process_dict['process_info'] == "No running processes detected":
215215
# Add N/A for empty process_info
216216
table_values += "N/A".rjust(20) + "N/A".rjust(9) + "N/A".rjust(10) + \
217-
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(11) + \
218-
"N/A".rjust(8) + "N/A".rjust(8) + '\n'
217+
"N/A".rjust(10) + "N/A".rjust(10) + "N/A".rjust(9) + '\n'
219218
else:
219+
# Fix this here
220220
for process_key, process_value in process_dict['process_info'].items():
221221
string_process_value = str(process_value)
222222
if process_key == "name":
@@ -230,11 +230,8 @@ def _convert_json_to_tabular(self, json_object: Dict[str, any], dynamic=False):
230230
elif process_key == "memory_usage":
231231
for memory_key, memory_value in process_value.items():
232232
table_values += str(memory_value).rjust(10)
233-
elif process_key == "mem_usage":
234-
table_values += string_process_value.rjust(11)
235-
elif process_key == "usage":
236-
for usage_key, usage_value in process_value.items():
237-
table_values += str(usage_value).rjust(8)
233+
elif process_key == "cu_occupancy":
234+
table_values += string_process_value.rjust(9)
238235
# Add the stored gpu and stored timestamp to the next line
239236
table_values += '\n'
240237
if stored_timestamp:
@@ -486,20 +483,6 @@ def _store_output_amdsmi(self, gpu_id, argument, data):
486483
raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported")
487484

488485

489-
def _store_output_rocmsmi(self, gpu_id, argument, data):
490-
if self.is_json_format():
491-
# put output into self.json_output
492-
pass
493-
elif self.is_csv_format():
494-
# put output into self.csv_output
495-
pass
496-
elif self.is_human_readable_format():
497-
# put output into self.human_readable_output
498-
pass
499-
else:
500-
raise amdsmi_cli_exceptions(self, "Invalid output format given, only json, csv, and human_readable supported")
501-
502-
503486
def store_multiple_device_output(self):
504487
""" Store the current output into the multiple_device_output
505488
then clear the current output

projects/amdsmi/docs/reference/amdsmi-py-api.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1093,7 +1093,6 @@ except AmdSmiException as e:
10931093
print(e)
10941094
```
10951095

1096-
10971096
### amdsmi_get_gpu_process_list
10981097

10991098
Description: Returns the list of processes running on the target GPU. Root-level access is required to display root process names; otherwise "N/A" is returned.
@@ -1111,6 +1110,7 @@ Field | Description
11111110
`mem` | Process memory usage
11121111
`engine_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gfx`</td><td>GFX engine usage in ns</td></tr><tr><td>`enc`</td><td>Encode engine usage in ns</td></tr></tbody></table>
11131112
`memory_usage` | <table><thead><tr> <th> Subfield </th> <th> Description</th> </tr></thead><tbody><tr><td>`gtt_mem`</td><td>GTT memory usage</td></tr><tr><td>`cpu_mem`</td><td>CPU memory usage</td></tr><tr><td>`vram_mem`</td><td>VRAM memory usage</td></tr> </tbody></table>
1113+
`cu_occupancy` | Number of Compute Units utilized
11141114

11151115
Exceptions that can be thrown by `amdsmi_get_gpu_process_list` function:
11161116

projects/amdsmi/example/amd_smi_drm_example.cc

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,7 @@ int main() {
817817
amdsmi_proc_info_t process = {};
818818
uint64_t mem = 0, gtt_mem = 0, cpu_mem = 0, vram_mem = 0;
819819
uint64_t gfx = 0, enc = 0;
820+
uint32_t cu_occupancy = 0;
820821
char bdf_str[20];
821822
sprintf(bdf_str, "%04" PRIx64 ":%02" PRIx32 ":%02" PRIx32 ".%" PRIu32,
822823
static_cast<uint64_t>(bdf.domain_number),
@@ -837,7 +838,7 @@ int main() {
837838
printf(
838839
"| pid | name | user | gpu bdf | "
839840
"fb usage | gtt memory | cpu memory | vram memory | "
840-
"engine usage (ns) |\n");
841+
"engine usage (ns) | cu occupancy |\n");
841842
printf("| | | | "
842843
"| | | | "
843844
" | gfx enc |\n");
@@ -855,30 +856,34 @@ int main() {
855856
pwd = getpwuid(st.st_uid);
856857
if (!pwd)
857858
printf("| %5d | %16s | %10d | %s | %7ld KiB | %7ld KiB "
858-
"| %7ld KiB | %7ld KiB | %lu %lu |\n",
859+
"| %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
859860
process_info_list[it].pid, process_info_list[it].name, st.st_uid,
860861
bdf_str, process_info_list[it].mem / 1024,
861862
process_info_list[it].memory_usage.gtt_mem / 1024,
862863
process_info_list[it].memory_usage.cpu_mem / 1024,
863864
process_info_list[it].memory_usage.vram_mem / 1024,
864865
process_info_list[it].engine_usage.gfx,
865-
process_info_list[it].engine_usage.enc);
866+
process_info_list[it].engine_usage.enc,
867+
process_info_list[it].cu_occupancy);
866868
else
867869
printf("| %5d | %16s | %10s | %s | %7ld KiB | %7ld KiB "
868-
"| %7ld KiB | %7ld KiB | %lu %lu |\n",
870+
"| %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
869871
process_info_list[it].pid, process_info_list[it].name,
870872
pwd->pw_name, bdf_str, process_info_list[it].mem / 1024,
871873
process_info_list[it].memory_usage.gtt_mem / 1024,
872874
process_info_list[it].memory_usage.cpu_mem / 1024,
873875
process_info_list[it].memory_usage.vram_mem / 1024,
874876
process_info_list[it].engine_usage.gfx,
875-
process_info_list[it].engine_usage.enc);
877+
process_info_list[it].engine_usage.enc,
878+
process_info_list[it].cu_occupancy);
879+
876880
mem += process_info_list[it].mem / 1024;
877881
gtt_mem += process_info_list[it].memory_usage.gtt_mem / 1024;
878882
cpu_mem += process_info_list[it].memory_usage.cpu_mem / 1024;
879883
vram_mem += process_info_list[it].memory_usage.vram_mem / 1024;
880884
gfx = process_info_list[it].engine_usage.gfx;
881885
enc = process_info_list[it].engine_usage.enc;
886+
cu_occupancy = process_info_list[it].cu_occupancy;
882887
printf(
883888
"+-------+------------------+------------+-------------"
884889
"-+-------------+-------------+-------------+----------"
@@ -887,10 +892,9 @@ int main() {
887892
// TODO: To remove compiler warning, the last 3 values in this printf were
888893
// set to 0L. Need to find out what these values need to be.
889894
printf("| TOTAL:| %s | %7ld "
890-
"KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu "
891-
"%lu %lu %lu |\n",
895+
"KiB | %7ld KiB | %7ld KiB | %7ld KiB | %lu %lu | %u |\n",
892896
bdf_str, mem, gtt_mem, cpu_mem, vram_mem, gfx,
893-
enc, 0L, 0L, 0L);
897+
enc, cu_occupancy, 0L);
894898
printf("+=======+==================+============+=============="
895899
"+=============+=============+=============+============"
896900
"=+==========================================+\n");

projects/amdsmi/include/amd_smi/amdsmi.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1093,7 +1093,8 @@ typedef struct {
10931093
uint32_t reserved[10];
10941094
} memory_usage; //!< in bytes
10951095
char container_name[AMDSMI_MAX_STRING_LENGTH];
1096-
uint32_t reserved[12];
1096+
uint32_t cu_occupancy; //!< Num CUs utilized
1097+
uint32_t reserved[11];
10971098
} amdsmi_proc_info_t;
10981099

10991100
/**

projects/amdsmi/py-interface/amdsmi_interface.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2691,6 +2691,7 @@ def amdsmi_get_gpu_process_list(
26912691
"cpu_mem": process_list[index].memory_usage.cpu_mem,
26922692
"vram_mem": process_list[index].memory_usage.vram_mem,
26932693
},
2694+
"cu_occupancy": process_list[index].cu_occupancy
26942695
})
26952696

26962697
return result

projects/amdsmi/py-interface/amdsmi_wrapper.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1289,7 +1289,8 @@ class struct_engine_usage_(Structure):
12891289
('engine_usage', struct_engine_usage_),
12901290
('memory_usage', struct_memory_usage_),
12911291
('container_name', ctypes.c_char * 256),
1292-
('reserved', ctypes.c_uint32 * 12),
1292+
('cu_occupancy', ctypes.c_uint32),
1293+
('PADDING_1', ctypes.c_ubyte * 4),
12931294
]
12941295

12951296
amdsmi_proc_info_t = struct_amdsmi_proc_info_t

projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -456,7 +456,6 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
456456
proc->sdma_usage = 0;
457457
proc->cu_occupancy = 0;
458458

459-
uint32_t cu_count = 0;
460459
static amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
461460
static std::map<uint64_t, std::shared_ptr<KFDNode>>& kfd_node_map =
462461
smi.kfd_node_map();
@@ -510,23 +509,15 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc,
510509
}
511510
else if(sysfs_data_errcode==0){
512511
// Update CU usage by the process
513-
proc->cu_occupancy += std::stoi(tmp);
514-
// Collect count of compute units
515-
cu_count += kfd_node_map[gpu_id]->cu_count();
512+
proc->cu_occupancy = std::stoi(tmp);
516513
}
517514
else {
518515
// Some GFX revisions do not provide cu_occupancy debugfs method
519516
// which may cause ENOENT
520517
proc->cu_occupancy = CU_OCCUPANCY_INVALID;
521-
cu_count = 0;
522518
}
523519
}
524520

525-
// Adjust CU occupancy to percent.
526-
if (cu_count > 0) {
527-
proc->cu_occupancy = ((proc->cu_occupancy * 100) / cu_count);
528-
}
529-
530521
return 0;
531522
}
532523

projects/amdsmi/src/amd_smi/amd_smi_gpu_device.cc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
216216

217217
/**
218218
* Complete the process information
219+
* This is where we copy rsmi_process_info_t into the larger amdsmi_proc_info_t
220+
* Then populate the remaining fields with the gpuvsmi_get_pid_info()
221+
* TODO FIX HERE TO GRAB KFD VRAM if /proc is inconsistent
219222
*/
220223
auto get_process_info = [&](const rsmi_process_info_t& rsmi_proc_info, amdsmi_proc_info_t& asmi_proc_info) {
221224
auto status_code = gpuvsmi_get_pid_info(get_bdf(), rsmi_proc_info.process_id, asmi_proc_info);
@@ -225,6 +228,9 @@ int32_t AMDSmiGPUDevice::get_compute_process_list_impl(GPUComputeProcessList_t&
225228
asmi_proc_info.memory_usage.vram_mem = rsmi_proc_info.vram_usage;
226229
}
227230

231+
// Copy the cu occupancy from rsmi_process_info_t to amdsmi_proc_info_t
232+
asmi_proc_info.cu_occupancy = rsmi_proc_info.cu_occupancy;
233+
228234
return status_code;
229235
};
230236

0 commit comments

Comments
 (0)