Skip to content

Commit fff2d21

Browse files
authored
SWDEV-518209: GPU Metrics 1.8 (#177)
- Updates: - Adding the following metrics to allow new calculations for violation status: - Per XCP metrics gfx_below_host_limit_ppt_acc - Per XCP metrics gfx_below_host_limit_thm_acc - Per XCP metrics gfx_low_utilization_acc - Per XCP metrics gfx_below_host_limit_total_acc - Increasing available JPEG engines to 40. Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI. Signed-off-by: Juan Castillo <juan.castillo@amd.com> Co-authored-by: Charis Poag <Charis.Poag@amd.com> [ROCm/amdsmi commit: 7c882b2]
1 parent 2f5792e commit fff2d21

File tree

9 files changed

+869
-702
lines changed

9 files changed

+869
-702
lines changed

projects/amdsmi/CHANGELOG.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,20 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
44

55
***All information listed below is for reference and subject to change.***
66

7+
## amd_smi_lib for ROCm 6.5.0
8+
9+
### Added
10+
11+
- **Added support for GPU metrics 1.8**.
12+
- Added new fields for `amdsmi_gpu_xcp_metrics_t` including:
13+
- Adding the following metrics to allow new calculations for violation status:
14+
- Per XCP metrics `gfx_below_host_limit_ppt_acc[XCP][MAX_XCC]` - GFX Clock Host limit Package Power Tracking violation counts
15+
- Per XCP metrics `gfx_below_host_limit_thm_acc[XCP][MAX_XCC]` - GFX Clock Host limit Thermal (TVIOL) violation counts
16+
- Per XCP metrics `gfx_low_utilization_acc[XCP][MAX_XCC]` - violation counts for how long low utilization caused the GPU to be below application clocks.
17+
- Per XCP metrics `gfx_below_host_limit_total_acc[XCP][MAX_XCC]` - violation counts for how long the GPU was held below application clocks by any limiter (see the new violation metrics above).
18+
- Increasing available JPEG engines to 40.
19+
Current ASICs may not support all 40; unsupported engines will be indicated as UINT16_MAX or N/A in the CLI.
20+
721
## amd_smi_lib for ROCm 6.4.0
822

923
### Added

projects/amdsmi/example/amd_smi_drm_example.cc

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1256,6 +1256,79 @@ int main() {
12561256
idx++;
12571257
}
12581258

1259+
/* New xcp stats v1.8 */
1260+
idx = 0;
1261+
idy = 0;
1262+
std::cout << "\txcp_stats.gfx_below_host_limit_ppt_acc: " << "\n";
1263+
for (auto& row : smu.xcp_stats) {
1264+
std::cout << "\t XCP [" << idx << "] : [";
1265+
for (auto& col : row.gfx_below_host_limit_ppt_acc) {
1266+
if ((idy + 1) != static_cast<int>(std::size(row.gfx_below_host_limit_ppt_acc))) {
1267+
std::cout << col << ", ";
1268+
} else {
1269+
std::cout << col;
1270+
}
1271+
idy++;
1272+
}
1273+
std::cout << "]\n";
1274+
idy = 0;
1275+
idx++;
1276+
}
1277+
1278+
idx = 0;
1279+
idy = 0;
1280+
std::cout << "\txcp_stats.gfx_below_host_limit_thm_acc: " << "\n";
1281+
for (auto& row : smu.xcp_stats) {
1282+
std::cout << "\t XCP [" << idx << "] : [";
1283+
for (auto& col : row.gfx_below_host_limit_thm_acc) {
1284+
if ((idy + 1) != static_cast<int>(std::size(row.gfx_below_host_limit_thm_acc))) {
1285+
std::cout << col << ", ";
1286+
} else {
1287+
std::cout << col;
1288+
}
1289+
idy++;
1290+
}
1291+
std::cout << "]\n";
1292+
idy = 0;
1293+
idx++;
1294+
}
1295+
1296+
idx = 0;
1297+
idy = 0;
1298+
std::cout << "\txcp_stats.gfx_low_utilization_acc: " << "\n";
1299+
for (auto& row : smu.xcp_stats) {
1300+
std::cout << "\t XCP [" << idx << "] : [";
1301+
for (auto& col : row.gfx_low_utilization_acc) {
1302+
if ((idy + 1) != static_cast<int>(std::size(row.gfx_low_utilization_acc))) {
1303+
std::cout << col << ", ";
1304+
} else {
1305+
std::cout << col;
1306+
}
1307+
idy++;
1308+
}
1309+
std::cout << "]\n";
1310+
idy = 0;
1311+
idx++;
1312+
}
1313+
1314+
idx = 0;
1315+
idy = 0;
1316+
std::cout << "\txcp_stats.gfx_below_host_limit_total_acc: " << "\n";
1317+
for (auto& row : smu.xcp_stats) {
1318+
std::cout << "\t XCP [" << idx << "] : [";
1319+
for (auto& col : row.gfx_below_host_limit_total_acc) {
1320+
if ((idy + 1) != static_cast<int>(std::size(row.gfx_below_host_limit_total_acc))) {
1321+
std::cout << col << ", ";
1322+
} else {
1323+
std::cout << col;
1324+
}
1325+
idy++;
1326+
}
1327+
std::cout << "]\n";
1328+
idy = 0;
1329+
idx++;
1330+
}
1331+
12591332
std::cout << "\n\n";
12601333
std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n";
12611334
constexpr uint16_t kMAX_ITER_TEST = 10;

projects/amdsmi/include/amd_smi/amdsmi.h

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,12 @@ typedef enum {
152152
*/
153153
#define AMDSMI_MAX_NUM_JPEG 32
154154

155+
/**
156+
* @brief New for GPU metrics v1.8. The reference document calls this NUM_JPEG_ENG_V1;
157+
* it is named AMDSMI_MAX_NUM_JPEG_ENG_V1 here for consistency with the other AMDSMI_MAX_NUM_* names.
158+
*/
159+
#define AMDSMI_MAX_NUM_JPEG_ENG_V1 40
160+
155161
/**
156162
* @brief This should match AMDSMI_MAX_NUM_XCC;
157163
* XCC - Accelerated Compute Core, the collection of compute units,
@@ -1688,9 +1694,9 @@ typedef struct {
16881694
* @brief v1.6 additions
16891695
* The max uint32_t will be used if that information is N/A
16901696
*/
1691-
uint32_t gfx_busy_inst[AMDSMI_MAX_NUM_XCC]; //!< Utilization Instantaneous in %
1692-
uint16_t jpeg_busy[AMDSMI_MAX_NUM_JPEG]; //!< Utilization Instantaneous in %
1693-
uint16_t vcn_busy[AMDSMI_MAX_NUM_VCN]; //!< Utilization Instantaneous in %
1697+
uint32_t gfx_busy_inst[AMDSMI_MAX_NUM_XCC]; //!< Utilization Instantaneous in %
1698+
uint16_t jpeg_busy[AMDSMI_MAX_NUM_JPEG_ENG_V1]; //!< Utilization Instantaneous in % (UPDATED: to 40 in v1.8)
1699+
uint16_t vcn_busy[AMDSMI_MAX_NUM_VCN]; //!< Utilization Instantaneous in %
16941700

16951701
uint64_t gfx_busy_acc[AMDSMI_MAX_NUM_XCC]; //!< Utilization Accumulated in %
16961702

@@ -1699,6 +1705,17 @@ typedef struct {
16991705
*/
17001706
/* Total App Clock Counter Accumulated */
17011707
uint64_t gfx_below_host_limit_acc[AMDSMI_MAX_NUM_XCC]; //!< Total App Clock Counter Accumulated
1708+
1709+
/**
1710+
* @brief v1.8 additions
1711+
*/
1712+
/* Total App Clock Counter Accumulated */
1713+
uint64_t gfx_below_host_limit_ppt_acc[AMDSMI_MAX_NUM_XCC];
1714+
uint64_t gfx_below_host_limit_thm_acc[AMDSMI_MAX_NUM_XCC];
1715+
uint64_t gfx_low_utilization_acc[AMDSMI_MAX_NUM_XCC];
1716+
uint64_t gfx_below_host_limit_total_acc[AMDSMI_MAX_NUM_XCC];
1717+
1718+
17021719
} amdsmi_gpu_xcp_metrics_t;
17031720

17041721
/**
@@ -1889,7 +1906,7 @@ typedef struct {
18891906

18901907
uint32_t pcie_lc_perf_other_end_recovery; //!< PCIE other end recovery counter
18911908

1892-
/*
1909+
/**
18931910
* @brief v1.7 additions
18941911
*/
18951912
uint64_t vram_max_bandwidth; //!< VRAM max bandwidth at max memory clock (GB/s)

projects/amdsmi/py-interface/amdsmi_interface.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4200,6 +4200,10 @@ def amdsmi_get_gpu_metrics_info(
42004200
"xcp_stats.vcn_busy": list(gpu_metrics.xcp_stats),
42014201
"xcp_stats.gfx_busy_acc": list(gpu_metrics.xcp_stats),
42024202
"xcp_stats.gfx_below_host_limit_acc": list(gpu_metrics.xcp_stats),
4203+
"xcp_stats.gfx_below_host_limit_ppt_acc": list(gpu_metrics.xcp_stats),
4204+
"xcp_stats.gfx_below_host_limit_thm_acc": list(gpu_metrics.xcp_stats),
4205+
"xcp_stats.gfx_low_utilization_acc": list(gpu_metrics.xcp_stats),
4206+
"xcp_stats.gfx_below_host_limit_total_acc": list(gpu_metrics.xcp_stats),
42034207
"pcie_lc_perf_other_end_recovery": _validate_if_max_uint(gpu_metrics.pcie_lc_perf_other_end_recovery, MaxUIntegerTypes.UINT32_T),
42044208
"vram_max_bandwidth": _validate_if_max_uint(gpu_metrics.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T),
42054209
"xgmi_link_status": _validate_if_max_uint(list(gpu_metrics.xgmi_link_status), MaxUIntegerTypes.UINT16_T),
@@ -4231,15 +4235,40 @@ def amdsmi_get_gpu_metrics_info(
42314235
for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_busy_acc']):
42324236
xcp_detail = []
42334237
for val in xcp_metrics.gfx_busy_acc:
4234-
xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True))
4238+
xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T))
42354239
gpu_metrics_output["xcp_stats.gfx_busy_acc"][xcp_index] = xcp_detail
42364240

42374241
if 'xcp_stats.gfx_below_host_limit_acc' in gpu_metrics_output:
42384242
for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_acc']):
42394243
xcp_detail = []
42404244
for val in xcp_metrics.gfx_below_host_limit_acc:
4241-
xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True))
4245+
xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T))
42424246
gpu_metrics_output['xcp_stats.gfx_below_host_limit_acc'][xcp_index] = xcp_detail
4247+
# new for gpu metrics v1.8
4248+
if 'xcp_stats.gfx_below_host_limit_ppt_acc' in gpu_metrics_output:
4249+
for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_ppt_acc']):
4250+
xcp_detail = []
4251+
for val in xcp_metrics.gfx_below_host_limit_ppt_acc:
4252+
xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T))
4253+
gpu_metrics_output['xcp_stats.gfx_below_host_limit_ppt_acc'][xcp_index] = xcp_detail
4254+
if 'xcp_stats.gfx_below_host_limit_thm_acc' in gpu_metrics_output:
4255+
for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_thm_acc']):
4256+
xcp_detail = []
4257+
for val in xcp_metrics.gfx_below_host_limit_thm_acc:
4258+
xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T))
4259+
gpu_metrics_output['xcp_stats.gfx_below_host_limit_thm_acc'][xcp_index] = xcp_detail
4260+
if 'xcp_stats.gfx_low_utilization_acc' in gpu_metrics_output:
4261+
for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_low_utilization_acc']):
4262+
xcp_detail = []
4263+
for val in xcp_metrics.gfx_low_utilization_acc:
4264+
xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T))
4265+
gpu_metrics_output['xcp_stats.gfx_low_utilization_acc'][xcp_index] = xcp_detail
4266+
if 'xcp_stats.gfx_below_host_limit_total_acc' in gpu_metrics_output:
4267+
for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc']):
4268+
xcp_detail = []
4269+
for val in xcp_metrics.gfx_below_host_limit_total_acc:
4270+
xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T))
4271+
gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc'][xcp_index] = xcp_detail
42434272
return gpu_metrics_output
42444273

42454274

projects/amdsmi/py-interface/amdsmi_wrapper.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1849,10 +1849,14 @@ class struct_amdsmi_gpu_xcp_metrics_t(Structure):
18491849
struct_amdsmi_gpu_xcp_metrics_t._pack_ = 1 # source:False
18501850
struct_amdsmi_gpu_xcp_metrics_t._fields_ = [
18511851
('gfx_busy_inst', ctypes.c_uint32 * 8),
1852-
('jpeg_busy', ctypes.c_uint16 * 32),
1852+
('jpeg_busy', ctypes.c_uint16 * 40),
18531853
('vcn_busy', ctypes.c_uint16 * 4),
18541854
('gfx_busy_acc', ctypes.c_uint64 * 8),
18551855
('gfx_below_host_limit_acc', ctypes.c_uint64 * 8),
1856+
('gfx_below_host_limit_ppt_acc', ctypes.c_uint64 * 8),
1857+
('gfx_below_host_limit_thm_acc', ctypes.c_uint64 * 8),
1858+
('gfx_low_utilization_acc', ctypes.c_uint64 * 8),
1859+
('gfx_below_host_limit_total_acc', ctypes.c_uint64 * 8),
18561860
]
18571861

18581862
amdsmi_gpu_xcp_metrics_t = struct_amdsmi_gpu_xcp_metrics_t

projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1068,6 +1068,11 @@ typedef struct metrics_table_header_t metrics_table_header_t;
10681068
*/
10691069
#define RSMI_MAX_NUM_JPEG_ENGS 32
10701070

1071+
/**
1072+
* @brief This should match kRSMI_MAX_NUM_JPEG_ENG_V1
1073+
*/
1074+
#define RSMI_MAX_NUM_JPEG_ENG_V1 40
1075+
10711076
/**
10721077
* @brief This should match kRSMI_MAX_NUM_CLKS
10731078
*/
@@ -1115,7 +1120,7 @@ struct amdgpu_xcp_metrics_t {
11151120
*/
11161121
/* Utilization Instantaneous (%) */
11171122
uint32_t gfx_busy_inst[RSMI_MAX_NUM_XCC];
1118-
uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENGS];
1123+
uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENG_V1];
11191124
uint16_t vcn_busy[RSMI_MAX_NUM_VCNS];
11201125

11211126
/* Utilization Accumulated (%) */
@@ -1126,6 +1131,14 @@ struct amdgpu_xcp_metrics_t {
11261131
*/
11271132
/* Total App Clock Counter Accumulated */
11281133
uint64_t gfx_below_host_limit_acc[RSMI_MAX_NUM_XCC];
1134+
1135+
/**
1136+
* v1.8 additions
1137+
*/
1138+
uint64_t gfx_below_host_limit_ppt_acc[RSMI_MAX_NUM_XCC];
1139+
uint64_t gfx_below_host_limit_thm_acc[RSMI_MAX_NUM_XCC];
1140+
uint64_t gfx_low_utilization_acc[RSMI_MAX_NUM_XCC];
1141+
uint64_t gfx_below_host_limit_total_acc[RSMI_MAX_NUM_XCC];
11291142
};
11301143

11311144
typedef struct {

0 commit comments

Comments
 (0)