Skip to content

Commit b668063

Browse files
miketxli authored and bgoglin committed
Add XGMI Info to RSMI plugin
Change-Id: I95524183adee7730a53ec505505045e480a75132 Signed-off-by: Mike Li <[email protected]>
1 parent cd03c01 commit b668063

File tree

2 files changed

+79
-1
lines changed

2 files changed

+79
-1
lines changed

hwloc/topology-rsmi.c

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,54 @@ static int get_device_serial_number(uint32_t dv_ind, char *serial, unsigned int
117117
return 0;
118118
}
119119

120+
/*
121+
* Get the XGMI hive id of the GPU
122+
*
123+
* dv_ind (IN) The device index
124+
* hive_id (OUT) The XGMI hive id of GPU devices
125+
*/
126+
static int get_device_xgmi_hive_id(uint32_t dv_ind, char *buffer)
127+
{
128+
uint64_t hive_id;
129+
rsmi_status_t rsmi_rc = rsmi_dev_xgmi_hive_id_get(dv_ind, &hive_id);
130+
131+
if (rsmi_rc != RSMI_STATUS_SUCCESS) {
132+
if (!hwloc_hide_errors()) {
133+
const char *status_string;
134+
rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
135+
fprintf(stderr, "RSMI: GPU(%u): Failed to get hive id: %s\n", (unsigned)dv_ind, status_string);
136+
}
137+
return -1;
138+
}
139+
sprintf(buffer, "%lx", hive_id);
140+
return 0;
141+
}
142+
143+
/*
144+
* Get the IO Link type of the GPU
145+
*
146+
* dv_ind_src (IN) The source device index
147+
* dv_ind_dst (IN) The destination device index
148+
* type (OUT) The type of IO Link
149+
*/
150+
static int get_device_io_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst,
151+
RSMI_IO_LINK_TYPE *type)
152+
{
153+
uint64_t hops;
154+
rsmi_status_t rsmi_rc = rsmi_topo_get_link_type(dv_ind_src, dv_ind_dst,
155+
&hops, type);
156+
157+
if (rsmi_rc != RSMI_STATUS_SUCCESS) {
158+
if (!hwloc_hide_errors()) {
159+
const char *status_string;
160+
rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
161+
fprintf(stderr, "RSMI: GPU(%u): Failed to get link type: %s\n", (unsigned)dv_ind_src, status_string);
162+
}
163+
return -1;
164+
}
165+
return 0;
166+
}
167+
120168
static int
121169
hwloc_rsmi_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dstatus)
122170
{
@@ -131,7 +179,7 @@ hwloc_rsmi_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dst
131179
rsmi_version_t version;
132180
rsmi_status_t ret;
133181
int may_shutdown;
134-
unsigned nb, i;
182+
unsigned nb, i, j;
135183

136184
assert(dstatus->phase == HWLOC_DISC_PHASE_IO);
137185

@@ -166,6 +214,8 @@ hwloc_rsmi_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dst
166214
uint64_t bdfid = 0;
167215
hwloc_obj_t osdev, parent;
168216
char buffer[64];
217+
char *xgmi_peers, *xgmi_peers_ptr;
218+
RSMI_IO_LINK_TYPE type;
169219

170220
osdev = hwloc_alloc_setup_object(topology, HWLOC_OBJ_OS_DEVICE, HWLOC_UNKNOWN_INDEX);
171221
snprintf(buffer, sizeof(buffer), "rsmi%u", i);
@@ -189,6 +239,27 @@ hwloc_rsmi_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dst
189239
if (get_device_unique_id(i, buffer) == 0)
190240
hwloc_obj_add_info(osdev, "AMDUUID", buffer);
191241

242+
buffer[0] = '\0';
243+
if (get_device_xgmi_hive_id(i, buffer) == 0)
244+
hwloc_obj_add_info(osdev, "XGMIHiveID", buffer);
245+
246+
xgmi_peers = malloc(nb*15+1); /* "rsmi" + unsigned int + space = 15 chars max, + ending \0 */
247+
if (xgmi_peers) {
248+
xgmi_peers[0] = '\0';
249+
xgmi_peers_ptr = xgmi_peers;
250+
for (j=0; j<nb; j++) {
251+
if (i == j)
252+
continue;
253+
if ((get_device_io_link_type(i, j, &type) == 0) &&
254+
(type == RSMI_IOLINK_TYPE_XGMI)) {
255+
xgmi_peers_ptr += sprintf(xgmi_peers_ptr, "rsmi%u ", j);
256+
}
257+
if (xgmi_peers[0] != '\0')
258+
hwloc_obj_add_info(osdev, "XGMIPeers", xgmi_peers);
259+
}
260+
free(xgmi_peers);
261+
}
262+
192263
parent = NULL;
193264
if (get_device_pci_info(i, &bdfid) == 0) {
194265
unsigned domain, device, bus, func;

tests/hwloc/ports/include/rsmi/rocm_smi/rocm_smi.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ typedef int rsmi_status_t;
1818

1919
typedef struct { int major; int minor; int patch; } rsmi_version_t;
2020

21+
typedef int RSMI_IO_LINK_TYPE;
22+
#define RSMI_IOLINK_TYPE_XGMI 2
23+
2124
#define RSMI_MAX_NUM_FREQUENCIES 32
2225

2326
/**
@@ -70,5 +73,9 @@ rsmi_status_t rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len);
7073
rsmi_status_t rsmi_dev_serial_number_get(uint32_t dv_ind, char *serial_num, uint32_t len);
7174
rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id);
7275
rsmi_status_t rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *bandwidth);
76+
rsmi_status_t rsmi_dev_xgmi_hive_id_get(uint32_t dv_ind, uint64_t *hive_id);
77+
rsmi_status_t rsmi_topo_get_link_type(uint32_t dv_ind_src, uint32_t dv_ind_dst,
78+
uint64_t *hops, RSMI_IO_LINK_TYPE *type);
79+
7380

7481
#endif /* HWLOC_PORT_RSMI_RSMI_H */

0 commit comments

Comments
 (0)