@@ -117,6 +117,54 @@ static int get_device_serial_number(uint32_t dv_ind, char *serial, unsigned int
117
117
return 0 ;
118
118
}
119
119
120
+ /*
121
+ * Get the XGMI hive id of the GPU
122
+ *
123
+ * dv_ind (IN) The device index
124
+ * hive_id (OUT) The XGMI hive id of GPU devices
125
+ */
126
+ static int get_device_xgmi_hive_id (uint32_t dv_ind , char * buffer )
127
+ {
128
+ uint64_t hive_id ;
129
+ rsmi_status_t rsmi_rc = rsmi_dev_xgmi_hive_id_get (dv_ind , & hive_id );
130
+
131
+ if (rsmi_rc != RSMI_STATUS_SUCCESS ) {
132
+ if (!hwloc_hide_errors ()) {
133
+ const char * status_string ;
134
+ rsmi_rc = rsmi_status_string (rsmi_rc , & status_string );
135
+ fprintf (stderr , "RSMI: GPU(%u): Failed to get hive id: %s\n" , (unsigned )dv_ind , status_string );
136
+ }
137
+ return -1 ;
138
+ }
139
+ sprintf (buffer , "%lx" , hive_id );
140
+ return 0 ;
141
+ }
142
+
143
+ /*
144
+ * Get the IO Link type of the GPU
145
+ *
146
+ * dv_ind_src (IN) The source device index
147
+ * dv_ind_dst (IN) The destination device index
148
+ * type (OUT) The type of IO Link
149
+ */
150
+ static int get_device_io_link_type (uint32_t dv_ind_src , uint32_t dv_ind_dst ,
151
+ RSMI_IO_LINK_TYPE * type )
152
+ {
153
+ uint64_t hops ;
154
+ rsmi_status_t rsmi_rc = rsmi_topo_get_link_type (dv_ind_src , dv_ind_dst ,
155
+ & hops , type );
156
+
157
+ if (rsmi_rc != RSMI_STATUS_SUCCESS ) {
158
+ if (!hwloc_hide_errors ()) {
159
+ const char * status_string ;
160
+ rsmi_rc = rsmi_status_string (rsmi_rc , & status_string );
161
+ fprintf (stderr , "RSMI: GPU(%u): Failed to get link type: %s\n" , (unsigned )dv_ind_src , status_string );
162
+ }
163
+ return -1 ;
164
+ }
165
+ return 0 ;
166
+ }
167
+
120
168
static int
121
169
hwloc_rsmi_discover (struct hwloc_backend * backend , struct hwloc_disc_status * dstatus )
122
170
{
@@ -131,7 +179,7 @@ hwloc_rsmi_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dst
131
179
rsmi_version_t version ;
132
180
rsmi_status_t ret ;
133
181
int may_shutdown ;
134
- unsigned nb , i ;
182
+ unsigned nb , i , j ;
135
183
136
184
assert (dstatus -> phase == HWLOC_DISC_PHASE_IO );
137
185
@@ -166,6 +214,8 @@ hwloc_rsmi_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dst
166
214
uint64_t bdfid = 0 ;
167
215
hwloc_obj_t osdev , parent ;
168
216
char buffer [64 ];
217
+ char * xgmi_peers , * xgmi_peers_ptr ;
218
+ RSMI_IO_LINK_TYPE type ;
169
219
170
220
osdev = hwloc_alloc_setup_object (topology , HWLOC_OBJ_OS_DEVICE , HWLOC_UNKNOWN_INDEX );
171
221
snprintf (buffer , sizeof (buffer ), "rsmi%u" , i );
@@ -189,6 +239,27 @@ hwloc_rsmi_discover(struct hwloc_backend *backend, struct hwloc_disc_status *dst
189
239
if (get_device_unique_id (i , buffer ) == 0 )
190
240
hwloc_obj_add_info (osdev , "AMDUUID" , buffer );
191
241
242
+ buffer [0 ] = '\0' ;
243
+ if (get_device_xgmi_hive_id (i , buffer ) == 0 )
244
+ hwloc_obj_add_info (osdev , "XGMIHiveID" , buffer );
245
+
246
+ xgmi_peers = malloc (nb * 15 + 1 ); /* "rsmi" + unsigned int + space = 15 chars max, + ending \0 */
247
+ if (xgmi_peers ) {
248
+ xgmi_peers [0 ] = '\0' ;
249
+ xgmi_peers_ptr = xgmi_peers ;
250
+ for (j = 0 ; j < nb ; j ++ ) {
251
+ if (i == j )
252
+ continue ;
253
+ if ((get_device_io_link_type (i , j , & type ) == 0 ) &&
254
+ (type == RSMI_IOLINK_TYPE_XGMI )) {
255
+ xgmi_peers_ptr += sprintf (xgmi_peers_ptr , "rsmi%u " , j );
256
+ }
257
+ if (xgmi_peers [0 ] != '\0' )
258
+ hwloc_obj_add_info (osdev , "XGMIPeers" , xgmi_peers );
259
+ }
260
+ free (xgmi_peers );
261
+ }
262
+
192
263
parent = NULL ;
193
264
if (get_device_pci_info (i , & bdfid ) == 0 ) {
194
265
unsigned domain , device , bus , func ;
0 commit comments