1111from .__utils__ import (
1212 PCIDevice ,
1313 byte_to_mebibyte ,
14+ compare_pci_devices ,
1415 get_brief_version ,
1516 get_numa_node_by_bdf ,
1617 get_pci_devices ,
@@ -217,7 +218,7 @@ def detect(self) -> Devices | None:
217218
218219 with contextlib .suppress (pyamdsmi .AmdSmiException ):
219220 dev_bdf = pyamdsmi .amdsmi_get_gpu_device_bdf (dev )
220- dev_appendix ["bdf" ] = str ( dev_bdf ). lower ()
221+ dev_appendix ["bdf" ] = dev_bdf
221222
222223 ret .append (
223224 Device (
@@ -287,20 +288,48 @@ def get_device_handle(dev: Device):
287288 return devs_mapping .get (dev .index )
288289
289290 try :
291+ pci_devices = self .detect_pci_devices ()
292+
def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
    """
    Translate the relationship of two PCI devices into a topology distance.

    Each BDF is looked up in the enclosing ``pci_devices`` mapping (an
    unknown BDF resolves to ``None``), and the resulting pair is scored
    with ``compare_pci_devices``.

    Args:
        bdf_a:
            The BDF of the first PCI device.
        bdf_b:
            The BDF of the second PCI device.

    Returns:
        ``TopologyDistanceEnum.PIX`` when the score is positive,
        ``TopologyDistanceEnum.PXB`` when it is zero,
        and ``TopologyDistanceEnum.PHB`` otherwise.

    """
    first_device = pci_devices.get(bdf_a)
    second_device = pci_devices.get(bdf_b)
    score = compare_pci_devices(first_device, second_device)

    if score > 0:
        distance = TopologyDistanceEnum.PIX
    elif score == 0:
        distance = TopologyDistanceEnum.PXB
    else:
        distance = TopologyDistanceEnum.PHB
    return distance
316+
290317 pyamdsmi .amdsmi_init ()
291318
319+ # Get NUMA and CPU affinities.
292320 for i , dev_i in enumerate (devices ):
293- dev_i_handle = get_device_handle (dev_i )
294-
295321 # Get affinity with PCIe BDF if possible.
296322 if dev_i_bdf := dev_i .appendix .get ("bdf" , "" ):
297- numa_node = get_numa_node_by_bdf (dev_i_bdf )
298- topology .devices_numa_affinities [i ] = numa_node
323+ topology .devices_numa_affinities [i ] = get_numa_node_by_bdf (
324+ dev_i_bdf ,
325+ )
299326 topology .devices_cpu_affinities [i ] = map_numa_node_to_cpu_affinity (
300- numa_node ,
327+ topology . devices_numa_affinities [ i ] ,
301328 )
302329 # Otherwise, get affinity via AMD SMI.
303330 if not topology .devices_cpu_affinities [i ]:
331+ dev_i_handle = get_device_handle (dev_i )
332+
304333 # Get NUMA affinity.
305334 try :
306335 dev_i_numa_node = pyamdsmi .amdsmi_topo_get_numa_node_number (
@@ -315,41 +344,50 @@ def get_device_handle(dev: Device):
315344 )
316345 # Get CPU affinity.
317346 topology .devices_cpu_affinities [i ] = map_numa_node_to_cpu_affinity (
318- numa_node = topology .devices_numa_affinities [i ],
347+ topology .devices_numa_affinities [i ],
319348 )
320349
321- # Get distances to other devices.
350+ # Get distances to other devices.
351+ for i , dev_i in enumerate (devices ):
352+ dev_i_handle = get_device_handle (dev_i )
353+
322354 for j , dev_j in enumerate (devices ):
323- if i == j :
324- continue
325- if topology .devices_distances [i ][j ] != 0 :
355+ if (
356+ dev_i .index == dev_j .index
357+ or topology .devices_distances [i ][j ] != 0
358+ ):
326359 continue
327360
328361 dev_j_handle = get_device_handle (dev_j )
329362
330363 distance = TopologyDistanceEnum .UNK
331364 try :
332- link_type = pyamdsmi .amdsmi_topo_get_link_type (
365+ link = pyamdsmi .amdsmi_topo_get_link_type (
333366 dev_i_handle ,
334367 dev_j_handle ,
335368 )
336- match int (link_type .type ):
369+ link_type = link .get ("type" , - 1 )
370+ link_hops = link .get ("hops" , - 1 )
371+ match link_type :
337372 case pyamdsmi .AMDSMI_LINK_TYPE_INTERNAL :
338373 distance = TopologyDistanceEnum .SELF
339- case pyamdsmi .AMDSMI_LINK_TYPE_XGMI :
340- distance = TopologyDistanceEnum .LINK
341- # For PCIe links,
342- # further distinguish between PHB and SYS based on NUMA affinity.
343374 case pyamdsmi .AMDSMI_LINK_TYPE_PCIE :
344- if link_type == pyamdsmi .AMDSMI_LINK_TYPE_PCIE :
345- dev_i_numa , dev_j_numa = (
346- topology .devices_numa_affinities [i ],
347- topology .devices_numa_affinities [j ],
375+ dev_i_numa , dev_j_numa = (
376+ topology .devices_numa_affinities [i ],
377+ topology .devices_numa_affinities [j ],
378+ )
379+ if dev_i_numa and dev_i_numa == dev_j_numa :
380+ distance = distance_pci_devices (
381+ dev_i .appendix .get ("bdf" , "" ),
382+ dev_j .appendix .get ("bdf" , "" ),
348383 )
349- if dev_i_numa != "" and dev_i_numa == dev_j_numa :
350- distance = TopologyDistanceEnum .PHB
351- else :
352- distance = TopologyDistanceEnum .SYS
384+ else :
385+ distance = TopologyDistanceEnum .SYS
386+ case pyamdsmi .AMDSMI_LINK_TYPE_XGMI :
387+ distance = TopologyDistanceEnum .LINK
388+ case _:
389+ if link_hops == 0 :
390+ distance = TopologyDistanceEnum .SELF
353391 except pyamdsmi .AmdSmiException :
354392 debug_log_exception (
355393 logger ,
@@ -361,7 +399,7 @@ def get_device_handle(dev: Device):
361399 topology .devices_distances [i ][j ] = distance
362400 topology .devices_distances [j ][i ] = distance
363401 except pyamdsmi .AmdSmiException :
364- debug_log_exception (logger , "Failed to get topology" )
402+ debug_log_exception (logger , "Failed to fetch topology" )
365403 raise
366404 except Exception :
367405 debug_log_exception (logger , "Failed to process topology fetching" )
0 commit comments