33import contextlib
44import logging
55from functools import lru_cache
6+ from pathlib import Path
67
78from .. import envs
89from ..logging import debug_log_exception , debug_log_warning
@@ -102,47 +103,47 @@ def detect(self) -> Devices | None:
102103 dev_index = dev_idx
103104
104105 dev_gpu_asic_info = pyamdsmi .amdsmi_get_gpu_asic_info (dev )
105- dev_uuid = f"GPU-{ (dev_gpu_asic_info .get ('asic_serial' )[2 :]).lower ()} "
106+ if dev_gpu_asic_info .get ("asic_serial" ) != "N/A" :
107+ asic_serial = dev_gpu_asic_info .get ("asic_serial" )
108+ dev_uuid = f"GPU-{ (asic_serial [2 :]).lower ()} "
109+ else :
110+ dev_uuid = f"GPU-{ pyrocmsmi .rsmi_dev_unique_id_get (dev_idx )[2 :]} "
106111 dev_hsa_agent = hsa_agents .get (dev_uuid )
107112
108- dev_card_id = None
109- if dev_hsa_agent :
110- dev_card_id = dev_hsa_agent .driver_node_id
111- elif hasattr (pyamdsmi , "amdsmi_get_gpu_kfd_info" ):
112- dev_kfd_info = pyamdsmi .amdsmi_get_gpu_kfd_info (dev )
113- dev_card_id = dev_kfd_info .get ("node_id" )
114- else :
113+ dev_gpu_driver_info = pyamdsmi .amdsmi_get_gpu_driver_info (dev )
114+ dev_driver_ver = dev_gpu_driver_info .get ("driver_version" )
115+
116+ dev_name = dev_hsa_agent .name
117+ if not dev_name :
118+ dev_name = dev_gpu_asic_info .get ("market_name" )
119+
120+ dev_cc = dev_hsa_agent .compute_capability
121+ if not dev_cc :
115122 with contextlib .suppress (pyrocmsmi .ROCMSMIError ):
116- dev_card_id = pyrocmsmi .rsmi_dev_node_id_get (dev_idx )
123+ dev_cc = pyrocmsmi .rsmi_dev_target_graphics_version_get (dev_idx )
117124
118- dev_gpudev_info = None
119- if dev_card_id is not None :
125+ dev_bdf = None
126+ dev_card_id = None
127+ dev_renderd_id = None
128+ with contextlib .suppress (pyamdsmi .AmdSmiException ):
129+ dev_bdf = pyamdsmi .amdsmi_get_gpu_device_bdf (dev )
130+ dev_card_id , dev_renderd_id = _get_card_and_renderd_id (dev_bdf )
131+
132+ dev_cores = dev_hsa_agent .compute_units
133+ dev_asic_family_id = dev_hsa_agent .asic_family_id
134+ if (
135+ not dev_cores or not dev_asic_family_id
136+ ) and dev_card_id is not None :
120137 with contextlib .suppress (pyamdgpu .AMDGPUError ):
121138 _ , _ , dev_gpudev = pyamdgpu .amdgpu_device_initialize (
122139 dev_card_id ,
123140 )
124141 dev_gpudev_info = pyamdgpu .amdgpu_query_gpu_info (dev_gpudev )
125142 pyamdgpu .amdgpu_device_deinitialize (dev_gpudev )
126-
127- dev_gpu_driver_info = pyamdsmi .amdsmi_get_gpu_driver_info (dev )
128- dev_driver_ver = dev_gpu_driver_info .get ("driver_version" )
129-
130- if dev_hsa_agent :
131- dev_name = dev_hsa_agent .name
132- if not dev_name :
133- dev_name = dev_gpu_asic_info .get ("market_name" )
134- dev_cc = dev_hsa_agent .compute_capability
135- else :
136- dev_name = dev_gpu_asic_info .get ("market_name" )
137- dev_cc = None
138- with contextlib .suppress (pyrocmsmi .ROCMSMIError ):
139- dev_cc = pyrocmsmi .rsmi_dev_target_graphics_version_get (dev_idx )
140-
141- dev_cores = None
142- if dev_hsa_agent :
143- dev_cores = dev_hsa_agent .compute_units
144- elif dev_gpudev_info and hasattr (dev_gpudev_info , "cu_active_number" ):
145- dev_cores = dev_gpudev_info .cu_active_number
143+ if not dev_cores :
144+ dev_cores = dev_gpudev_info .cu_active_number
145+ if not dev_asic_family_id :
146+ dev_asic_family_id = dev_gpudev_info .family_id
146147
147148 dev_cores_util = None
148149 dev_temp = None
@@ -201,24 +202,22 @@ def detect(self) -> Devices | None:
201202 )
202203
203204 dev_appendix = {
204- "arch_family" : _get_arch_family (dev_gpudev_info ),
205+ "arch_family" : _get_arch_family (dev_asic_family_id ),
205206 "vgpu" : dev_compute_partition is not None ,
206- "card_id" : dev_card_id ,
207207 }
208+ if dev_bdf :
209+ dev_appendix ["bdf" ] = dev_bdf
210+ if dev_card_id is not None :
211+ dev_appendix ["card_id" ] = dev_card_id
212+ if dev_renderd_id is not None :
213+ dev_appendix ["renderd_id" ] = dev_renderd_id
208214
209215 with contextlib .suppress (pyamdsmi .AmdSmiException ):
210216 dev_xgmi = pyamdsmi .amdsmi_get_xgmi_info (dev )
211- for k in [
212- "xgmi_lanes" ,
213- "xgmi_hive_id" ,
214- "xgmi_node_id" ,
215- ]:
216- if value := dev_xgmi .get (k ):
217- dev_appendix [k ] = value
218-
219- with contextlib .suppress (pyamdsmi .AmdSmiException ):
220- dev_bdf = pyamdsmi .amdsmi_get_gpu_device_bdf (dev )
221- dev_appendix ["bdf" ] = dev_bdf
217+ if xgmi_lanes := dev_xgmi .get ("xgmi_lanes" , None ):
218+ dev_appendix ["xgmi_lanes" ] = xgmi_lanes
219+ dev_appendix ["xgmi_hive_id" ] = dev_xgmi .get ("xgmi_hive_id" )
220+ dev_appendix ["xgmi_node_id" ] = dev_xgmi .get ("xgmi_node_id" )
222221
223222 ret .append (
224223 Device (
@@ -410,15 +409,20 @@ def distance_pci_devices(bdf_a: str, bdf_b: str) -> TopologyDistanceEnum:
410409 return topology
411410
412411
413- def _get_arch_family (
414- dev_gpudev_info : pyamdgpu .c_amdgpu_gpu_info | None ,
415- ) -> str | None :
416- if not dev_gpudev_info :
417- return None
412+ def _get_arch_family (dev_family_id : int | None ) -> str :
413+ """
414+ Get the architecture family name from the device family ID.
415+
416+ Args:
417+ dev_family_id:
418+ The device family ID.
419+
420+ Returns:
421+ The architecture family as string.
418422
419- family_id = dev_gpudev_info . family_id
420- if family_id is None :
421- return None
423+ """
424+ if dev_family_id is None :
425+ return "Unknown"
422426
423427 arch_family = {
424428 pyamdgpu .AMDGPU_FAMILY_SI : "Southern Islands" ,
@@ -439,4 +443,29 @@ def _get_arch_family(
439443 pyamdgpu .AMDGPU_FAMILY_GC_12_0_0 : "GC 12.0.0" ,
440444 }
441445
442- return arch_family .get (family_id , "Unknown" )
446+ return arch_family .get (dev_family_id , "Unknown" )
447+
448+
449+ def _get_card_and_renderd_id (dev_bdf : str ) -> tuple [int | None , int | None ]:
450+ """
451+ Get the card ID and renderD ID for a given device bdf.
452+
453+ Args:
454+ dev_bdf:
455+ The device bdf.
456+
457+ Returns:
458+ A tuple of (card_id, renderd_id).
459+
460+ """
461+ card_id = None
462+ renderd_id = None
463+ drm_path = Path (f"/sys/module/amdgpu/drivers/pci:amdgpu/{ dev_bdf } /drm" )
464+ if drm_path .exists ():
465+ for dir_path in drm_path .iterdir ():
466+ if dir_path .name .startswith ("card" ):
467+ card_id = int (dir_path .name [4 :])
468+ elif dir_path .name .startswith ("renderD" ):
469+ renderd_id = int (dir_path .name [7 :])
470+
471+ return card_id , renderd_id
0 commit comments