|
9 | 9 | from .__types__ import Detector, Device, Devices, ManufacturerEnum |
10 | 10 | from .__utils__ import ( |
11 | 11 | PCIDevice, |
| 12 | + byte_to_mebibyte, |
12 | 13 | get_brief_version, |
13 | 14 | get_pci_devices, |
14 | 15 | get_utilization, |
@@ -141,20 +142,48 @@ def detect(self) -> Devices | None: |
141 | 142 | elif dev_gpudev_info and hasattr(dev_gpudev_info, "cu_active_number"): |
142 | 143 | dev_cores = dev_gpudev_info.cu_active_number |
143 | 144 |
|
144 | | - dev_gpu_metrics_info = pyamdsmi.amdsmi_get_gpu_metrics_info(dev) |
145 | | - dev_cores_util = dev_gpu_metrics_info.get("average_gfx_activity", 0) |
146 | | - dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev) |
147 | | - dev_mem = dev_gpu_vram_usage.get("vram_total") |
148 | | - dev_mem_used = dev_gpu_vram_usage.get("vram_used") |
149 | | - dev_temp = dev_gpu_metrics_info.get("temperature_hotspot", 0) |
150 | | - |
151 | | - dev_power_info = pyamdsmi.amdsmi_get_power_info(dev) |
152 | | - dev_power = dev_power_info.get("power_limit", 0) // 1000000 # uW to W |
153 | | - dev_power_used = ( |
154 | | - dev_power_info.get("current_socket_power") |
155 | | - if dev_power_info.get("current_socket_power", "N/A") != "N/A" |
156 | | - else dev_power_info.get("average_socket_power", 0) |
157 | | - ) |
| 145 | + dev_cores_util, dev_temp = None, None |
| 146 | + try: |
| 147 | + dev_gpu_metrics_info = pyamdsmi.amdsmi_get_gpu_metrics_info(dev) |
| 148 | + dev_cores_util = dev_gpu_metrics_info.get("average_gfx_activity", 0) |
| 149 | + dev_temp = dev_gpu_metrics_info.get("temperature_hotspot", 0) |
| 150 | + except pyamdsmi.AmdSmiException: |
| 151 | + with contextlib.suppress(pyrocmsmi.ROCMSMIError): |
| 152 | + pyrocmsmi.rsmi_init() |
| 153 | + dev_cores_util = pyrocmsmi.rsmi_dev_busy_percent_get(dev_idx) |
| 154 | + dev_temp = pyrocmsmi.rsmi_dev_temp_metric_get(dev_idx) |
| 155 | + |
| 156 | + dev_mem, dev_mem_used = None, None |
| 157 | + try: |
| 158 | + dev_gpu_vram_usage = pyamdsmi.amdsmi_get_gpu_vram_usage(dev) |
| 159 | + dev_mem = dev_gpu_vram_usage.get("vram_total") |
| 160 | + dev_mem_used = dev_gpu_vram_usage.get("vram_used") |
| 161 | + except pyamdsmi.AmdSmiException: |
| 162 | + with contextlib.suppress(pyrocmsmi.ROCMSMIError): |
| 163 | + pyrocmsmi.rsmi_init() |
| 164 | + dev_mem = byte_to_mebibyte( # byte to MiB |
| 165 | + pyrocmsmi.rsmi_dev_memory_total_get(dev_idx), |
| 166 | + ) |
| 167 | + dev_mem_used = byte_to_mebibyte( # byte to MiB |
| 168 | + pyrocmsmi.rsmi_dev_memory_usage_get(dev_idx), |
| 169 | + ) |
| 170 | + |
| 171 | + dev_power, dev_power_used = None, None |
| 172 | + try: |
| 173 | + dev_power_info = pyamdsmi.amdsmi_get_power_info(dev) |
| 174 | + dev_power = ( |
| 175 | + dev_power_info.get("power_limit", 0) // 1000000 |
| 176 | + ) # uW to W |
| 177 | + dev_power_used = ( |
| 178 | + dev_power_info.get("current_socket_power") |
| 179 | + if dev_power_info.get("current_socket_power", "N/A") != "N/A" |
| 180 | + else dev_power_info.get("average_socket_power", 0) |
| 181 | + ) |
| 182 | + except pyamdsmi.AmdSmiException: |
| 183 | + with contextlib.suppress(pyrocmsmi.ROCMSMIError): |
| 184 | + pyrocmsmi.rsmi_init() |
| 185 | + dev_power = pyrocmsmi.rsmi_dev_power_cap_get(dev_idx) |
| 186 | + dev_power_used = pyrocmsmi.rsmi_dev_power_get(dev_idx) |
158 | 187 |
|
159 | 188 | dev_compute_partition = None |
160 | 189 | with contextlib.suppress(pyamdsmi.AmdSmiException): |
|
0 commit comments