Skip to content

Commit 85db4e4

Browse files
bkanango and marifamd authored
[ROCM] Enhance amd-smi node to display baseboard temp (#2943)
Added the `amdsmi_get_device_handle_from_node` API to retrieve a device handle from a node handle. Enhanced the `amd-smi node` command to display baseboard temperatures.
Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
Co-authored-by: Maisam Arif <Maisam.Arif@amd.com>
1 parent 67a57d9 commit 85db4e4

File tree

8 files changed

+240
-48
lines changed

8 files changed

+240
-48
lines changed

projects/amdsmi/CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,21 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
88

99
### Added
1010

11+
- **Added `amdsmi_get_device_handle_from_node` API**.
12+
- Added C API function to retrieve a device handle from a node handle.
13+
- Provides inverse functionality to `amdsmi_get_node_handle`.
14+
- Added a Python binding for the API and exported it in `py-interface/__init__.py` for public API access.
15+
- Returns `AMDSMI_STATUS_SUCCESS` on success, `AMDSMI_STATUS_NOT_FOUND` if no matching device is found.
16+
17+
- **Enhanced `amd-smi node` command to display baseboard temperatures**.
18+
- Added `--base-board-temps` / `-b` option to display baseboard temperature sensors.
19+
- Selective display: Use `-p` for NPM only, `-b` for Baseboard only.
20+
- Default behavior (no flags): Shows both power management and baseboard temperatures.
21+
22+
## amd_smi_lib for ROCm 7.11.0
23+
24+
### Added
25+
1126
- **Added `--hex` flag to `amd-smi bad-pages` command**.
1227
- Added `--hex` option to display page addresses and sizes in hexadecimal format with `0x` prefix
1328

projects/amdsmi/amdsmi_cli/amdsmi_commands.py

Lines changed: 77 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7489,65 +7489,114 @@ def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
74897489
time.sleep(1)
74907490

74917491

7492-
def node(self, args, multiple_devices=False, nodes=None, power_management=None):
7492+
def node(self, args, multiple_devices=False, nodes=None, power_management=None, base_board_temps=None):
74937493
"""List node informations
74947494
74957495
Args:
74967496
args (Namespace): Namespace containing the parsed CLI args
74977497
multiple_devices (bool, optional): True if checking for multiple devices.
74987498
Defaults to False.
7499+
nodes (node_handle, optional): node_handle for target node. Defaults to None.
7500+
power_management (bool, optional): Value override for args.power_management. Defaults to None.
7501+
base_board_temps (bool, optional): Value override for args.base_board_temps. Defaults to None.
74997502
75007503
Returns:
75017504
None: Print output via AMDSMILogger to destination
75027505
"""
75037506
# Set args.* to passed in arguments
75047507
if nodes:
75057508
args.nodes = nodes
7506-
if power_management:
7507-
args.power_management = power_management
7509+
# Store args that are applicable to the current platform
7510+
current_platform_args = ["power_management", "base_board_temps"]
7511+
7512+
# Check if any node-specific options were passed via command line
7513+
current_platform_values = []
7514+
if args.power_management:
7515+
current_platform_values += [args.power_management]
7516+
if args.base_board_temps:
7517+
current_platform_values += [args.base_board_temps]
7518+
7519+
# If no node options are passed, enable all by default
7520+
if not any(current_platform_values):
7521+
for arg in current_platform_args:
7522+
setattr(args, arg, True)
75087523
if getattr(args, 'nodes', None) is None:
75097524
args.nodes = self.node_handle
75107525

75117526
if not self.group_check_printed:
75127527
self.helpers.check_required_groups(check_render=True, check_video=False)
75137528
self.group_check_printed = True
75147529

7530+
# Initialize variables for both power management and base board temps
7531+
npm_dict = {"limit": "N/A", "status": "N/A"}
7532+
power_unit = "W"
7533+
limit = "N/A"
7534+
base_board_temp_dict = {}
7535+
75157536
# Get NPM info
7516-
if args.nodes is not None:
7517-
try:
7518-
npm_info = amdsmi_interface.amdsmi_get_npm_info(args.nodes)
7519-
except amdsmi_exception.AmdSmiLibraryException as e:
7520-
logging.debug("amdsmi_get_npm_info failed: %s", e.get_error_info())
7537+
if args.power_management:
7538+
if args.nodes is not None:
7539+
try:
7540+
npm_info = amdsmi_interface.amdsmi_get_npm_info(args.nodes)
7541+
except amdsmi_exception.AmdSmiLibraryException as e:
7542+
logging.debug("amdsmi_get_npm_info failed: %s", e.get_error_info())
7543+
npm_info = "N/A"
7544+
else:
7545+
logging.debug('No node handle available to query NPM info')
75217546
npm_info = "N/A"
7522-
else:
7523-
logging.debug('No node handle available to query NPM info')
7524-
npm_info = "N/A"
75257547

7526-
# Log outputs
7527-
npm_dict = {"limit": "N/A", "status": "N/A"}
7528-
power_unit ="W"
7548+
if isinstance(npm_info, dict):
7549+
limit = npm_info.get('limit', "N/A")
7550+
status = npm_info.get('status', npm_info.get('current', "N/A"))
75297551

7530-
limit = "N/A"
7531-
if isinstance(npm_info, dict):
7532-
limit = npm_info.get('limit', "N/A")
7533-
status = npm_info.get('status', npm_info.get('current', "N/A"))
7534-
7535-
if limit !="N/A":
7536-
npm_dict['limit'] = limit
7537-
status = "DISABLED" if status == amdsmi_interface.amdsmi_wrapper.AMDSMI_NPM_STATUS_DISABLED else "ENABLED"
7538-
npm_dict.update({"status": status})
7552+
if limit !="N/A":
7553+
npm_dict['limit'] = limit
7554+
status = "DISABLED" if status == amdsmi_interface.amdsmi_wrapper.AMDSMI_NPM_STATUS_DISABLED else "ENABLED"
7555+
npm_dict.update({"status": status})
7556+
7557+
# Get base board temperatures using node_handle
7558+
if args.base_board_temps:
7559+
if args.nodes is not None:
7560+
try:
7561+
# Get device_handle from node_handle
7562+
device_handle = amdsmi_interface.amdsmi_get_device_handle_from_node(args.nodes)
7563+
gpu_id = self.helpers.get_gpu_id_from_device_handle(device_handle)
7564+
base_board_temp_dict = self.helpers.get_base_board_temperatures(device_handle, gpu_id, self.logger)
7565+
except amdsmi_exception.AmdSmiLibraryException as e:
7566+
logging.debug("Failed to get device handle from node: %s", e.get_error_info())
7567+
base_board_temp_dict = {}
7568+
7569+
# Print output
75397570
if self.logger.is_human_readable_format() and self.logger.destination == 'stdout':
7540-
print(f"NODE:\n POWER_MANAGEMENT:\n LIMIT: {npm_dict.get('limit', 'N/A')} {power_unit}\n STATUS: {npm_dict.get('status', 'N/A')}")
7571+
node_output = ["NODE:"]
7572+
if args.power_management:
7573+
node_output.append(" POWER_MANAGEMENT:")
7574+
node_output.append(f" LIMIT: {npm_dict.get('limit', 'N/A')} {power_unit}")
7575+
node_output.append(f" STATUS: {npm_dict.get('status', 'N/A')}")
7576+
if args.base_board_temps and base_board_temp_dict:
7577+
node_output.append(" BASEBOARD:")
7578+
node_output.append(" TEMPERATURE:")
7579+
for temp_name, temp_value in base_board_temp_dict.items():
7580+
node_output.append(f" {temp_name.upper()}: {temp_value}")
7581+
print("\n".join(node_output))
75417582
else:
75427583
if self.logger.is_csv_format():
75437584
csv_dict = {}
7544-
csv_dict['limit'] = npm_dict.get('limit', "N/A")
7545-
csv_dict['status'] = npm_dict.get('status', "N/A")
7585+
if args.power_management:
7586+
csv_dict['limit'] = npm_dict.get('limit', "N/A")
7587+
csv_dict['status'] = npm_dict.get('status', "N/A")
7588+
if args.base_board_temps and base_board_temp_dict:
7589+
csv_dict.update(base_board_temp_dict)
75467590
self.logger.output = csv_dict
75477591
else:
75487592
# For JSON and human readable format with file output
7549-
npm_dict["limit"] = self.helpers.unit_format(self.logger, limit, power_unit)
7550-
self.logger.output = {'node': {'power_management': npm_dict}}
7593+
node_output = {}
7594+
if args.power_management:
7595+
npm_dict["limit"] = self.helpers.unit_format(self.logger, limit, power_unit)
7596+
node_output['power_management'] = npm_dict
7597+
if args.base_board_temps and base_board_temp_dict:
7598+
node_output['base_board'] = {'temperature': base_board_temp_dict}
7599+
self.logger.output = {'node': node_output}
75517600
if multiple_devices:
75527601
self.logger.store_multiple_device_output()
75537602
return

projects/amdsmi/amdsmi_cli/amdsmi_parser.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1690,13 +1690,14 @@ def _add_ras_parser(self, subparsers: argparse._SubParsersAction, func):
16901690

16911691
def _add_node_parser(self, subparsers: argparse._SubParsersAction, func):
16921692
# Subparser help text
1693-
node_help = "Gets power information for the node"
1693+
node_help = "Gets power and baseboard information for the node"
16941694
node_subcommand_help = f"{self.description}\n\nReturns information for node 0 on the system.\
16951695
\nIf no node argument is provided, all node information will be displayed."
16961696
node_optionals_title = "Node arguments"
16971697

16981698
# Help text for Node arguments
16991699
power_management_help = "Displays power management information"
1700+
base_board_temps_help = "Displays baseboard temperatures"
17001701

17011702
node_parser = subparsers.add_parser("node", help=node_help, description=node_subcommand_help)
17021703
node_parser._optionals.title = node_optionals_title
@@ -1705,6 +1706,7 @@ def _add_node_parser(self, subparsers: argparse._SubParsersAction, func):
17051706

17061707
# Optional Args
17071708
node_parser.add_argument('-p', '--power-management', action='store_true', required=False, help=power_management_help)
1709+
node_parser.add_argument('-b', '--base-board-temps', action='store_true', required=False, help=base_board_temps_help)
17081710

17091711
# Add Universal Arguments
17101712
self._add_command_modifiers(node_parser)

projects/amdsmi/include/amd_smi/amdsmi.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2820,6 +2820,27 @@ amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle,
28202820
*/
28212821
amdsmi_status_t amdsmi_get_node_handle(amdsmi_processor_handle processor_handle, amdsmi_node_handle *node_handle);
28222822

2823+
/**
2824+
* @brief Get the processor (device) handle associated with a node handle.
2825+
*
2826+
* @ingroup tagProcDiscovery
2827+
*
2828+
* @platform{gpu_bm_linux}
2829+
*
2830+
* @details This function retrieves the processor (device) handle from a node handle.
2831+
* The @p node_handle must be provided for the node. This is the inverse operation
2832+
* of amdsmi_get_node_handle API.
2833+
* Currently, only AMD GPUs are supported.
2834+
*
2835+
* @param[in] node_handle The ::amdsmi_node_handle that identifies
2836+
* the node from which to retrieve the associated device handle.
2837+
*
2838+
* @param[out] processor_handle A pointer to a block of memory where amdsmi_processor_handle
2839+
* will be written.
2840+
*
2841+
* @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail
2842+
*/
2843+
amdsmi_status_t amdsmi_get_device_handle_from_node(amdsmi_node_handle node_handle, amdsmi_processor_handle *processor_handle);
28232844

28242845
#ifdef ENABLE_ESMI_LIB
28252846
/**

projects/amdsmi/py-interface/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
from .amdsmi_interface import amdsmi_get_socket_info
3232
from .amdsmi_interface import amdsmi_get_processor_count_from_handles
3333
from .amdsmi_interface import amdsmi_get_processor_handles_by_type
34+
from .amdsmi_interface import amdsmi_get_node_handle
35+
from .amdsmi_interface import amdsmi_get_device_handle_from_node
36+
from .amdsmi_interface import amdsmi_get_npm_info
3437

3538
# ESMI Dependent Functions
3639
try:

projects/amdsmi/py-interface/amdsmi_interface.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4596,6 +4596,34 @@ def amdsmi_get_node_handle(processor_handle):
45964596
return node_handle
45974597

45984598

4599+
def amdsmi_get_device_handle_from_node(node_handle):
4600+
"""
4601+
Get the processor (device) handle associated with a node handle.
4602+
4603+
This function retrieves the processor (device) handle from a node handle.
4604+
This is the inverse operation of amdsmi_get_node_handle.
4605+
4606+
Args:
4607+
node_handle: A node handle (amdsmi_node_handle) to get the device handle from.
4608+
4609+
Returns:
4610+
amdsmi_processor_handle: The processor handle associated with the node.
4611+
4612+
Raises:
4613+
AmdSmiParameterException: If node_handle is not the correct type.
4614+
AmdSmiLibraryException: If the library call fails.
4615+
"""
4616+
if not isinstance(node_handle, amdsmi_wrapper.amdsmi_node_handle):
4617+
raise AmdSmiParameterException(node_handle, amdsmi_wrapper.amdsmi_node_handle)
4618+
4619+
processor_handle = amdsmi_wrapper.amdsmi_processor_handle()
4620+
_check_res(
4621+
amdsmi_wrapper.amdsmi_get_device_handle_from_node(node_handle, ctypes.byref(processor_handle))
4622+
)
4623+
4624+
return processor_handle
4625+
4626+
45994627
def amdsmi_get_npm_info(node_handle: processor_handle_t) -> Dict[str, Any]:
46004628
if not isinstance(node_handle, amdsmi_wrapper.amdsmi_node_handle):
46014629
raise AmdSmiParameterException(node_handle, amdsmi_wrapper.amdsmi_node_handle)

projects/amdsmi/py-interface/amdsmi_wrapper.py

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -990,6 +990,21 @@ class struct_amdsmi_enumeration_info_t(Structure):
990990
class struct_amdsmi_pcie_info_t(Structure):
991991
pass
992992

993+
class struct_pcie_static_(Structure):
994+
pass
995+
996+
struct_pcie_static_._pack_ = 1 # source:False
997+
struct_pcie_static_._fields_ = [
998+
('max_pcie_width', ctypes.c_uint16),
999+
('PADDING_0', ctypes.c_ubyte * 2),
1000+
('max_pcie_speed', ctypes.c_uint32),
1001+
('pcie_interface_version', ctypes.c_uint32),
1002+
('slot_type', amdsmi_card_form_factor_t),
1003+
('max_pcie_interface_version', ctypes.c_uint32),
1004+
('PADDING_1', ctypes.c_ubyte * 4),
1005+
('reserved', ctypes.c_uint64 * 9),
1006+
]
1007+
9931008
class struct_pcie_metric_(Structure):
9941009
pass
9951010

@@ -1010,21 +1025,6 @@ class struct_pcie_metric_(Structure):
10101025
('reserved', ctypes.c_uint64 * 12),
10111026
]
10121027

1013-
class struct_pcie_static_(Structure):
1014-
pass
1015-
1016-
struct_pcie_static_._pack_ = 1 # source:False
1017-
struct_pcie_static_._fields_ = [
1018-
('max_pcie_width', ctypes.c_uint16),
1019-
('PADDING_0', ctypes.c_ubyte * 2),
1020-
('max_pcie_speed', ctypes.c_uint32),
1021-
('pcie_interface_version', ctypes.c_uint32),
1022-
('slot_type', amdsmi_card_form_factor_t),
1023-
('max_pcie_interface_version', ctypes.c_uint32),
1024-
('PADDING_1', ctypes.c_ubyte * 4),
1025-
('reserved', ctypes.c_uint64 * 9),
1026-
]
1027-
10281028
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
10291029
struct_amdsmi_pcie_info_t._fields_ = [
10301030
('pcie_static', struct_pcie_static_),
@@ -2617,6 +2617,9 @@ class struct_amdsmi_sock_info_t(Structure):
26172617
amdsmi_get_node_handle = _libraries['libamd_smi.so'].amdsmi_get_node_handle
26182618
amdsmi_get_node_handle.restype = amdsmi_status_t
26192619
amdsmi_get_node_handle.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.POINTER(None))]
2620+
amdsmi_get_device_handle_from_node = _libraries['libamd_smi.so'].amdsmi_get_device_handle_from_node
2621+
amdsmi_get_device_handle_from_node.restype = amdsmi_status_t
2622+
amdsmi_get_device_handle_from_node.argtypes = [amdsmi_node_handle, ctypes.POINTER(ctypes.POINTER(None))]
26202623
amdsmi_get_cpucore_handles = _libraries['libamd_smi.so'].amdsmi_get_cpucore_handles
26212624
amdsmi_get_cpucore_handles.restype = amdsmi_status_t
26222625
amdsmi_get_cpucore_handles.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.POINTER(None))]
@@ -3419,9 +3422,9 @@ class struct_amdsmi_cper_hdr_t(Structure):
34193422
'AMDSMI_PROCESSOR_TYPE_NON_AMD_GPU',
34203423
'AMDSMI_PROCESSOR_TYPE_UNKNOWN', 'AMDSMI_PTL_DATA_FORMAT_BF16',
34213424
'AMDSMI_PTL_DATA_FORMAT_F16', 'AMDSMI_PTL_DATA_FORMAT_F32',
3422-
'AMDSMI_PTL_DATA_FORMAT_F64', 'AMDSMI_PTL_DATA_FORMAT_I8',
3423-
'AMDSMI_PTL_DATA_FORMAT_F8', 'AMDSMI_PTL_DATA_FORMAT_VECTOR',
3424-
'AMDSMI_PTL_DATA_FORMAT_INVALID',
3425+
'AMDSMI_PTL_DATA_FORMAT_F64', 'AMDSMI_PTL_DATA_FORMAT_F8',
3426+
'AMDSMI_PTL_DATA_FORMAT_I8', 'AMDSMI_PTL_DATA_FORMAT_INVALID',
3427+
'AMDSMI_PTL_DATA_FORMAT_VECTOR',
34253428
'AMDSMI_PWR_PROF_PRST_3D_FULL_SCR_MASK',
34263429
'AMDSMI_PWR_PROF_PRST_BOOTUP_DEFAULT',
34273430
'AMDSMI_PWR_PROF_PRST_COMPUTE_MASK',
@@ -3604,7 +3607,8 @@ class struct_amdsmi_cper_hdr_t(Structure):
36043607
'amdsmi_get_cpu_socket_power', 'amdsmi_get_cpu_socket_power_cap',
36053608
'amdsmi_get_cpu_socket_power_cap_max',
36063609
'amdsmi_get_cpu_socket_temperature', 'amdsmi_get_cpucore_handles',
3607-
'amdsmi_get_cpusocket_handles', 'amdsmi_get_dfc_ctrl',
3610+
'amdsmi_get_cpusocket_handles',
3611+
'amdsmi_get_device_handle_from_node', 'amdsmi_get_dfc_ctrl',
36083612
'amdsmi_get_energy_count', 'amdsmi_get_esmi_err_msg',
36093613
'amdsmi_get_fw_info',
36103614
'amdsmi_get_gpu_accelerator_partition_profile',

0 commit comments

Comments
 (0)