@@ -7489,65 +7489,114 @@ def ras(self, args, multiple_devices=False, gpu=None, cper=None, afid=None,
74897489 time .sleep (1 )
74907490
74917491
7492- def node (self , args , multiple_devices = False , nodes = None , power_management = None ):
7492+ def node (self , args , multiple_devices = False , nodes = None , power_management = None , base_board_temps = None ):
74937493 """List node informations
74947494
74957495 Args:
74967496 args (Namespace): Namespace containing the parsed CLI args
74977497 multiple_devices (bool, optional): True if checking for multiple devices.
74987498 Defaults to False.
7499+ nodes (node_handle, optional): node_handle for target node. Defaults to None.
7500+ power_management (bool, optional): Value override for args.power_management. Defaults to None.
7501+ base_board_temps (bool, optional): Value override for args.base_board_temps. Defaults to None.
74997502
75007503 Returns:
75017504 None: Print output via AMDSMILogger to destination
75027505 """
75037506 # Set args.* to passed in arguments
75047507 if nodes :
75057508 args .nodes = nodes
7506- if power_management :
7507- args .power_management = power_management
7509+ # Store args that are applicable to the current platform
7510+ current_platform_args = ["power_management" , "base_board_temps" ]
7511+
7512+ # Check if any node-specific options were passed via command line
7513+ current_platform_values = []
7514+ if args .power_management :
7515+ current_platform_values += [args .power_management ]
7516+ if args .base_board_temps :
7517+ current_platform_values += [args .base_board_temps ]
7518+
7519+ # If no node options are passed, enable all by default
7520+ if not any (current_platform_values ):
7521+ for arg in current_platform_args :
7522+ setattr (args , arg , True )
75087523 if getattr (args , 'nodes' , None ) is None :
75097524 args .nodes = self .node_handle
75107525
75117526 if not self .group_check_printed :
75127527 self .helpers .check_required_groups (check_render = True , check_video = False )
75137528 self .group_check_printed = True
75147529
7530+ # Initialize variables for both power management and base board temps
7531+ npm_dict = {"limit" : "N/A" , "status" : "N/A" }
7532+ power_unit = "W"
7533+ limit = "N/A"
7534+ base_board_temp_dict = {}
7535+
75157536 # Get NPM info
7516- if args .nodes is not None :
7517- try :
7518- npm_info = amdsmi_interface .amdsmi_get_npm_info (args .nodes )
7519- except amdsmi_exception .AmdSmiLibraryException as e :
7520- logging .debug ("amdsmi_get_npm_info failed: %s" , e .get_error_info ())
7537+ if args .power_management :
7538+ if args .nodes is not None :
7539+ try :
7540+ npm_info = amdsmi_interface .amdsmi_get_npm_info (args .nodes )
7541+ except amdsmi_exception .AmdSmiLibraryException as e :
7542+ logging .debug ("amdsmi_get_npm_info failed: %s" , e .get_error_info ())
7543+ npm_info = "N/A"
7544+ else :
7545+ logging .debug ('No node handle available to query NPM info' )
75217546 npm_info = "N/A"
7522- else :
7523- logging .debug ('No node handle available to query NPM info' )
7524- npm_info = "N/A"
75257547
7526- # Log outputs
7527- npm_dict = { " limit" : "N/A" , "status" : " N/A"}
7528- power_unit = "W"
7548+ if isinstance ( npm_info , dict ):
7549+ limit = npm_info . get ( 'limit' , "N/A" )
7550+ status = npm_info . get ( 'status' , npm_info . get ( 'current' , "N/A" ))
75297551
7530- limit = "N/A"
7531- if isinstance (npm_info , dict ):
7532- limit = npm_info .get ('limit' , "N/A" )
7533- status = npm_info .get ('status' , npm_info .get ('current' , "N/A" ))
7534-
7535- if limit != "N/A" :
7536- npm_dict ['limit' ] = limit
7537- status = "DISABLED" if status == amdsmi_interface .amdsmi_wrapper .AMDSMI_NPM_STATUS_DISABLED else "ENABLED"
7538- npm_dict .update ({"status" : status })
7552+ if limit != "N/A" :
7553+ npm_dict ['limit' ] = limit
7554+ status = "DISABLED" if status == amdsmi_interface .amdsmi_wrapper .AMDSMI_NPM_STATUS_DISABLED else "ENABLED"
7555+ npm_dict .update ({"status" : status })
7556+
7557+ # Get base board temperatures using node_handle
7558+ if args .base_board_temps :
7559+ if args .nodes is not None :
7560+ try :
7561+ # Get device_handle from node_handle
7562+ device_handle = amdsmi_interface .amdsmi_get_device_handle_from_node (args .nodes )
7563+ gpu_id = self .helpers .get_gpu_id_from_device_handle (device_handle )
7564+ base_board_temp_dict = self .helpers .get_base_board_temperatures (device_handle , gpu_id , self .logger )
7565+ except amdsmi_exception .AmdSmiLibraryException as e :
7566+ logging .debug ("Failed to get device handle from node: %s" , e .get_error_info ())
7567+ base_board_temp_dict = {}
7568+
7569+ # Print output
75397570 if self .logger .is_human_readable_format () and self .logger .destination == 'stdout' :
7540- print (f"NODE:\n POWER_MANAGEMENT:\n LIMIT: { npm_dict .get ('limit' , 'N/A' )} { power_unit } \n STATUS: { npm_dict .get ('status' , 'N/A' )} " )
7571+ node_output = ["NODE:" ]
7572+ if args .power_management :
7573+ node_output .append (" POWER_MANAGEMENT:" )
7574+ node_output .append (f" LIMIT: { npm_dict .get ('limit' , 'N/A' )} { power_unit } " )
7575+ node_output .append (f" STATUS: { npm_dict .get ('status' , 'N/A' )} " )
7576+ if args .base_board_temps and base_board_temp_dict :
7577+ node_output .append (" BASEBOARD:" )
7578+ node_output .append (" TEMPERATURE:" )
7579+ for temp_name , temp_value in base_board_temp_dict .items ():
7580+ node_output .append (f" { temp_name .upper ()} : { temp_value } " )
7581+ print ("\n " .join (node_output ))
75417582 else :
75427583 if self .logger .is_csv_format ():
75437584 csv_dict = {}
7544- csv_dict ['limit' ] = npm_dict .get ('limit' , "N/A" )
7545- csv_dict ['status' ] = npm_dict .get ('status' , "N/A" )
7585+ if args .power_management :
7586+ csv_dict ['limit' ] = npm_dict .get ('limit' , "N/A" )
7587+ csv_dict ['status' ] = npm_dict .get ('status' , "N/A" )
7588+ if args .base_board_temps and base_board_temp_dict :
7589+ csv_dict .update (base_board_temp_dict )
75467590 self .logger .output = csv_dict
75477591 else :
75487592 # For JSON and human readable format with file output
7549- npm_dict ["limit" ] = self .helpers .unit_format (self .logger , limit , power_unit )
7550- self .logger .output = {'node' : {'power_management' : npm_dict }}
7593+ node_output = {}
7594+ if args .power_management :
7595+ npm_dict ["limit" ] = self .helpers .unit_format (self .logger , limit , power_unit )
7596+ node_output ['power_management' ] = npm_dict
7597+ if args .base_board_temps and base_board_temp_dict :
7598+ node_output ['base_board' ] = {'temperature' : base_board_temp_dict }
7599+ self .logger .output = {'node' : node_output }
75517600 if multiple_devices :
75527601 self .logger .store_multiple_device_output ()
75537602 return
0 commit comments