@@ -102,6 +102,7 @@ INVALID_IP = '0.0.0.0'
102102CHASSIS_MODULE_ADMIN_STATUS = 'admin_status'
103103MODULE_ADMIN_DOWN = 0
104104MODULE_ADMIN_UP = 1
105+ MODULE_PRE_SHUTDOWN = 2
105106MODULE_REBOOT_CAUSE_DIR = "/host/reboot-cause/module/"
106107MAX_HISTORY_FILES = 10
107108
@@ -218,15 +219,13 @@ class ModuleConfigUpdater(logger.Logger):
218219
219220class SmartSwitchModuleConfigUpdater (logger .Logger ):
220221
221- def __init__ (self , log_identifier , chassis , module_table , set_flag_callback ):
222+ def __init__ (self , log_identifier , chassis ):
222223 """
223224 Constructor for SmartSwitchModuleConfigUpdater
224225 :param chassis: Object representing a platform chassis
225226 """
226227 super (SmartSwitchModuleConfigUpdater , self ).__init__ (log_identifier )
227228 self .chassis = chassis
228- self .module_table = module_table
229- self .set_transition_flag = set_flag_callback
230229
231230 def deinit (self ):
232231 """
@@ -249,17 +248,16 @@ class SmartSwitchModuleConfigUpdater(logger.Logger):
249248
250249 if (admin_state == MODULE_ADMIN_DOWN ) or (admin_state == MODULE_ADMIN_UP ):
251250 self .log_info ("Changing module {} to admin {} state" .format (key , 'DOWN' if admin_state == MODULE_ADMIN_DOWN else 'UP' ))
252- t = threading .Thread (target = self .submit_callback , args = (module_index , admin_state ))
251+ t = threading .Thread (target = self .submit_callback , args = (module_index , admin_state , key ))
253252 t .start ()
254253 else :
255254 self .log_warning ("Invalid admin_state value: {}" .format (admin_state ))
256255
257- def submit_callback (self , module_index , admin_state ):
256+ def submit_callback (self , module_index , admin_state , key ):
258257 if admin_state == MODULE_ADMIN_DOWN :
259258 # This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
260259 # there are no actions taken during this function execution.
261260 try_get (self .chassis .get_module (module_index ).module_pre_shutdown , default = False )
262-
263261 try_get (self .chassis .get_module (module_index ).set_admin_state , admin_state , default = False )
264262 if admin_state == MODULE_ADMIN_UP :
265263 # This is only valid on platforms which have pci_rescan sensord changes required. If it is not implemented,
@@ -360,7 +358,7 @@ class ModuleUpdater(logger.Logger):
360358 fvs = self .module_table .get (key )
361359 if isinstance (fvs , list ) and fvs [0 ] is True :
362360 fvs = dict (fvs [- 1 ])
363- return fvs [ CHASSIS_MODULE_INFO_OPERSTATUS_FIELD ]
361+ return fvs . get ( CHASSIS_MODULE_INFO_OPERSTATUS_FIELD , ModuleBase . MODULE_STATUS_EMPTY )
364362 return ModuleBase .MODULE_STATUS_EMPTY
365363
366364 def get_module_admin_status (self , chassis_module_name ):
@@ -725,6 +723,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
725723 self .module_reboot_table = swsscommon .Table (self .chassis_state_db , CHASSIS_MODULE_REBOOT_INFO_TABLE )
726724 self .down_modules = {}
727725 self .chassis_app_db_clean_sha = None
726+ self .module_transition_flag_helper = ModuleTransitionFlagHelper ()
728727
729728 self .midplane_initialized = try_get (chassis .init_midplane_switch , default = False )
730729 if not self .midplane_initialized :
@@ -786,28 +785,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
786785 self .log_error (f"{ module } : Failed to read previous-reboot-cause.json: { e } " )
787786 return None , None
788787
789- def clear_transition_flag (self , key ):
790- status , fvs = self .module_table .get (key )
791- if status and fvs :
792- fvs_dict = dict (fvs )
793- if 'state_transition_in_progress' in fvs_dict :
794- fvs_dict ['state_transition_in_progress' ] = 'False'
795- else :
796- self .log_warning (f"Transition flag not found in { key } , adding it as False." )
797- fvs_dict ['state_transition_in_progress' ] = 'False'
798- self .module_table .set (key , list (fvs_dict .items ()))
799- else :
800- self .log_error (f"Could not clear module admin_status transition flag for { key } " )
801-
802- def clear_all_transition_flags (self ):
803- self .log_warning ("Clearing all stale state_transition_in_progress flags at startup" )
804- keys = self .module_table .getKeys ()
805- for key in keys :
806- try :
807- self .clear_transition_flag (key )
808- except Exception as e :
809- self .log_error (f"Failed to clear transition flag for { key } : { e } " )
810-
811788 def module_db_update (self ):
812789 for module_index in range (0 , self .num_modules ):
813790 module_info_dict = self ._get_module_info (module_index )
@@ -838,6 +815,9 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
838815 # Persist dpu down time
839816 self .persist_dpu_reboot_time (key )
840817 # persist reboot cause
818+ # Clear transition flag in STATE_DB
819+ self .module_transition_flag_helper .clear_transition_flag (key )
820+
841821 reboot_cause = try_get (self .chassis .get_module (module_index ).get_reboot_cause )
842822 self .persist_dpu_reboot_cause (reboot_cause , key )
843823 # publish reboot cause to db
@@ -873,7 +853,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
873853 self .update_dpu_reboot_cause_to_db (key )
874854
875855 # Clear transition flag in STATE_DB
876- self .clear_transition_flag (key )
856+ self .module_transition_flag_helper . clear_transition_flag (key )
877857
878858 def _get_module_info (self , module_index ):
879859 """
@@ -1215,10 +1195,9 @@ class ConfigManagerTask(ProcessTaskBase):
12151195
12161196
12171197class SmartSwitchConfigManagerTask (ProcessTaskBase ):
1218- def __init__ (self , set_transition_flag_callback ):
1198+ def __init__ (self ):
12191199 super (SmartSwitchConfigManagerTask , self ).__init__ ()
12201200 self .logger = logger .Logger (SYSLOG_IDENTIFIER )
1221- self .set_transition_flag = set_transition_flag_callback
12221201 self .config_updater = None
12231202
12241203 def task_worker (self ):
@@ -1228,9 +1207,7 @@ class SmartSwitchConfigManagerTask(ProcessTaskBase):
12281207
12291208 self .config_updater = SmartSwitchModuleConfigUpdater (
12301209 SYSLOG_IDENTIFIER ,
1231- get_chassis (),
1232- module_table ,
1233- self .set_transition_flag
1210+ get_chassis ()
12341211 )
12351212
12361213 # Subscribe to CHASSIS_MODULE table notifications in the Config DB
@@ -1359,6 +1336,33 @@ class DpuStateUpdater(logger.Logger):
13591336 self ._update_dp_dpu_state ('down' )
13601337 self ._update_cp_dpu_state ('down' )
13611338
1339+ class ModuleTransitionFlagHelper (logger .Logger ):
1340+ def __init__ (self , log_identifier = SYSLOG_IDENTIFIER ):
1341+ super (ModuleTransitionFlagHelper , self ).__init__ (log_identifier )
1342+ # Use new connector to avoid redis failures
1343+ """Create a helper function to get the module table,
1344+ since multiple threads updating with the same connector will cause redis failures"""
1345+ state_db = daemon_base .db_connect ("STATE_DB" )
1346+ self .module_table = swsscommon .Table (state_db , CHASSIS_MODULE_INFO_TABLE )
1347+
1348+ def set_transition_flag (self , module_name ):
1349+ try :
1350+ self .module_table .hset (module_name , 'state_transition_in_progress' , 'True' )
1351+ self .module_table .hset (module_name , 'transition_start_time' , datetime .now (timezone .utc ).replace (tzinfo = None ).isoformat ())
1352+ except Exception as e :
1353+ self .log_error (f"Error setting transition flag for { module_name } : { e } " )
1354+
1355+ def clear_transition_flag (self , module_name ):
1356+ try :
1357+ self .log_info (f"Clearing transition flag for { module_name } " )
1358+ self .module_table .hdel (module_name , 'state_transition_in_progress' )
1359+ self .module_table .hdel (module_name , 'transition_start_time' )
1360+ except Exception as e :
1361+ self .log_error (f"Error clearing transition flag for { module_name } : { e } " )
1362+
1363+ def clear_all_transition_flags (self ):
1364+ for module_name in self .module_table .getKeys ():
1365+ self .clear_transition_flag (module_name )
13621366
13631367#
13641368# Daemon =======================================================================
@@ -1396,48 +1400,14 @@ class ChassisdDaemon(daemon_base.DaemonBase):
13961400 else :
13971401 self .log_warning ("Caught unhandled signal '{}' - ignoring..." .format (SIGNALS_TO_NAMES_DICT [sig ]))
13981402
1399- def set_transition_flag (self , module_table , key ):
1400- self .log_info (f"set transition flag: key={ key } " )
1401-
1402- _ , fvs = module_table .get (key )
1403-
1404- # Merge existing fields, preserving all others
1405- fvs_dict = dict (fvs ) if fvs else {}
1406-
1407- # Ensure mandatory fields are present, but only if missing
1408- mandatory_defaults = {
1409- 'desc' : 'N/A' ,
1410- 'slot' : 'N/A' ,
1411- 'serial' : 'N/A' ,
1412- 'oper_status' : ModuleBase .MODULE_STATUS_EMPTY
1413- }
1414- for field , default in mandatory_defaults .items ():
1415- if field not in fvs_dict :
1416- self .log_info (f"{ key } : '{ field } ' missing, setting default '{ default } '" )
1417- fvs_dict [field ] = default
1418-
1419- # Update or add transition fields
1420- fvs_dict .update ({
1421- 'state_transition_in_progress' : 'True' ,
1422- 'transition_start_time' : get_formatted_time ()
1423- })
1424-
1425- # Write merged data back
1426- module_table .set (key , list (fvs_dict .items ()))
1427-
14281403 def submit_dpu_callback (self , module_index , admin_state , module_name ):
1429- if admin_state == MODULE_ADMIN_DOWN :
1430- # This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
1431- # there are no actions taken during this function execution.
1432- try_get (self .module_updater .chassis .get_module (module_index ).module_pre_shutdown , default = False )
1404+ # This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
1405+ # there are no actions taken during this function execution.
1406+ try_get (self .module_updater .chassis .get_module (module_index ).module_pre_shutdown , default = False )
14331407 # Set admin_state change in progress using the centralized method
1434- self .set_transition_flag (self .module_updater .module_table , module_name )
1435- try_get (self .module_updater .chassis .get_module (module_index ).set_admin_state , admin_state , default = False )
1436- if admin_state == MODULE_ADMIN_UP :
1437- # This is only valid on platforms which have pci_rescan sensord changes required. If it is not implemented,
1438- # there are no actions taken during this function execution.
1439- try_get (self .module_updater .chassis .get_module (module_index ).module_post_startup , default = False )
1440- pass
1408+ if admin_state == MODULE_ADMIN_DOWN :
1409+ ModuleTransitionFlagHelper ().set_transition_flag (module_name )
1410+ try_get (self .module_updater .chassis .get_module (module_index ).set_admin_state , admin_state , default = False )
14411411
14421412 def set_initial_dpu_admin_state (self ):
14431413 """Send admin_state trigger once to modules those are powered up"""
@@ -1451,9 +1421,11 @@ class ChassisdDaemon(daemon_base.DaemonBase):
14511421 try :
14521422 # Get admin state of DPU
14531423 admin_state = self .module_updater .get_module_admin_status (module_name )
1454- if admin_state == ModuleBase .MODULE_STATUS_EMPTY and operational_state != ModuleBase .MODULE_STATUS_OFFLINE :
1455- # shutdown DPU
1456- op = MODULE_ADMIN_DOWN
1424+ if admin_state == ModuleBase .MODULE_STATUS_EMPTY :
1425+ op = MODULE_PRE_SHUTDOWN
1426+ if operational_state != ModuleBase .MODULE_STATUS_OFFLINE :
1427+ # shutdown DPU
1428+ op = MODULE_ADMIN_DOWN
14571429
14581430 # Initialize DPU_STATE DB table on bootup
14591431 dpu_state_key = "DPU_STATE|" + module_name
@@ -1501,7 +1473,7 @@ class ChassisdDaemon(daemon_base.DaemonBase):
15011473
15021474 # Start configuration manager task
15031475 if self .smartswitch :
1504- config_manager = SmartSwitchConfigManagerTask (self . set_transition_flag )
1476+ config_manager = SmartSwitchConfigManagerTask ()
15051477 config_manager .task_run ()
15061478 elif self .module_updater .supervisor_slot == self .module_updater .my_slot :
15071479 config_manager = ConfigManagerTask ()
@@ -1514,8 +1486,16 @@ class ChassisdDaemon(daemon_base.DaemonBase):
15141486
15151487 # Set the initial DPU admin state for SmartSwitch
15161488 if self .smartswitch :
1517- self .module_updater .clear_all_transition_flags ()
1489+ # Clear all stale transition flags for SmartSwitch on startup
1490+ ModuleTransitionFlagHelper ().clear_all_transition_flags ()
1491+ self .set_initial_dpu_admin_state ()
1492+ # Clear all transition flags for SmartSwitch after setting the initial DPU admin state
1493+ module_transition_flag_helper = ModuleTransitionFlagHelper ()
1494+ # Clear all stale transition flags for SmartSwitch on startup
1495+ module_transition_flag_helper .clear_all_transition_flags ()
15181496 self .set_initial_dpu_admin_state ()
1497+ # Clear all transition flags for SmartSwitch after setting the initial DPU admin state
1498+ module_transition_flag_helper .clear_all_transition_flags ()
15191499
15201500 while not self .stop .wait (CHASSIS_INFO_UPDATE_PERIOD_SECS ):
15211501 self .module_updater .module_db_update ()
0 commit comments