Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 19 additions & 63 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -254,16 +254,7 @@ class SmartSwitchModuleConfigUpdater(logger.Logger):
self.log_warning("Invalid admin_state value: {}".format(admin_state))

def submit_callback(self, module_index, admin_state, key):
if admin_state == MODULE_ADMIN_DOWN:
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.chassis.get_module(module_index).module_pre_shutdown, default=False)
try_get(self.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
if admin_state == MODULE_ADMIN_UP:
# This is only valid on platforms which have pci_rescan sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.chassis.get_module(module_index).module_post_startup, default=False)
pass
try_get(self.chassis.get_module(module_index).set_admin_state_gracefully, admin_state, default=False)

#
# Module Updater ==============================================================
Expand Down Expand Up @@ -723,7 +714,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.down_modules = {}
self.chassis_app_db_clean_sha = None
self.module_transition_flag_helper = ModuleTransitionFlagHelper()

self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
if not self.midplane_initialized:
Expand Down Expand Up @@ -815,9 +805,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
# Persist dpu down time
self.persist_dpu_reboot_time(key)
# persist reboot cause
# Clear transition flag in STATE_DB
self.module_transition_flag_helper.clear_transition_flag(key)

reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
self.persist_dpu_reboot_cause(reboot_cause, key)
# publish reboot cause to db
Expand Down Expand Up @@ -852,9 +839,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
self.persist_dpu_reboot_cause(reboot_cause, key)
self.update_dpu_reboot_cause_to_db(key)

# Clear transition flag in STATE_DB
self.module_transition_flag_helper.clear_transition_flag(key)

def _get_module_info(self, module_index):
"""
Retrieves module info of this module
Expand Down Expand Up @@ -1336,34 +1320,6 @@ class DpuStateUpdater(logger.Logger):
self._update_dp_dpu_state('down')
self._update_cp_dpu_state('down')

class ModuleTransitionFlagHelper(logger.Logger):
    """Set and clear per-module state-transition markers in the module table.

    Every instance opens its own STATE_DB connection. The original author's
    note explains why: multiple threads updating through the same connector
    cause redis failures, so callers create a fresh helper instead of
    sharing one.
    """

    def __init__(self, log_identifier=SYSLOG_IDENTIFIER):
        """Connect to STATE_DB and bind the chassis module info table.

        Args:
            log_identifier: Identifier handed to the logger base class.
        """
        super().__init__(log_identifier)
        # Dedicated connector per helper instance (see class docstring).
        self.module_table = swsscommon.Table(
            daemon_base.db_connect("STATE_DB"), CHASSIS_MODULE_INFO_TABLE)

    def set_transition_flag(self, module_name):
        """Mark ``module_name`` as being in a state transition.

        Writes 'state_transition_in_progress' = 'True' and then
        'transition_start_time' as a naive ISO-8601 timestamp taken from the
        current UTC time. Any exception is logged and swallowed.
        """
        try:
            self.module_table.hset(module_name, 'state_transition_in_progress', 'True')
            # Take the time in UTC, then drop tzinfo so the stored string
            # carries no offset suffix.
            started_at = datetime.now(timezone.utc).replace(tzinfo=None)
            self.module_table.hset(module_name, 'transition_start_time', started_at.isoformat())
        except Exception as err:
            self.log_error(f"Error setting transition flag for {module_name}: {err}")

    def clear_transition_flag(self, module_name):
        """Remove both transition fields for ``module_name``.

        Any exception is logged and swallowed.
        """
        try:
            self.log_info(f"Clearing transition flag for {module_name}")
            # Same deletion order as before: flag first, then start time.
            for field in ('state_transition_in_progress', 'transition_start_time'):
                self.module_table.hdel(module_name, field)
        except Exception as err:
            self.log_error(f"Error clearing transition flag for {module_name}: {err}")

    def clear_all_transition_flags(self):
        """Clear the transition fields of every key in the module table."""
        for name in self.module_table.getKeys():
            self.clear_transition_flag(name)

#
# Daemon =======================================================================
#
Expand Down Expand Up @@ -1400,32 +1356,41 @@ class ChassisdDaemon(daemon_base.DaemonBase):
else:
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))

def submit_dpu_callback(self, module_index, admin_state, module_name):
def submit_dpu_callback(self, module_index, admin_state):
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.module_updater.chassis.get_module(module_index).module_pre_shutdown, default=False)
if admin_state == MODULE_PRE_SHUTDOWN:
try_get(self.module_updater.chassis.get_module(module_index).module_pre_shutdown, default=False)
# Set admin_state change in progress using the centralized method
if admin_state == MODULE_ADMIN_DOWN:
ModuleTransitionFlagHelper().set_transition_flag(module_name)
try_get(self.module_updater.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
try_get(self.module_updater.chassis.get_module(module_index).set_admin_state_gracefully,
admin_state, default=False)

def set_initial_dpu_admin_state(self):
"""Send admin_state trigger once to modules those are powered up"""
threads = []
for module_index in range(0, self.module_updater.num_modules):
op = None
# Get operational state of DPU
module_name = self.platform_chassis.get_module(module_index).get_name()

# Clear any existing state transition flags
self.module_updater.chassis.get_module(module_index).clear_module_state_transition(module_name)
self.module_updater.chassis.get_module(module_index).clear_module_gnoi_halt_in_progress()

# Get operational state of DPU
op = None
operational_state = self.platform_chassis.get_module(module_index).get_oper_status()

try:
# Get admin state of DPU
admin_state = self.module_updater.get_module_admin_status(module_name)
if admin_state == ModuleBase.MODULE_STATUS_EMPTY:
op = MODULE_PRE_SHUTDOWN
if operational_state != ModuleBase.MODULE_STATUS_OFFLINE:
# shutdown DPU
if operational_state == ModuleBase.MODULE_STATUS_ONLINE:
# DPU is online and needs full shutdown
op = MODULE_ADMIN_DOWN
elif admin_state == 'down':
# Admin state is explicitly set to down - issue shutdown
op = MODULE_ADMIN_DOWN

# Initialize DPU_STATE DB table on bootup
dpu_state_key = "DPU_STATE|" + module_name
Expand All @@ -1437,7 +1402,7 @@ class ChassisdDaemon(daemon_base.DaemonBase):

if op is not None:
# Create and start a thread for the DPU logic
thread = threading.Thread(target=self.submit_dpu_callback, args=(module_index, op, module_name))
thread = threading.Thread(target=self.submit_dpu_callback, args=(module_index, op))
thread.daemon = True # Set as a daemon thread
thread.start()
threads.append(thread)
Expand Down Expand Up @@ -1486,16 +1451,7 @@ class ChassisdDaemon(daemon_base.DaemonBase):

# Set the initial DPU admin state for SmartSwitch
if self.smartswitch:
# Clear all stale transition flags for SmartSwitch on startup
ModuleTransitionFlagHelper().clear_all_transition_flags()
self.set_initial_dpu_admin_state()
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
module_transition_flag_helper = ModuleTransitionFlagHelper()
# Clear all stale transition flags for SmartSwitch on startup
module_transition_flag_helper.clear_all_transition_flags()
self.set_initial_dpu_admin_state()
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
module_transition_flag_helper.clear_all_transition_flags()

while not self.stop.wait(CHASSIS_INFO_UPDATE_PERIOD_SECS):
self.module_updater.module_db_update()
Expand Down
12 changes: 12 additions & 0 deletions sonic-chassisd/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,18 @@ def module_pre_shutdown(self):
def module_post_startup(self):
    """No-op post-startup hook of the mock platform module."""
    return None

def set_admin_state_gracefully(self, up):
    """Mock graceful admin-state change.

    Forwards ``up`` unchanged to ``set_admin_state`` and returns whatever
    that call returns.
    """
    outcome = self.set_admin_state(up)
    return outcome

def clear_module_state_transition(self, module_name):
    """Mock of clear_module_state_transition.

    ``module_name`` is accepted but not used; the mock always returns True.
    """
    cleared = True
    return cleared

def clear_module_gnoi_halt_in_progress(self):
    """Mock of clear_module_gnoi_halt_in_progress; always returns True."""
    cleared = True
    return cleared

def is_midplane_reachable(self):
    """Return the mock's stored ``midplane_access`` value."""
    reachable = self.midplane_access
    return reachable

Expand Down
Loading
Loading