Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
ecd68c9
Refactored for graceful shutdown
rameshraghupathy Aug 24, 2025
b022801
Refactored for graceful shutdown
rameshraghupathy Aug 25, 2025
ef2f282
Refactored for graceful shutdown, fixing UT
rameshraghupathy Aug 26, 2025
98c2146
Refactored for graceful shutdown, fixing UT
rameshraghupathy Sep 1, 2025
b959da9
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 8, 2025
5b2c0f6
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
b02d32e
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
fde2fd4
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
3aee661
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
983992d
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
ad81510
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
e74e00e
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
72f7eba
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
b920d0f
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
6d3bb42
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
468f2ee
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
7a28e87
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
36d0df1
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
32a4ee6
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
8f7e3ae
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
9f406f5
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
b0dafa2
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
e56983a
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
9f81fac
Addressed review comments after the refactoring
rameshraghupathy Sep 12, 2025
33c503e
Addressed review comments after the refactoring
rameshraghupathy Sep 12, 2025
08faf9d
Fixing ut
rameshraghupathy Sep 12, 2025
b612f80
Fixing ut
rameshraghupathy Sep 12, 2025
1987132
Fixing ut
rameshraghupathy Sep 12, 2025
3169b9d
Addressed review comments related to refactoring
rameshraghupathy Oct 2, 2025
500f3d3
Fixing test failures
rameshraghupathy Oct 2, 2025
8762717
Improving coverage
rameshraghupathy Oct 2, 2025
ddf58cc
Improving coverage
rameshraghupathy Oct 2, 2025
9223003
Merge branch 'sonic-net:master' into graceful-shutdown
rameshraghupathy Oct 2, 2025
8cd0c59
Improving coverage
rameshraghupathy Oct 2, 2025
75e0d47
Improving coverage
rameshraghupathy Oct 2, 2025
b424cb7
Improving coverage
rameshraghupathy Oct 2, 2025
663c496
Fixing test failures
rameshraghupathy Oct 2, 2025
8332565
Merge branch 'sonic-net:master' into graceful-shutdown
rameshraghupathy Oct 21, 2025
6148a5e
Aligning with the module_base.py changes such as common API, timezone,…
rameshraghupathy Oct 21, 2025
b44cd97
Merge branch 'graceful-shutdown' of https://github.com/rameshraghupat…
rameshraghupathy Oct 21, 2025
e1b2dc5
Fixing test failures
rameshraghupathy Oct 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 138 additions & 64 deletions sonic-chassisd/scripts/chassisd
Original file line number Diff line number Diff line change
Expand Up @@ -254,16 +254,39 @@ class SmartSwitchModuleConfigUpdater(logger.Logger):
self.log_warning("Invalid admin_state value: {}".format(admin_state))

def submit_callback(self, module_index, admin_state, key):
module = self.chassis.get_module(module_index)

# Use a local STATE_DB connector for centralized ModuleBase API
try:
v2 = swsscommon.SonicV2Connector(use_unix_socket_path=True)
v2.connect(v2.STATE_DB)
except Exception as e:
self.log_error(f"STATE_DB connect failed for transition flag: {e}")
v2 = None

if admin_state == MODULE_ADMIN_DOWN:
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.chassis.get_module(module_index).module_pre_shutdown, default=False)
try_get(self.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
if admin_state == MODULE_ADMIN_UP:
# This is only valid on platforms which have pci_rescan sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.chassis.get_module(module_index).module_post_startup, default=False)
pass
# Pre-shutdown (if implemented), then mark shutdown transition and drive admin down
try_get(module.module_pre_shutdown, default=False)
if v2:
try:
module.set_module_state_transition(v2, key, "shutdown")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rameshraghupathy when do we clear the transition flags set by chassisd admin state changes?

except Exception as e:
self.log_error(f"Failed to set shutdown transition for {key}: {e}")
try_get(module.set_admin_state, admin_state, default=False)

elif admin_state == MODULE_ADMIN_UP:
# Mark startup transition before bring-up
if v2:
try:
module.set_module_state_transition(v2, key, "startup")
except Exception as e:
self.log_error(f"Failed to set startup transition for {key}: {e}")
try_get(module.set_admin_state, admin_state, default=False)
# Optional post-startup hook (if implemented)
try_get(module.module_post_startup, default=False)

else:
self.log_warning(f"Invalid admin_state value: {admin_state}")

#
# Module Updater ==============================================================
Expand Down Expand Up @@ -723,7 +746,10 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
self.down_modules = {}
self.chassis_app_db_clean_sha = None
self.module_transition_flag_helper = ModuleTransitionFlagHelper()

# Centralized transition API: reuse one STATE_DB connector
self._state_v2 = swsscommon.SonicV2Connector(use_unix_socket_path=True)
self._state_v2.connect(self._state_v2.STATE_DB)

self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
if not self.midplane_initialized:
Expand Down Expand Up @@ -815,8 +841,12 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
# Persist dpu down time
self.persist_dpu_reboot_time(key)
# persist reboot cause
# Clear transition flag in STATE_DB
self.module_transition_flag_helper.clear_transition_flag(key)
# Clear transition flag in STATE_DB via ModuleBase centralized API
try:
module_obj = self.chassis.get_module(module_index)
module_obj.clear_module_state_transition(self._state_v2, key)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rameshraghupathy Clearing on an operational state change is a problem, because the change can be intentional or unintentional. Moreover, a reboot goes from online to offline to online, so the transition flag would be cleared during that state change, which is not the expected behavior.

except Exception as e:
self.log_error(f"Failed to clear transition for {key}: {e}")

reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
self.persist_dpu_reboot_cause(reboot_cause, key)
Expand Down Expand Up @@ -852,8 +882,12 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
self.persist_dpu_reboot_cause(reboot_cause, key)
self.update_dpu_reboot_cause_to_db(key)

# Clear transition flag in STATE_DB
self.module_transition_flag_helper.clear_transition_flag(key)
# Clear transition flag in STATE_DB via ModuleBase centralized API
try:
module_obj = self.chassis.get_module(module_index)
module_obj.clear_module_state_transition(self._state_v2, key)
except Exception as e:
self.log_error(f"Failed to clear transition for {key}: {e}")

def _get_module_info(self, module_index):
"""
Expand Down Expand Up @@ -1336,33 +1370,6 @@ class DpuStateUpdater(logger.Logger):
self._update_dp_dpu_state('down')
self._update_cp_dpu_state('down')

class ModuleTransitionFlagHelper(logger.Logger):
    """Set and clear per-module transition markers in the chassis module table.

    The markers are the 'state_transition_in_progress' and
    'transition_start_time' fields of a module's entry in
    CHASSIS_MODULE_INFO_TABLE (STATE_DB).

    Every instance opens its own STATE_DB connection: multiple threads
    updating with the same connector will cause redis failures.
    """

    def __init__(self, log_identifier = SYSLOG_IDENTIFIER):
        """Initialise the logger and bind the module table on a fresh STATE_DB connection."""
        super(ModuleTransitionFlagHelper, self).__init__(log_identifier)
        # Use a new connector to avoid redis failures (see class docstring).
        state_db = daemon_base.db_connect("STATE_DB")
        self.module_table = swsscommon.Table(state_db, CHASSIS_MODULE_INFO_TABLE)

    def set_transition_flag(self, module_name):
        """Mark `module_name` as being in a state transition and record the start time.

        The start time is the current UTC time written as a naive ISO-8601
        string (tzinfo stripped). Errors are logged, not raised.
        """
        try:
            self.module_table.hset(module_name, 'state_transition_in_progress', 'True')
            self.module_table.hset(module_name, 'transition_start_time', datetime.now(timezone.utc).replace(tzinfo=None).isoformat())
        except Exception as e:
            self.log_error(f"Error setting transition flag for {module_name}: {e}")

    def clear_transition_flag(self, module_name):
        """Remove both transition fields for `module_name`. Errors are logged, not raised."""
        try:
            self.log_info(f"Clearing transition flag for {module_name}")
            self.module_table.hdel(module_name, 'state_transition_in_progress')
            self.module_table.hdel(module_name, 'transition_start_time')
        except Exception as e:
            self.log_error(f"Error clearing transition flag for {module_name}: {e}")

    def clear_all_transition_flags(self):
        """Clear the transition fields of every key currently in the module table."""
        for module_name in self.module_table.getKeys():
            self.clear_transition_flag(module_name)

#
# Daemon =======================================================================
Expand Down Expand Up @@ -1401,22 +1408,88 @@ class ChassisdDaemon(daemon_base.DaemonBase):
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))

def submit_dpu_callback(self, module_index, admin_state, module_name):
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
# there are no actions taken during this function execution.
try_get(self.module_updater.chassis.get_module(module_index).module_pre_shutdown, default=False)
# Set admin_state change in progress using the centralized method
module = self.module_updater.chassis.get_module(module_index)

# Mark transition via centralized API for BOTH directions
try:
v2 = swsscommon.SonicV2Connector(use_unix_socket_path=True)
v2.connect(v2.STATE_DB)
except Exception as e:
self.log_error(f"Failed to connect STATE_DB for {module_name} transition: {e}")
v2 = None

if admin_state == MODULE_ADMIN_DOWN:
ModuleTransitionFlagHelper().set_transition_flag(module_name)
try_get(self.module_updater.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
# Only run pre-shutdown on DOWN path
try_get(module.module_pre_shutdown, default=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

module_pre_shutdown is supposed to be called if we need to power off the DPU, or if we need to stay in dark mode. The existing code was already written that way, so please align with it.

if v2:
try:
module.set_module_state_transition(v2, module_name, "shutdown")
except Exception as e:
self.log_error(f"Failed to set transition flag (shutdown) for {module_name}: {e}")
try_get(module.set_admin_state, admin_state, default=False)

elif admin_state == MODULE_ADMIN_UP:
# STARTUP path on daemon init or config change
if v2:
try:
module.set_module_state_transition(v2, module_name, "startup")
except Exception as e:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When is the module state transition from here cleared?

self.log_error(f"Failed to set transition flag (startup) for {module_name}: {e}")
try_get(module.set_admin_state, admin_state, default=False)
try_get(module.module_post_startup, default=False)

else:
# Preserve existing behavior for any special sentinel values (e.g., MODULE_PRE_SHUTDOWN)
try_get(module.module_pre_shutdown, default=False)

def clear_all_transition_flags_centralized(self):
    """
    Clear any lingering 'state_transition_in_progress' flags in CHASSIS_MODULE_TABLE.

    Keys matching "CHASSIS_MODULE_TABLE|*" are enumerated from STATE_DB. Each
    key's module name is mapped back to a platform module object, and the flag
    is cleared through that object's clear_module_state_transition() so that no
    raw table writes are made. Names that cannot be resolved to a module index
    are skipped.

    Nothing is raised to the caller. A failure to connect or enumerate keys is
    logged and ends the call. A failure while handling one module is logged and
    the remaining modules are still processed, so one bad entry cannot leave
    every other module's flag stale.
    """
    try:
        # Use DBConnector to enumerate keys; use SonicV2Connector to write via ModuleBase API
        state_db = daemon_base.db_connect("STATE_DB")
        keys = state_db.keys("CHASSIS_MODULE_TABLE|*")
        if not keys:
            return
        v2 = swsscommon.SonicV2Connector(use_unix_socket_path=True)
        v2.connect(v2.STATE_DB)
    except Exception as e:
        self.log_error(f"Failed to clear stale transition flags centrally: {e}")
        return

    for redis_key in keys:
        try:
            # Key layout is "<table>|<module name>"; ignore anything without a separator.
            _, sep, name = redis_key.partition("|")
            if not sep:
                continue
            idx = try_get(self.platform_chassis.get_module_index, name, default=-1)
            # NOTE(review): the None guard assumes try_get may hand back None
            # from the platform call; confirm against try_get's definition.
            if idx is None or idx < 0:
                # If we cannot resolve the module, skip (avoid raw writes)
                continue
            module_obj = self.platform_chassis.get_module(idx)
            entry = module_obj.get_module_state_transition(v2, name) or {}
            if entry.get("state_transition_in_progress") == "True":
                module_obj.clear_module_state_transition(v2, name)
        except Exception as e:
            # Keep going: the other modules' stale flags must still be cleared.
            self.log_error(f"Failed to clear stale transition flag for {redis_key}: {e}")

def set_initial_dpu_admin_state(self):
"""Send admin_state trigger once to modules those are powered up"""
"""Send admin_state trigger once to modules those are powered up,
and mark centralized 'startup' for DPUs intended UP but not ONLINE."""
threads = []

# One STATE_DB connector for centralized ModuleBase API writes
try:
v2 = swsscommon.SonicV2Connector(use_unix_socket_path=True)
v2.connect(v2.STATE_DB)
except Exception as e:
self.log_error(f"STATE_DB connect failed for initial transitions: {e}")
v2 = None

for module_index in range(0, self.module_updater.num_modules):
op = None
# Get operational state of DPU
module_name = self.platform_chassis.get_module(module_index).get_name()
operational_state = self.platform_chassis.get_module(module_index).get_oper_status()
module = self.platform_chassis.get_module(module_index)
module_name = module.get_name()
operational_state = module.get_oper_status()

try:
# Get admin state of DPU
Expand All @@ -1429,12 +1502,20 @@ class ChassisdDaemon(daemon_base.DaemonBase):

# Initialize DPU_STATE DB table on bootup
dpu_state_key = "DPU_STATE|" + module_name
if operational_state == ModuleBase.MODULE_STATUS_ONLINE:
op_state = 'up'
else:
op_state = 'down'
op_state = 'up' if operational_state == ModuleBase.MODULE_STATUS_ONLINE else 'down'
self.module_updater.update_dpu_state(dpu_state_key, op_state)

# mark startup for DPUs that are intended 'up' but not yet ONLINE
wants_up = (admin_state != 'down')
not_online = (str(operational_state).lower()
!= str(ModuleBase.MODULE_STATUS_ONLINE).lower())
if wants_up and not_online and v2:
try:
module.set_module_state_transition(v2, module_name, "startup")
self.log_info(f"Marked startup transition for {module_name} at boot")
except Exception as e:
self.log_error(f"Failed to set startup transition for {module_name}: {e}")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this section? This function was only present to handle the case where the CONFIG_DB entry is not present, in which case dark mode is implied and we power off the DPUs.


if op is not None:
# Create and start a thread for the DPU logic
thread = threading.Thread(target=self.submit_dpu_callback, args=(module_index, op, module_name))
Expand Down Expand Up @@ -1486,16 +1567,9 @@ class ChassisdDaemon(daemon_base.DaemonBase):

# Set the initial DPU admin state for SmartSwitch
if self.smartswitch:
# Clear all stale transition flags for SmartSwitch on startup
ModuleTransitionFlagHelper().clear_all_transition_flags()
self.set_initial_dpu_admin_state()
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
module_transition_flag_helper = ModuleTransitionFlagHelper()
# Clear all stale transition flags for SmartSwitch on startup
module_transition_flag_helper.clear_all_transition_flags()
# Clear stale transition flags ONCE at startup, then mark startup
self.clear_all_transition_flags_centralized()
self.set_initial_dpu_admin_state()
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
module_transition_flag_helper.clear_all_transition_flags()

while not self.stop.wait(CHASSIS_INFO_UPDATE_PERIOD_SECS):
self.module_updater.module_db_update()
Expand Down
Loading
Loading