Skip to content

Commit fde28a6

Browse files
authored
[Smartswitch] Fix chassisd to prevent crashes on smartswitch (#658)
1 parent cd8c4a7 commit fde28a6

File tree

2 files changed

+66
-114
lines changed

2 files changed

+66
-114
lines changed

sonic-chassisd/scripts/chassisd

Lines changed: 60 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ INVALID_IP = '0.0.0.0'
102102
CHASSIS_MODULE_ADMIN_STATUS = 'admin_status'
103103
MODULE_ADMIN_DOWN = 0
104104
MODULE_ADMIN_UP = 1
105+
MODULE_PRE_SHUTDOWN = 2
105106
MODULE_REBOOT_CAUSE_DIR = "/host/reboot-cause/module/"
106107
MAX_HISTORY_FILES = 10
107108

@@ -218,15 +219,13 @@ class ModuleConfigUpdater(logger.Logger):
218219

219220
class SmartSwitchModuleConfigUpdater(logger.Logger):
220221

221-
def __init__(self, log_identifier, chassis, module_table, set_flag_callback):
222+
def __init__(self, log_identifier, chassis):
222223
"""
223224
Constructor for SmartSwitchModuleConfigUpdater
224225
:param chassis: Object representing a platform chassis
225226
"""
226227
super(SmartSwitchModuleConfigUpdater, self).__init__(log_identifier)
227228
self.chassis = chassis
228-
self.module_table = module_table
229-
self.set_transition_flag = set_flag_callback
230229

231230
def deinit(self):
232231
"""
@@ -249,17 +248,16 @@ class SmartSwitchModuleConfigUpdater(logger.Logger):
249248

250249
if (admin_state == MODULE_ADMIN_DOWN) or (admin_state == MODULE_ADMIN_UP):
251250
self.log_info("Changing module {} to admin {} state".format(key, 'DOWN' if admin_state == MODULE_ADMIN_DOWN else 'UP'))
252-
t = threading.Thread(target=self.submit_callback, args=(module_index, admin_state))
251+
t = threading.Thread(target=self.submit_callback, args=(module_index, admin_state, key))
253252
t.start()
254253
else:
255254
self.log_warning("Invalid admin_state value: {}".format(admin_state))
256255

257-
def submit_callback(self, module_index, admin_state):
256+
def submit_callback(self, module_index, admin_state, key):
258257
if admin_state == MODULE_ADMIN_DOWN:
259258
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
260259
# there are no actions taken during this function execution.
261260
try_get(self.chassis.get_module(module_index).module_pre_shutdown, default=False)
262-
263261
try_get(self.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
264262
if admin_state == MODULE_ADMIN_UP:
265263
# This is only valid on platforms which have pci_rescan sensord changes required. If it is not implemented,
@@ -360,7 +358,7 @@ class ModuleUpdater(logger.Logger):
360358
fvs = self.module_table.get(key)
361359
if isinstance(fvs, list) and fvs[0] is True:
362360
fvs = dict(fvs[-1])
363-
return fvs[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]
361+
return fvs.get(CHASSIS_MODULE_INFO_OPERSTATUS_FIELD, ModuleBase.MODULE_STATUS_EMPTY)
364362
return ModuleBase.MODULE_STATUS_EMPTY
365363

366364
def get_module_admin_status(self, chassis_module_name):
@@ -725,6 +723,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
725723
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
726724
self.down_modules = {}
727725
self.chassis_app_db_clean_sha = None
726+
self.module_transition_flag_helper = ModuleTransitionFlagHelper()
728727

729728
self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
730729
if not self.midplane_initialized:
@@ -786,28 +785,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
786785
self.log_error(f"{module}: Failed to read previous-reboot-cause.json: {e}")
787786
return None, None
788787

789-
def clear_transition_flag(self, key):
790-
status, fvs = self.module_table.get(key)
791-
if status and fvs:
792-
fvs_dict = dict(fvs)
793-
if 'state_transition_in_progress' in fvs_dict:
794-
fvs_dict['state_transition_in_progress'] = 'False'
795-
else:
796-
self.log_warning(f"Transition flag not found in {key}, adding it as False.")
797-
fvs_dict['state_transition_in_progress'] = 'False'
798-
self.module_table.set(key, list(fvs_dict.items()))
799-
else:
800-
self.log_error(f"Could not clear module admin_status transition flag for {key}")
801-
802-
def clear_all_transition_flags(self):
803-
self.log_warning("Clearing all stale state_transition_in_progress flags at startup")
804-
keys = self.module_table.getKeys()
805-
for key in keys:
806-
try:
807-
self.clear_transition_flag(key)
808-
except Exception as e:
809-
self.log_error(f"Failed to clear transition flag for {key}: {e}")
810-
811788
def module_db_update(self):
812789
for module_index in range(0, self.num_modules):
813790
module_info_dict = self._get_module_info(module_index)
@@ -838,6 +815,9 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
838815
# Persist dpu down time
839816
self.persist_dpu_reboot_time(key)
840817
# persist reboot cause
818+
# Clear transition flag in STATE_DB
819+
self.module_transition_flag_helper.clear_transition_flag(key)
820+
841821
reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
842822
self.persist_dpu_reboot_cause(reboot_cause, key)
843823
# publish reboot cause to db
@@ -873,7 +853,7 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
873853
self.update_dpu_reboot_cause_to_db(key)
874854

875855
# Clear transition flag in STATE_DB
876-
self.clear_transition_flag(key)
856+
self.module_transition_flag_helper.clear_transition_flag(key)
877857

878858
def _get_module_info(self, module_index):
879859
"""
@@ -1215,10 +1195,9 @@ class ConfigManagerTask(ProcessTaskBase):
12151195

12161196

12171197
class SmartSwitchConfigManagerTask(ProcessTaskBase):
1218-
def __init__(self, set_transition_flag_callback):
1198+
def __init__(self):
12191199
super(SmartSwitchConfigManagerTask, self).__init__()
12201200
self.logger = logger.Logger(SYSLOG_IDENTIFIER)
1221-
self.set_transition_flag = set_transition_flag_callback
12221201
self.config_updater = None
12231202

12241203
def task_worker(self):
@@ -1228,9 +1207,7 @@ class SmartSwitchConfigManagerTask(ProcessTaskBase):
12281207

12291208
self.config_updater = SmartSwitchModuleConfigUpdater(
12301209
SYSLOG_IDENTIFIER,
1231-
get_chassis(),
1232-
module_table,
1233-
self.set_transition_flag
1210+
get_chassis()
12341211
)
12351212

12361213
# Subscribe to CHASSIS_MODULE table notifications in the Config DB
@@ -1359,6 +1336,33 @@ class DpuStateUpdater(logger.Logger):
13591336
self._update_dp_dpu_state('down')
13601337
self._update_cp_dpu_state('down')
13611338

1339+
class ModuleTransitionFlagHelper(logger.Logger):
1340+
def __init__(self, log_identifier = SYSLOG_IDENTIFIER):
1341+
super(ModuleTransitionFlagHelper, self).__init__(log_identifier)
1342+
# Use new connector to avoid redis failures
1343+
"""Create a helper function to get the module table,
1344+
since multiple threads updating with the same connector will cause redis failures"""
1345+
state_db = daemon_base.db_connect("STATE_DB")
1346+
self.module_table = swsscommon.Table(state_db, CHASSIS_MODULE_INFO_TABLE)
1347+
1348+
def set_transition_flag(self, module_name):
1349+
try:
1350+
self.module_table.hset(module_name, 'state_transition_in_progress', 'True')
1351+
self.module_table.hset(module_name, 'transition_start_time', datetime.now(timezone.utc).replace(tzinfo=None).isoformat())
1352+
except Exception as e:
1353+
self.log_error(f"Error setting transition flag for {module_name}: {e}")
1354+
1355+
def clear_transition_flag(self, module_name):
1356+
try:
1357+
self.log_info(f"Clearing transition flag for {module_name}")
1358+
self.module_table.hdel(module_name, 'state_transition_in_progress')
1359+
self.module_table.hdel(module_name, 'transition_start_time')
1360+
except Exception as e:
1361+
self.log_error(f"Error clearing transition flag for {module_name}: {e}")
1362+
1363+
def clear_all_transition_flags(self):
1364+
for module_name in self.module_table.getKeys():
1365+
self.clear_transition_flag(module_name)
13621366

13631367
#
13641368
# Daemon =======================================================================
@@ -1396,48 +1400,14 @@ class ChassisdDaemon(daemon_base.DaemonBase):
13961400
else:
13971401
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))
13981402

1399-
def set_transition_flag(self, module_table, key):
1400-
self.log_info(f"set transition flag: key={key}")
1401-
1402-
_, fvs = module_table.get(key)
1403-
1404-
# Merge existing fields, preserving all others
1405-
fvs_dict = dict(fvs) if fvs else {}
1406-
1407-
# Ensure mandatory fields are present, but only if missing
1408-
mandatory_defaults = {
1409-
'desc': 'N/A',
1410-
'slot': 'N/A',
1411-
'serial': 'N/A',
1412-
'oper_status': ModuleBase.MODULE_STATUS_EMPTY
1413-
}
1414-
for field, default in mandatory_defaults.items():
1415-
if field not in fvs_dict:
1416-
self.log_info(f"{key}: '{field}' missing, setting default '{default}'")
1417-
fvs_dict[field] = default
1418-
1419-
# Update or add transition fields
1420-
fvs_dict.update({
1421-
'state_transition_in_progress': 'True',
1422-
'transition_start_time': get_formatted_time()
1423-
})
1424-
1425-
# Write merged data back
1426-
module_table.set(key, list(fvs_dict.items()))
1427-
14281403
def submit_dpu_callback(self, module_index, admin_state, module_name):
1429-
if admin_state == MODULE_ADMIN_DOWN:
1430-
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
1431-
# there are no actions taken during this function execution.
1432-
try_get(self.module_updater.chassis.get_module(module_index).module_pre_shutdown, default=False)
1404+
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
1405+
# there are no actions taken during this function execution.
1406+
try_get(self.module_updater.chassis.get_module(module_index).module_pre_shutdown, default=False)
14331407
# Set admin_state change in progress using the centralized method
1434-
self.set_transition_flag(self.module_updater.module_table, module_name)
1435-
try_get(self.module_updater.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
1436-
if admin_state == MODULE_ADMIN_UP:
1437-
# This is only valid on platforms which have pci_rescan sensord changes required. If it is not implemented,
1438-
# there are no actions taken during this function execution.
1439-
try_get(self.module_updater.chassis.get_module(module_index).module_post_startup, default=False)
1440-
pass
1408+
if admin_state == MODULE_ADMIN_DOWN:
1409+
ModuleTransitionFlagHelper().set_transition_flag(module_name)
1410+
try_get(self.module_updater.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
14411411

14421412
def set_initial_dpu_admin_state(self):
14431413
"""Send admin_state trigger once to modules those are powered up"""
@@ -1451,9 +1421,11 @@ class ChassisdDaemon(daemon_base.DaemonBase):
14511421
try:
14521422
# Get admin state of DPU
14531423
admin_state = self.module_updater.get_module_admin_status(module_name)
1454-
if admin_state == ModuleBase.MODULE_STATUS_EMPTY and operational_state != ModuleBase.MODULE_STATUS_OFFLINE:
1455-
# shutdown DPU
1456-
op = MODULE_ADMIN_DOWN
1424+
if admin_state == ModuleBase.MODULE_STATUS_EMPTY:
1425+
op = MODULE_PRE_SHUTDOWN
1426+
if operational_state != ModuleBase.MODULE_STATUS_OFFLINE:
1427+
# shutdown DPU
1428+
op = MODULE_ADMIN_DOWN
14571429

14581430
# Initialize DPU_STATE DB table on bootup
14591431
dpu_state_key = "DPU_STATE|" + module_name
@@ -1501,7 +1473,7 @@ class ChassisdDaemon(daemon_base.DaemonBase):
15011473

15021474
# Start configuration manager task
15031475
if self.smartswitch:
1504-
config_manager = SmartSwitchConfigManagerTask(self.set_transition_flag)
1476+
config_manager = SmartSwitchConfigManagerTask()
15051477
config_manager.task_run()
15061478
elif self.module_updater.supervisor_slot == self.module_updater.my_slot:
15071479
config_manager = ConfigManagerTask()
@@ -1514,8 +1486,16 @@ class ChassisdDaemon(daemon_base.DaemonBase):
15141486

15151487
# Set the initial DPU admin state for SmartSwitch
15161488
if self.smartswitch:
1517-
self.module_updater.clear_all_transition_flags()
1489+
# Clear all stale transition flags for SmartSwitch on startup
1490+
ModuleTransitionFlagHelper().clear_all_transition_flags()
1491+
self.set_initial_dpu_admin_state()
1492+
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
1493+
module_transition_flag_helper = ModuleTransitionFlagHelper()
1494+
# Clear all stale transition flags for SmartSwitch on startup
1495+
module_transition_flag_helper.clear_all_transition_flags()
15181496
self.set_initial_dpu_admin_state()
1497+
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
1498+
module_transition_flag_helper.clear_all_transition_flags()
15191499

15201500
while not self.stop.wait(CHASSIS_INFO_UPDATE_PERIOD_SECS):
15211501
self.module_updater.module_db_update()

sonic-chassisd/tests/test_chassisd.py

Lines changed: 6 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -327,13 +327,9 @@ def test_smartswitch_moduleupdater_check_invalid_name():
327327
fvs = module_updater.module_table.get(name)
328328
assert fvs == None
329329

330-
mock_module_table = MagicMock()
331-
mock_set_flag_callback = MagicMock()
332330
config_updater = SmartSwitchModuleConfigUpdater(
333331
SYSLOG_IDENTIFIER,
334332
chassis,
335-
mock_module_table,
336-
mock_set_flag_callback
337333
)
338334
admin_state = 0
339335
config_updater.module_config_update(name, admin_state)
@@ -361,13 +357,9 @@ def test_smartswitch_moduleupdater_check_invalid_admin_state():
361357
module_updater.module_db_update()
362358
fvs = module_updater.module_table.get(name)
363359

364-
mock_module_table = MagicMock()
365-
mock_set_flag_callback = MagicMock()
366360
config_updater = SmartSwitchModuleConfigUpdater(
367361
SYSLOG_IDENTIFIER,
368362
chassis,
369-
mock_module_table,
370-
mock_set_flag_callback
371363
)
372364
admin_state = 2
373365
config_updater.module_config_update(name, admin_state)
@@ -629,13 +621,9 @@ def test_smartswitch_configupdater_check_admin_state():
629621
module.set_oper_status(status)
630622
chassis.module_list.append(module)
631623

632-
mock_module_table = MagicMock()
633-
mock_set_flag_callback = MagicMock()
634624
config_updater = SmartSwitchModuleConfigUpdater(
635625
SYSLOG_IDENTIFIER,
636-
chassis,
637-
mock_module_table,
638-
mock_set_flag_callback
626+
chassis
639627
)
640628

641629
# Test setting admin state to down
@@ -1686,7 +1674,7 @@ def test_task_worker_loop():
16861674

16871675
# Patch the swsscommon.Select to use this mock
16881676
with patch('tests.mock_swsscommon.Select', return_value=mock_select):
1689-
config_manager = SmartSwitchConfigManagerTask(set_transition_flag_callback=MagicMock())
1677+
config_manager = SmartSwitchConfigManagerTask()
16901678

16911679
config_manager.config_updater = MagicMock()
16921680

@@ -1888,22 +1876,6 @@ def is_valid_date(date_str):
18881876
AssertionError("Date is not set!")
18891877
assert is_valid_date(date_value)
18901878

1891-
def test_clear_transition_flag_sets_false_when_flag_present():
1892-
module_table = MagicMock()
1893-
module_table.get.return_value = (True, [('state_transition_in_progress', 'True')])
1894-
1895-
# Use a real updater instance
1896-
updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, MagicMock())
1897-
updater.module_table = module_table
1898-
1899-
daemon_chassisd = ChassisdDaemon(SYSLOG_IDENTIFIER, MagicMock())
1900-
daemon_chassisd.module_updater = updater
1901-
1902-
daemon_chassisd.module_updater.clear_transition_flag("DPU0")
1903-
1904-
args = module_table.set.call_args[0][1]
1905-
assert ('state_transition_in_progress', 'False') in args
1906-
19071879
def test_smartswitch_moduleupdater_midplane_state_change():
19081880
"""Test that when midplane goes down, control plane and data plane states are set to down"""
19091881
chassis = MockSmartSwitchChassis()
@@ -2009,9 +1981,9 @@ def test_submit_dpu_callback():
20091981
patch.object(module, 'module_post_startup') as mock_post_startup:
20101982

20111983
module_updater.module_table.get = MagicMock(return_value=(True, []))
2012-
daemon_chassisd.submit_dpu_callback(index, MODULE_ADMIN_UP, name)
1984+
daemon_chassisd.submit_dpu_callback(index, MODULE_PRE_SHUTDOWN, name)
20131985

20141986
# Verify correct functions are called for admin up
2015-
mock_pre_shutdown.assert_not_called()
2016-
mock_set_admin_state.assert_called_once_with(MODULE_ADMIN_UP)
2017-
mock_post_startup.assert_called_once()
1987+
mock_pre_shutdown.assert_called_once()
1988+
mock_set_admin_state.assert_not_called()
1989+
mock_post_startup.assert_not_called()

0 commit comments

Comments
 (0)