Skip to content

Commit b431dad

Browse files
Fix for 22430 - SmartSwitch - Reboot cause for DPUs not updated on complete system reboot (#631)
1 parent 7733080 commit b431dad

File tree

2 files changed

+110
-18
lines changed

2 files changed

+110
-18
lines changed

sonic-chassisd/scripts/chassisd

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ CHASSIS_MODULE_REBOOT_TIMESTAMP_FIELD = 'timestamp'
8080
CHASSIS_MODULE_REBOOT_REBOOT_FIELD = 'reboot'
8181
DEFAULT_LINECARD_REBOOT_TIMEOUT = 180
8282
DEFAULT_DPU_REBOOT_TIMEOUT = 360
83+
MAX_DPU_REBOOT_DURATION = 800
8384
PLATFORM_ENV_CONF_FILE = "/usr/share/sonic/platform/platform_env.conf"
8485
PLATFORM_JSON_FILE = "/usr/share/sonic/platform/platform.json"
8586

@@ -766,6 +767,25 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
766767
else:
767768
return 'empty'
768769

770+
def retrieve_dpu_reboot_info(self, module):
    """
    Look up the stored reboot record for a module.

    Reads "previous-reboot-cause.json" from the module's directory
    (the lower-cased module name) under MODULE_REBOOT_CAUSE_DIR.

    Args:
        module: Module name; lower-cased to build the directory name.

    Returns:
        A (cause, time_string) tuple taken from the "cause" and "name"
        keys of the JSON document; either element is None when its key
        is absent. (None, None) is returned when the file does not exist
        or when any exception is raised while locating or parsing it.
    """
    try:
        cause_file = os.path.join(MODULE_REBOOT_CAUSE_DIR, module.lower(), "previous-reboot-cause.json")

        # Guard clause: a missing file is only reported at debug level.
        if not os.path.exists(cause_file):
            self.log_debug(f"{module}: previous-reboot-cause.json not found")
            return None, None

        with open(cause_file, 'r') as fh:
            record = json.load(fh)

        # "name" holds the timestamp, formatted "YYYY_MM_DD_HH_MM_SS".
        return record.get("cause"), record.get("name")
    except Exception as err:
        # Best-effort read: log the failure and fall through to the
        # "unavailable" result rather than propagating.
        self.log_error(f"{module}: Failed to read previous-reboot-cause.json: {err}")

    return None, None
788+
769789
def clear_transition_flag(self, key):
770790
status, fvs = self.module_table.get(key)
771791
if status and fvs:
@@ -812,23 +832,44 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
812832
current_status = module_info_dict[CHASSIS_MODULE_INFO_OPERSTATUS_FIELD]
813833

814834
# Operational status transitioning to offline
815-
if prev_status != str(ModuleBase.MODULE_STATUS_OFFLINE) and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE):
835+
if prev_status != ModuleBase.MODULE_STATUS_EMPTY and prev_status != str(ModuleBase.MODULE_STATUS_OFFLINE) and current_status == str(ModuleBase.MODULE_STATUS_OFFLINE):
816836
self.log_notice("{} operational status transitioning to offline".format(key))
817837

818838
# Persist dpu down time
819839
self.persist_dpu_reboot_time(key)
840+
# persist reboot cause
841+
reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
842+
self.persist_dpu_reboot_cause(reboot_cause, key)
843+
# publish reboot cause to db
844+
self.update_dpu_reboot_cause_to_db(key)
820845

821-
# Clear transition flag in STATE_DB
822-
self.clear_transition_flag(key)
846+
elif (prev_status == ModuleBase.MODULE_STATUS_EMPTY or prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE)) and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE):
847+
self.log_notice(f"{key} operational status transitioning to online")
823848

824-
elif prev_status == str(ModuleBase.MODULE_STATUS_OFFLINE) and current_status != str(ModuleBase.MODULE_STATUS_OFFLINE):
825-
self.log_notice("{} operational status transitioning to online".format(key))
826849
reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
827-
828-
if not self.retrieve_dpu_reboot_time(key) is None or self._is_first_boot(key):
829-
# persist reboot cause
850+
if isinstance(reboot_cause, (tuple, list)):
851+
current_cause = reboot_cause[0]
852+
else:
853+
current_cause = reboot_cause
854+
855+
stored_cause, stored_time_str = self.retrieve_dpu_reboot_info(key)
856+
857+
is_reboot = False
858+
if current_cause and stored_cause and stored_time_str:
859+
try:
860+
stored_dt = datetime.strptime(stored_time_str, "%Y_%m_%d_%H_%M_%S").replace(tzinfo=timezone.utc)
861+
now = datetime.now(timezone.utc)
862+
delta_sec = (now - stored_dt).total_seconds()
863+
864+
if current_cause == stored_cause and delta_sec < MAX_DPU_REBOOT_DURATION:
865+
self.log_info(f"{key}: is_reboot=True — same reboot cause within {int(delta_sec)}s")
866+
is_reboot = True
867+
except Exception as e:
868+
self.log_error(f"{key}: Reboot cause/time comparison failed: {e}")
869+
870+
if not is_reboot and (stored_time_str is not None or self._is_first_boot(key)):
871+
# persist reboot cause and publish to db
830872
self.persist_dpu_reboot_cause(reboot_cause, key)
831-
# publish reboot cause to db
832873
self.update_dpu_reboot_cause_to_db(key)
833874

834875
# Clear transition flag in STATE_DB

sonic-chassisd/tests/test_chassisd.py

Lines changed: 60 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -223,15 +223,13 @@ def test_smartswitch_moduleupdater_status_transitions():
223223
module_updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis)
224224

225225
# Mock dependent methods
226-
with patch.object(module_updater, 'retrieve_dpu_reboot_time', return_value="2024-11-19T00:00:00") \
227-
as mock_retrieve_reboot_time, \
228-
patch.object(module_updater, '_is_first_boot', return_value=False) as mock_is_first_boot, \
229-
patch.object(module_updater, 'persist_dpu_reboot_cause') as mock_persist_reboot_cause, \
230-
patch.object(module_updater, 'update_dpu_reboot_cause_to_db') as mock_update_reboot_db, \
231-
patch("os.makedirs") as mock_makedirs, \
232-
patch("builtins.open", mock_open()) as mock_file, \
233-
patch.object(module_updater, '_get_history_path',
234-
return_value="/tmp/prev_reboot_time.txt") as mock_get_history_path:
226+
with patch.object(module_updater, 'retrieve_dpu_reboot_info', return_value=("Switch rebooted DPU", "2023_01_01_00_00_00")) as mock_reboot_info, \
227+
patch.object(module_updater, '_is_first_boot', return_value=False) as mock_is_first_boot, \
228+
patch.object(module_updater, 'persist_dpu_reboot_cause') as mock_persist_reboot_cause, \
229+
patch.object(module_updater, 'update_dpu_reboot_cause_to_db') as mock_update_reboot_db, \
230+
patch("os.makedirs") as mock_makedirs, \
231+
patch("builtins.open", mock_open()) as mock_file, \
232+
patch.object(module_updater, '_get_history_path', return_value="/tmp/prev_reboot_time.txt") as mock_get_history_path:
235233

236234
# Transition from ONLINE to OFFLINE
237235
offline_status = ModuleBase.MODULE_STATUS_OFFLINE
@@ -255,6 +253,59 @@ def test_smartswitch_moduleupdater_status_transitions():
255253
mock_persist_reboot_cause.assert_called_once()
256254
mock_update_reboot_db.assert_called_once()
257255

256+
def test_online_transition_skips_reboot_update():
    """
    An OFFLINE -> ONLINE transition must not persist or publish the reboot
    cause when the stored record carries the same cause and a fresh timestamp.
    """
    chassis = MockSmartSwitchChassis()
    dpu = MockModule(0, "DPU0", "DPU", ModuleBase.MODULE_TYPE_DPU, 0, "SN123")
    dpu.set_oper_status(ModuleBase.MODULE_STATUS_OFFLINE)
    chassis.module_list.append(dpu)

    updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, chassis)

    # Flip the module to ONLINE before the update cycle runs.
    dpu.set_oper_status(ModuleBase.MODULE_STATUS_ONLINE)

    # Stored record: same cause as the module reports, stamped "now" (UTC).
    cause_text = "Switch rebooted DPU"
    stamp = datetime.now(timezone.utc).strftime("%Y_%m_%d_%H_%M_%S")
    stored_record = (cause_text, stamp)

    with patch.object(updater, 'retrieve_dpu_reboot_info', return_value=stored_record), \
         patch.object(dpu, 'get_reboot_cause', return_value=cause_text), \
         patch.object(updater, '_is_first_boot', return_value=False), \
         patch.object(updater, 'persist_dpu_reboot_cause') as persist_spy, \
         patch.object(updater, 'update_dpu_reboot_cause_to_db') as publish_spy, \
         patch("builtins.open", mock_open()), \
         patch("os.makedirs"), \
         patch.object(updater, '_get_history_path', return_value="/tmp/fake.json"):

        updater.module_db_update()

        # Neither the persist step nor the DB publish step may have run.
        persist_spy.assert_not_called()
        publish_spy.assert_not_called()
284+
285+
def test_retrieve_dpu_reboot_info_success():
    """retrieve_dpu_reboot_info returns the "cause" and "name" values of the JSON file."""

    class DummyChassis:
        # Minimal chassis stand-in: reports no modules and no midplane switch.
        def get_num_modules(self):
            return 0

        def init_midplane_switch(self):
            return False

    updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, DummyChassis())

    expected_cause = "Switch rebooted DPU"
    expected_time = "2025_06_25_17_18_52"
    payload = json.dumps({"cause": expected_cause, "name": expected_time})

    # Pretend the file exists and serve the payload from a mocked open().
    with patch("os.path.exists", return_value=True), \
         patch("builtins.open", mock_open(read_data=payload)):
        result = updater.retrieve_dpu_reboot_info("dpu0")
        assert result[0] == expected_cause
        assert result[1] == expected_time
297+
298+
def test_retrieve_dpu_reboot_info_file_missing():
    """retrieve_dpu_reboot_info returns (None, None) when the JSON file is absent."""

    class DummyChassis:
        # Minimal chassis stand-in handed to SmartSwitchModuleUpdater.
        def get_num_modules(self):
            return 0

        def init_midplane_switch(self):
            return False

    updater = SmartSwitchModuleUpdater(SYSLOG_IDENTIFIER, DummyChassis())

    # Report the path as non-existent so the lookup takes its "not found" branch.
    with patch("os.path.exists", return_value=False):
        result = updater.retrieve_dpu_reboot_info("dpu0")
        assert result[0] is None
        assert result[1] is None
308+
258309
def test_smartswitch_moduleupdater_check_invalid_name():
259310
chassis = MockSmartSwitchChassis()
260311
index = 0

0 commit comments

Comments
 (0)