Skip to content

Commit bb0a31c

Browse files
Adding support for persistent storage and retrieval of DPU reboot-cause (#169)
* Adding support for persistent storage and retrieval of DPU reboot-cause * Added support for persisting dpu reboot-cause on smartswitch host * Working on coverage * Working on ut coverage * working on coverage * working on coverage * working on coverage * working on coverage * working on coverage * Fixed a typo * Working on coverage * Fixing test failure * improving coverage * Improving coverage * working on coverage * Modifying reboot-cause workflow to meet multiple smartswitch vendor hardware implementation requirements * Fixing the assertions to meet the new change * Fixed the DB * Using the common API device_info.get_dpu_list() * Addressed review comments * Added new test file tests/process-reboot-cause_test.py * Added the scripts_path * Moved setup outside the test class * Fixed the file name * Fixing test issues * Working on UT * Fixed the number of arguments to load_module_from_source * addressed review comments * adding mock for uid * passing uid arg * Fixing test failure * Fixing test failure * Fixing test failure * Fixing test failure * Fixing test failure * Fixing test failure * Improving coverage * Improving coverage * Improving coverage * Improving coverage * Improving coverage * Improving coverage * Improving coverage * Improving coverage * Improving coverage * Improving coverage * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments: Using a common function read_reboot_cause_files_and_save_to_db for regular switch and smartswitch * Working on coverage * Working on coverage * Working on coverage * Working on coverage * Addressed review comments * Addressed review comments * Addressed review comments * Addressed review comments * Fixed a test issue * Fixed a test issue * Fixed a test 
issue * Fixed a test issue * Fixed a test issue * Fixed a test issue * Fixed a test issue * Fixed a test issue * Fixed a test issue * Fixed a test issue * Fixed a bug * Did a minor cleanup * Addressed a review comment
1 parent 5e08927 commit bb0a31c

File tree

4 files changed

+237
-20
lines changed

4 files changed

+237
-20
lines changed

scripts/determine-reboot-cause

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ VERSION = "1.0"
2424
SYSLOG_IDENTIFIER = "determine-reboot-cause"
2525

2626
REBOOT_CAUSE_DIR = "/host/reboot-cause/"
27+
REBOOT_CAUSE_MODULE_DIR = "/host/reboot-cause/module"
2728
REBOOT_CAUSE_HISTORY_DIR = "/host/reboot-cause/history/"
2829
REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "reboot-cause.txt")
2930
PREVIOUS_REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "previous-reboot-cause.json")
@@ -136,10 +137,10 @@ def find_hardware_reboot_cause():
136137

137138

138139
def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
139-
"""Store the key infomation of device reboot into a dictionary by parsing the string in
140+
"""Store the key information of device reboot into a dictionary by parsing the string in
140141
previous_reboot_cause.
141142
142-
If user issused a command to reboot device, then user, command and time will be
143+
If user issued a command to reboot device, then user, command and time will be
143144
stored into a dictionary.
144145
145146
If device was rebooted due to the kernel panic, then the string `Kernel Panic`
@@ -185,7 +186,7 @@ def determine_reboot_cause():
185186

186187
# The main decision logic of the reboot cause:
187188
# If there is a valid hardware reboot cause indicated by platform API,
188-
# check the software reboot cause to add additional rebot cause.
189+
# check the software reboot cause to add additional reboot cause.
189190
# If there is a reboot cause indicated by /proc/cmdline, and/or warmreboot/fastreboot/softreboot
190191
# the software_reboot_cause which is the content of /host/reboot-cause/reboot-cause.txt
191192
# will be treated as the additional reboot cause
@@ -211,6 +212,27 @@ def determine_reboot_cause():
211212

212213
return previous_reboot_cause, additional_reboot_info
213214

215+
def check_and_create_dpu_dirs():
    """Make sure every DPU has its reboot-cause directory tree on the host.

    For each DPU name returned by ``device_info.get_dpu_list()`` this ensures
    that ``<REBOOT_CAUSE_MODULE_DIR>/<dpu>`` and its ``history``
    sub-directory exist. When the DPU directory has to be created, a
    ``reboot-cause.txt`` file containing ``First boot`` is seeded inside it.
    """
    for dpu_name in device_info.get_dpu_list():
        module_dir = os.path.join(REBOOT_CAUSE_MODULE_DIR, dpu_name)

        if not os.path.exists(module_dir):
            os.makedirs(module_dir)

            # A directory that had to be created has no recorded cause yet,
            # so seed it with the first-boot marker.
            seed_path = os.path.join(module_dir, 'reboot-cause.txt')
            with open(seed_path, 'w') as seed_file:
                seed_file.write('First boot\n')

        module_history_dir = os.path.join(module_dir, "history")
        if not os.path.exists(module_history_dir):
            os.makedirs(module_history_dir)
214236

215237
def main():
216238
# Configure logger to log all messages INFO level and higher
@@ -261,6 +283,10 @@ def main():
261283
with open(REBOOT_CAUSE_FILE, "w") as cause_file:
262284
cause_file.write(REBOOT_CAUSE_UNKNOWN)
263285

286+
# Create directories for DPUs in SmartSwitch platforms
287+
if device_info.is_smartswitch():
288+
check_and_create_dpu_dirs()
289+
264290

265291
if __name__ == "__main__":
266292
main()

scripts/process-reboot-cause

Lines changed: 39 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,12 @@ try:
1414

1515
from swsscommon import swsscommon
1616
from sonic_py_common import syslogger
17+
from sonic_py_common import device_info
1718
except ImportError as err:
1819
raise ImportError("%s - required module not found" % str(err))
1920

2021
VERSION = "1.0"
22+
CHASSIS_SERVER_PORT = 6380
2123

2224
SYSLOG_IDENTIFIER = "process-reboot-cause"
2325

@@ -28,6 +30,7 @@ USER_ISSUED_REBOOT_CAUSE_REGEX ="User issued \'{}\' command [User: {}, Time: {}]
2830

2931
REBOOT_CAUSE_UNKNOWN = "Unknown"
3032
REBOOT_CAUSE_TABLE_NAME = "REBOOT_CAUSE"
33+
MAX_HISTORY_FILES = 10
3134

3235
REDIS_HOSTIP = "127.0.0.1"
3336
state_db = None
@@ -37,39 +40,52 @@ sonic_logger = syslogger.SysLogger(SYSLOG_IDENTIFIER)
3740

3841

3942
# ============================= Functions =============================
40-
def read_reboot_cause_files_and_save_state_db():
43+
def read_reboot_cause_files_and_save_to_db(device='npu'):
    """Load saved reboot-cause history files into redis and prune old ones.

    The newest MAX_HISTORY_FILES files (by modification time) found in the
    device's history directory are parsed as JSON, and their 'cause', 'time',
    'user' and 'comment' values are written to the database (missing values
    are stored as empty strings). Older files in the directory are deleted.

    Files that are not valid JSON, or that lack the field used to build the
    database key, are logged and skipped.

    Args:
        device: 'npu' (default) reads REBOOT_CAUSE_HISTORY_DIR and writes to
            STATE_DB on REDIS_HOSTIP under "REBOOT_CAUSE|<gen_time>".
            Any other value is treated as a module (DPU) name: files are read
            from /host/reboot-cause/module/<device>/history and written to
            CHASSIS_STATE_DB on redis_chassis.server under
            "REBOOT_CAUSE|<DEVICE>|<name>".
    """
    is_npu = device == 'npu'
    if is_npu:
        db = swsscommon.SonicV2Connector(host=REDIS_HOSTIP)
        db_name = db.STATE_DB
        history_dir = REBOOT_CAUSE_HISTORY_DIR
    else:
        db = swsscommon.SonicV2Connector(host="redis_chassis.server", port=CHASSIS_SERVER_PORT)
        db_name = db.CHASSIS_STATE_DB
        history_dir = os.path.join('/host/reboot-cause/module', device, 'history')

    # The caller only verifies the NPU history directory, so a module whose
    # directory was never created must not crash the whole script.
    if not os.path.exists(history_dir):
        sonic_logger.log_warning("Reboot-cause history directory {} does not exist".format(history_dir))
        return

    db.connect(db_name)

    # Newest first, ordered by modification time.
    history_files = sorted(
        (os.path.join(history_dir, entry) for entry in os.listdir(history_dir)),
        key=os.path.getmtime,
        reverse=True,
    )

    # The field that makes the database key unique differs per device type.
    key_field = 'gen_time' if is_npu else 'name'

    for cause_path in history_files[:MAX_HISTORY_FILES]:
        if not os.path.isfile(cause_path):
            continue

        with open(cause_path, "r") as cause_file:
            try:
                data = json.load(cause_file)
            except json.decoder.JSONDecodeError as je:
                sonic_logger.log_error("Unable to process reload cause file {}: {}".format(cause_path, je))
                continue

        if not isinstance(data, dict) or key_field not in data:
            sonic_logger.log_warning("Missing '{}' in reboot-cause file {}".format(key_field, cause_path))
            continue

        if is_npu:
            _hash = '{}|{}'.format(REBOOT_CAUSE_TABLE_NAME, data[key_field])
        else:
            _hash = f"{REBOOT_CAUSE_TABLE_NAME}|{device.upper()}|{data[key_field]}"

        for field in ('cause', 'time', 'user', 'comment'):
            db.set(db_name, _hash, field, data.get(field, ''))

    # Everything beyond the newest MAX_HISTORY_FILES entries is stale.
    for stale_path in history_files[MAX_HISTORY_FILES:]:
        try:
            os.remove(stale_path)
        except OSError as oe:
            sonic_logger.log_warning("Unable to remove old reboot-cause file {}: {}".format(stale_path, oe))
7188

72-
7389
def main():
7490
# Configure logger to log all messages INFO level and higher
7591
sonic_logger.set_min_log_priority(sonic_logger.DEFAULT_LOG_LEVEL)
@@ -96,9 +112,15 @@ def main():
96112
sonic_logger.log_info("Previous reboot cause: {}".format(previous_reboot_cause))
97113

98114
if os.path.exists(REBOOT_CAUSE_HISTORY_DIR):
99-
# Read the previous reboot cause from saved reboot-cause files and save the previous reboot cause upto 10 entry to the state db
100-
read_reboot_cause_files_and_save_state_db()
101-
115+
# Read the previous npu reboot cause from saved reboot-cause files
116+
# Save up to 10 previous npu reboot-cause entries to the state db
117+
read_reboot_cause_files_and_save_to_db('npu')
118+
# Read the previous dpu reboot cause from saved reboot-cause files
119+
# Save up to 10 previous dpu reboot-cause entries to the chassis state db
120+
if device_info.is_smartswitch():
121+
dpu_list = device_info.get_dpu_list()
122+
for dpu in dpu_list:
123+
read_reboot_cause_files_and_save_to_db(dpu)
102124

103125
if __name__ == "__main__":
104126
main()

tests/determine-reboot-cause_test.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import shutil
44
import pytest
5+
import json
56

67
from swsscommon import swsscommon
78
from sonic_py_common.general import load_module_from_source
@@ -33,6 +34,8 @@
3334
determine_reboot_cause_path = os.path.join(scripts_path, 'determine-reboot-cause')
3435
determine_reboot_cause = load_module_from_source('determine_reboot_cause', determine_reboot_cause_path)
3536

37+
# Get the function to create dpu dir
38+
check_and_create_dpu_dirs = determine_reboot_cause.check_and_create_dpu_dirs
3639

3740
PROC_CMDLINE_CONTENTS = """\
3841
BOOT_IMAGE=/image-20191130.52/boot/vmlinuz-4.9.0-11-2-amd64 root=/dev/sda4 rw console=tty0 console=ttyS1,9600n8 quiet net.ifnames=0 biosdevname=0 loop=image-20191130.52/fs.squashfs loopfstype=squashfs apparmor=1 security=apparmor varlog_size=4096 usbcore.autosuspend=-1 module_blacklist=gpio_ich SONIC_BOOT_TYPE=warm"""
@@ -73,6 +76,8 @@
7376
EXPECTED_KERNEL_PANIC_REBOOT_CAUSE_DICT = {'comment': '', 'gen_time': '2021_3_28_13_48_49', 'cause': 'Kernel Panic', 'user': 'N/A', 'time': 'Sun Mar 28 13:45:12 UTC 2021'}
7477

7578
REBOOT_CAUSE_DIR="host/reboot-cause/"
79+
PLATFORM_JSON_PATH = "/usr/share/sonic/device/test_platform/platform.json"
80+
REBOOT_CAUSE_MODULE_DIR = "/host/reboot-cause/module"
7681

7782
class TestDetermineRebootCause(object):
7883
def test_parse_warmfast_reboot_from_proc_cmdline(self):
@@ -206,3 +211,35 @@ def test_determine_reboot_cause_main_with_reboot_cause_dir(self):
206211
determine_reboot_cause.main()
207212
assert os.path.exists("host/reboot-cause/reboot-cause.txt") == True
208213
assert os.path.exists("host/reboot-cause/previous-reboot-cause.json") == True
214+
215+
def create_mock_platform_json(self, dpus):
    """Write a minimal platform.json at PLATFORM_JSON_PATH.

    The file contains a single "DPUS" key holding ``dpus``; the parent
    directory is created first if it is missing.
    """
    platform_dir = os.path.dirname(PLATFORM_JSON_PATH)
    os.makedirs(platform_dir, exist_ok=True)

    payload = {"DPUS": dpus}
    with open(PLATFORM_JSON_PATH, "w") as platform_file:
        json.dump(payload, platform_file)
220+
221+
@mock.patch('os.makedirs')
@mock.patch('builtins.open', new_callable=mock.mock_open)
@mock.patch('os.path.exists', return_value=False)
@mock.patch('sonic_py_common.device_info.is_smartswitch', return_value=True)
@mock.patch('sonic_py_common.device_info.get_dpu_list', return_value=["dpu0", "dpu1"])
def test_check_and_create_dpu_dirs(
    self,
    mock_get_dpu_list,
    mock_is_smartswitch,
    mock_exists,
    mock_open,
    mock_makedirs
):
    """With no directory present, each DPU gets its dir, history dir and seed file."""
    # Call the function under test; os.path.exists reports every path as missing
    check_and_create_dpu_dirs()

    for dpu in ("dpu0", "dpu1"):
        dpu_dir = os.path.join(REBOOT_CAUSE_MODULE_DIR, dpu)

        # Both the DPU directory and its history directory must be created
        mock_makedirs.assert_any_call(dpu_dir)
        mock_makedirs.assert_any_call(os.path.join(dpu_dir, "history"))

        # reboot-cause.txt must be opened for writing inside the DPU directory
        mock_open.assert_any_call(os.path.join(dpu_dir, "reboot-cause.txt"), 'w')

    # Nothing beyond the two directories and one seed file per DPU is created
    assert mock_makedirs.call_count == 4
    assert mock_open.call_count == 2

    # Every seed file receives the first-boot marker
    seed_handle = mock_open.return_value
    seed_handle.write.assert_any_call('First boot\n')
    assert seed_handle.write.call_count == 2

tests/process-reboot-cause_test.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import sys
2+
import os
3+
from unittest import TestCase
4+
from unittest.mock import patch, MagicMock, mock_open
5+
from io import StringIO
6+
from sonic_py_common.general import load_module_from_source
7+
8+
# Mock the connector
9+
from .mock_connector import MockConnector
10+
import swsscommon
11+
12+
# Mock the SonicV2Connector
13+
swsscommon.SonicV2Connector = MockConnector
14+
15+
# Define the path to the script and load it using the helper function
16+
test_path = os.path.dirname(os.path.abspath(__file__))
17+
modules_path = os.path.dirname(test_path)
18+
scripts_path = os.path.join(modules_path, "scripts")
19+
sys.path.insert(0, modules_path)
20+
21+
# Load the process-reboot-cause module using the helper function
22+
process_reboot_cause_path = os.path.join(scripts_path, "process-reboot-cause")
23+
process_reboot_cause = load_module_from_source('process_reboot_cause', process_reboot_cause_path)
24+
25+
# Now proceed with your test class and mocks
26+
class TestProcessRebootCause(TestCase):
    """Tests for scripts/process-reboot-cause with file and DB access mocked."""

    # History-file payload as written for the switch (keyed by 'gen_time')
    NPU_RECORD = '{"cause": "Non-Hardware", "user": "", "comment": "Switch rebooted DPU", "device": "DPU0", "time": "Fri Dec 13 01:12:36 AM UTC 2024", "gen_time": "2024_12_13_01_12_36"}'
    # History-file payload as written for a DPU (keyed by 'name')
    DPU_RECORD = '{"cause": "Non-Hardware", "user": "admin", "name": "2024_12_13_01_12_36", "comment": "Switch rebooted DPU", "device": "DPU0", "time": "Fri Dec 13 01:12:36 AM UTC 2024"}'
    # DPU payload lacking the 'name' field needed to build the DB key
    DPU_RECORD_NO_NAME = '{"cause": "Non-Hardware", "user": "admin", "comment": "Switch rebooted DPU", "device": "DPU0", "time": "Fri Dec 13 01:12:36 AM UTC 2024"}'
    # Truncated JSON that json.load() rejects
    MALFORMED_RECORD = '{"invalid_json": '

    def _start_patch(self, target, **kwargs):
        """Start a patch that is undone automatically when the test ends."""
        patcher = patch(target, **kwargs)
        mocked = patcher.start()
        self.addCleanup(patcher.stop)
        return mocked

    def _mock_environment(self, read_data, history_files, is_smartswitch=False, dpu_list=("dpu1",)):
        """Mock the filesystem, platform info and redis connector.

        Args:
            read_data: content returned by every mocked open().read().
            history_files: names reported by os.listdir().
            is_smartswitch: value returned by device_info.is_smartswitch().
            dpu_list: names returned by device_info.get_dpu_list().

        Returns:
            (mock_connector, mock_db): the patched SonicV2Connector class and
            the MagicMock instance it hands out.
        """
        # builtins.open and the os.path helpers are patched last so that
        # resolving the other patch targets still sees the real filesystem.
        self._start_patch("process_reboot_cause.device_info.get_dpu_list", return_value=list(dpu_list))
        self._start_patch("os.geteuid", return_value=0)
        self._start_patch("sys.stdout", new_callable=StringIO)
        self._start_patch("process_reboot_cause.device_info.is_smartswitch", return_value=is_smartswitch)
        mock_connector = self._start_patch("process_reboot_cause.swsscommon.SonicV2Connector")
        self.mock_remove = self._start_patch("os.remove")
        self._start_patch(
            "os.path.getmtime",
            side_effect=lambda path: 1700000000 if "file1.json" in path else 1700001000,
        )
        self._start_patch("os.path.exists", return_value=True)
        self._start_patch("os.path.isfile", return_value=True)
        self._start_patch("os.listdir", return_value=list(history_files))
        self._start_patch("builtins.open", new_callable=mock_open, read_data=read_data)

        mock_db = MagicMock()
        mock_connector.return_value = mock_db
        return mock_connector, mock_db

    def test_process_reboot_cause(self):
        """main() on a regular switch connects to the database."""
        _, mock_db = self._mock_environment(
            self.NPU_RECORD, ["file1.json", "file2.json"], is_smartswitch=False, dpu_list=["dpu1"]
        )

        # Simulate running the script
        with patch.object(sys, "argv", ["process-reboot-cause"]):
            process_reboot_cause.main()

        # Verify DB interactions
        mock_db.connect.assert_called()

    def test_invalid_json(self):
        """A malformed history file is skipped without raising or writing."""
        _, mock_db = self._mock_environment(
            self.MALFORMED_RECORD, ["file1.json"], is_smartswitch=False, dpu_list=["dpu1", "dpu2"]
        )

        # The decode error is handled inside the function, so this must not raise
        process_reboot_cause.read_reboot_cause_files_and_save_to_db('npu')

        mock_db.connect.assert_called_once_with(mock_db.STATE_DB)
        mock_db.set.assert_not_called()
        # A single file is below the history limit, so nothing is pruned
        self.mock_remove.assert_not_called()

    def test_read_reboot_cause_files_and_save_to_db(self):
        """A DPU history file is written to CHASSIS_STATE_DB under its name."""
        mock_connector, mock_db = self._mock_environment(
            self.DPU_RECORD, ["file1.json"], is_smartswitch=True, dpu_list=["dpu1"]
        )

        process_reboot_cause.read_reboot_cause_files_and_save_to_db('dpu1')

        mock_connector.assert_called_once_with(
            host="redis_chassis.server", port=process_reboot_cause.CHASSIS_SERVER_PORT
        )
        mock_db.connect.assert_called_once_with(mock_db.CHASSIS_STATE_DB)

        expected_key = "{}|DPU1|2024_12_13_01_12_36".format(process_reboot_cause.REBOOT_CAUSE_TABLE_NAME)
        expected_fields = {
            'cause': "Non-Hardware",
            'time': "Fri Dec 13 01:12:36 AM UTC 2024",
            'user': "admin",
            'comment': "Switch rebooted DPU",
        }
        for field, value in expected_fields.items():
            mock_db.set.assert_any_call(mock_db.CHASSIS_STATE_DB, expected_key, field, value)
        self.assertEqual(mock_db.set.call_count, len(expected_fields))

    def test_read_reboot_cause_files_name_not_in_data(self):
        """A DPU history file without 'name' is skipped and nothing is written."""
        _, mock_db = self._mock_environment(
            self.DPU_RECORD_NO_NAME, ["file1.json"], is_smartswitch=True, dpu_list=["dpu1"]
        )

        process_reboot_cause.read_reboot_cause_files_and_save_to_db('dpu1')

        mock_db.connect.assert_called_once_with(mock_db.CHASSIS_STATE_DB)
        mock_db.set.assert_not_called()
132+

0 commit comments

Comments
 (0)