Skip to content

Commit 5460c29

Browse files
authored
Improve reboot status tracking and add reboot count & method to status (#259)
Why I did it To provide more accurate and detailed tracking of reboot operations on the host. This includes: Fixing incorrect or missing values in the reboot_status_flag. Adding the ability to track how many times a reboot has been initiated (count). Recording which reboot method was used (method), which is critical for debugging and telemetry. Fixes: sonic-net/sonic-buildimage#22157 How I did it Enhanced the populate_reboot_status_flag() function to also track: -- count: Number of reboots initiated by the host service. -- method: Type of reboot (e.g., cold, warm, halt). Corrected how container status is evaluated using Docker SDK by checking attrs['State']['Running'] instead of relying on the status string. Updated all relevant call sites of populate_reboot_status_flag() to pass the correct parameters. Adjusted unit tests to validate updated logic and ensure coverage of new parameters.
1 parent e46e36e commit 5460c29

File tree

2 files changed

+47
-21
lines changed

2 files changed

+47
-21
lines changed

host_modules/reboot.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import psutil
99
from host_modules import host_service
1010
from utils.run_cmd import _run_command
11+
from enum import Enum
1112

1213
MOD_NAME = 'reboot'
1314
# Reboot method in reboot request
@@ -25,6 +26,12 @@
2526
EXECUTE_HALT_REBOOT_COMMAND = "sudo reboot -p"
2627
EXECUTE_WARM_REBOOT_COMMAND = "sudo warm-reboot"
2728

29+
class RebootStatus(Enum):
30+
STATUS_UNKNOWN = 0
31+
STATUS_SUCCESS = 1
32+
STATUS_RETRIABLE_FAILURE = 2
33+
STATUS_FAILURE = 3
34+
2835
logger = logging.getLogger(__name__)
2936

3037

@@ -38,16 +45,23 @@ def __init__(self, mod_name):
3845
self.lock = threading.Lock()
3946
# reboot_status_flag is used to keep track of reboot status on host
4047
self.reboot_status_flag = {}
48+
49+
# reboot count
50+
self.reboot_count = 0
51+
4152
# Populating with default value i.e., no active reboot
4253
self.populate_reboot_status_flag()
4354
super(Reboot, self).__init__(mod_name)
4455

45-
def populate_reboot_status_flag(self, active = False, when = 0, reason = ""):
56+
def populate_reboot_status_flag(self, active = False, when = 0, reason = "", method = "", status = RebootStatus.STATUS_UNKNOWN):
4657
"""Populates the reboot_status_flag with given input params"""
4758
self.lock.acquire()
4859
self.reboot_status_flag["active"] = active
4960
self.reboot_status_flag["when"] = when
5061
self.reboot_status_flag["reason"] = reason
62+
self.reboot_status_flag["count"] = self.reboot_count
63+
self.reboot_status_flag["method"] = method
64+
self.reboot_status_flag["status"] = status
5165
self.lock.release()
5266
return
5367

@@ -71,10 +85,10 @@ def is_container_running(self, container_name):
7185
"""Check if a given container is running using the Docker SDK."""
7286
try:
7387
client = docker.from_env()
74-
containers = client.containers.list(filters={"name": container_name})
88+
containers = client.containers.list(filters={"name": f"^{container_name}$"})
7589

7690
for container in containers:
77-
if container.name == container_name and container.status == "running":
91+
if container.name == container_name and container.attrs['State']['Running']:
7892
return True
7993
return False
8094
except Exception as e:
@@ -111,7 +125,7 @@ def execute_reboot(self, reboot_method):
111125

112126
rc, stdout, stderr = _run_command(command)
113127
if rc:
114-
self.populate_reboot_status_flag()
128+
self.populate_reboot_status_flag(False, int (time.time()), "Failed to execute reboot command", reboot_method, RebootStatus.STATUS_FAILURE)
115129
logger.error("%s: Reboot failed execution with stdout: %s, "
116130
"stderr: %s", MOD_NAME, stdout, stderr)
117131
return
@@ -128,29 +142,30 @@ def execute_reboot(self, reboot_method):
128142
logger.info("%s: Waiting until services are halted or timeout occurs", MOD_NAME)
129143
timeout = HALT_TIMEOUT
130144
start_time = time.monotonic()
145+
131146
while time.monotonic() - start_time < timeout:
132147
if not self.is_halt_command_running() and not self.is_container_running("pmon"):
133148
logger.info("%s: Halting the services is completed on the device", MOD_NAME)
149+
self.populate_reboot_status_flag(False, 0, "Halt reboot completed", reboot_method, RebootStatus.STATUS_SUCCESS)
134150
return
135151
time.sleep(5)
136152

137153
# Check if PMON container is still running after timeout
138154
if self.is_halt_command_running() or self.is_container_running("pmon"):
139155
#Halt reboot has failed, as pmon is still running.
156+
self.populate_reboot_status_flag(False, int(time.time()), "Halt reboot did not complete", reboot_method, RebootStatus.STATUS_FAILURE)
140157
logger.error("%s: HALT reboot failed: Services are still running", MOD_NAME)
141-
self.populate_reboot_status_flag()
142-
return
143158
else:
159+
self.populate_reboot_status_flag(False, 0, "Halt reboot completed", reboot_method, RebootStatus.STATUS_SUCCESS)
144160
logger.info("%s: Halting the services is completed on the device", MOD_NAME)
145-
161+
return
146162
else:
147163
time.sleep(REBOOT_TIMEOUT)
148164
# Conclude that the reboot has failed if we reach this point
149-
self.populate_reboot_status_flag()
165+
self.populate_reboot_status_flag(False, int(time.time()), "Reboot command failed to execute", reboot_method, RebootStatus.STATUS_FAILURE)
150166
return
151167

152168
@host_service.method(host_service.bus_name(MOD_NAME), in_signature='as', out_signature='is')
153-
154169
def issue_reboot(self, options):
155170
"""Initializes reboot thorugh RPC based on the reboot flag assigned.
156171
Issues reboot after performing the following steps sequentially:
@@ -162,6 +177,7 @@ def issue_reboot(self, options):
162177
logger.warning("%s: issue_reboot rpc called", MOD_NAME)
163178
self.lock.acquire()
164179
is_reboot_ongoing = self.reboot_status_flag["active"]
180+
self.reboot_count += 1
165181
self.lock.release()
166182
# Return without issuing the reboot if the previous reboot is ongoing
167183
if is_reboot_ongoing:
@@ -185,7 +201,7 @@ def issue_reboot(self, options):
185201
return err, errstr
186202

187203
# Sets reboot_status_flag to be in active state
188-
self.populate_reboot_status_flag(True, int(time.time()), reboot_request.get("message", ""))
204+
self.populate_reboot_status_flag(True, int(time.time()), reboot_request.get("message", ""), reboot_request["method"], RebootStatus.STATUS_UNKNOWN)
189205

190206
# Issue reboot in a new thread and reset the reboot_status_flag if the reboot fails
191207
try:

tests/host_modules/reboot_test.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import json
55
import sys
66
import os
7+
import time
78
import pytest
89
import logging
910

@@ -19,6 +20,8 @@
1920
host_modules_path = os.path.join(sonic_host_service_path, "../host_modules")
2021
sys.path.insert(0, sonic_host_service_path)
2122

23+
from host_modules.reboot import RebootStatus
24+
2225
TIME = 1617811205
2326
TEST_ACTIVE_RESPONSE_DATA = "{\"active\": true, \"when\": 1617811205, \"reason\": \"testing reboot response\"}"
2427
TEST_INACTIVE_RESPONSE_DATA = "{\"active\": false, \"when\": 0, \"reason\": \"\"}"
@@ -54,6 +57,9 @@ def test_populate_reboot_status_flag(self):
5457
assert self.reboot_module.reboot_status_flag["active"] == False
5558
assert self.reboot_module.reboot_status_flag["when"] == 0
5659
assert self.reboot_module.reboot_status_flag["reason"] == ""
60+
assert self.reboot_module.reboot_status_flag["count"] == 0
61+
assert self.reboot_module.reboot_status_flag["method"] == ""
62+
assert self.reboot_module.reboot_status_flag["status"] == RebootStatus.STATUS_UNKNOWN
5763

5864
def test_validate_reboot_request_success_cold_boot_enum_method(self):
5965
reboot_request = {"method": REBOOT_METHOD_COLD_BOOT_ENUM, "reason": "test reboot request reason"}
@@ -115,12 +121,12 @@ def test_is_container_running_success(self):
115121
mock_docker.return_value = mock_client
116122
mock_container = mock.Mock()
117123
mock_container.name = "pmon"
118-
mock_container.status = "running"
124+
mock_container.attrs = {'State': {'Running': True}}
119125
mock_client.containers.list.return_value = [mock_container]
120126

121127
result = self.reboot_module.is_container_running("pmon")
122128
assert result is True
123-
mock_client.containers.list.assert_called_once_with(filters={"name": "pmon"})
129+
mock_client.containers.list.assert_called_once_with(filters={"name": "^pmon$"})
124130

125131
def test_is_container_running_failure(self):
126132
with mock.patch("docker.from_env") as mock_docker:
@@ -130,7 +136,7 @@ def test_is_container_running_failure(self):
130136

131137
result = self.reboot_module.is_container_running("pmon")
132138
assert result is False
133-
mock_client.containers.list.assert_called_once_with(filters={"name": "pmon"})
139+
mock_client.containers.list.assert_called_once_with(filters={"name": "^pmon$"})
134140

135141
def test_is_container_running_exception(self, caplog):
136142
with mock.patch("docker.from_env", side_effect=Exception("Docker error")) as mock_docker, \
@@ -176,7 +182,7 @@ def test_execute_reboot_success(self):
176182
self.reboot_module.execute_reboot("WARM")
177183
mock_run_command.assert_called_once_with("sudo warm-reboot")
178184
mock_sleep.assert_called_once_with(260)
179-
mock_populate_reboot_status_flag.assert_called_once_with()
185+
mock_populate_reboot_status_flag.assert_called_once_with(False, int(time.time()), "Reboot command failed to execute", 'WARM', RebootStatus.STATUS_FAILURE)
180186

181187
def test_execute_reboot_fail_unknown_reboot(self, caplog):
182188
with caplog.at_level(logging.ERROR):
@@ -196,7 +202,7 @@ def test_execute_reboot_fail_issue_reboot_command_cold_boot(self, caplog):
196202
"stdout: ['stdout: execute cold reboot'], stderr: "
197203
"['stderror: execute cold reboot']")
198204
assert caplog.records[0].message == msg
199-
mock_populate_reboot_status_flag.assert_called_once_with()
205+
mock_populate_reboot_status_flag.assert_called_once_with(False, int (time.time()), "Failed to execute reboot command", 1, RebootStatus.STATUS_FAILURE)
200206

201207
def test_execute_reboot_fail_issue_reboot_command_halt(self, caplog):
202208
with (
@@ -210,7 +216,7 @@ def test_execute_reboot_fail_issue_reboot_command_halt(self, caplog):
210216
"stdout: ['stdout: execute halt reboot'], stderr: "
211217
"['stderror: execute halt reboot']")
212218
assert caplog.records[0].message == msg
213-
mock_populate_reboot_status_flag.assert_called_once_with()
219+
mock_populate_reboot_status_flag.assert_called_once_with(False, int (time.time()), "Failed to execute reboot command", 3, RebootStatus.STATUS_FAILURE)
214220

215221
def test_execute_reboot_success_halt(self):
216222
with (
@@ -225,7 +231,7 @@ def test_execute_reboot_success_halt(self):
225231
mock_run_command.assert_called_once_with("sudo reboot -p")
226232
mock_is_halt_command_running.assert_called()
227233
mock_is_container_running.assert_called_with("pmon")
228-
mock_populate_reboot_status_flag.assert_not_called()
234+
mock_populate_reboot_status_flag.assert_called_once_with(False, 0, 'Halt reboot completed', 3, RebootStatus.STATUS_SUCCESS)
229235

230236
def test_execute_reboot_fail_halt_timeout(self, caplog):
231237
with (
@@ -242,7 +248,7 @@ def test_execute_reboot_fail_halt_timeout(self, caplog):
242248
mock_sleep.assert_called_with(5)
243249
mock_is_halt_command_running.assert_called()
244250
assert any("HALT reboot failed: Services are still running" in record.message for record in caplog.records)
245-
mock_populate_reboot_status_flag.assert_called_once_with()
251+
mock_populate_reboot_status_flag.assert_called_once_with(False, int(time.time()), 'Halt reboot did not complete', 3, RebootStatus.STATUS_FAILURE)
246252

247253
def test_execute_reboot_fail_issue_reboot_command_warm(self, caplog):
248254
with (
@@ -256,7 +262,7 @@ def test_execute_reboot_fail_issue_reboot_command_warm(self, caplog):
256262
"stdout: ['stdout: execute WARM reboot'], stderr: "
257263
"['stderror: execute WARM reboot']")
258264
assert caplog.records[0].message == msg
259-
mock_populate_reboot_status_flag.assert_called_once_with()
265+
mock_populate_reboot_status_flag.assert_called_once_with(False, int (time.time()), "Failed to execute reboot command", 'WARM', RebootStatus.STATUS_FAILURE)
260266

261267
def test_issue_reboot_success_cold_boot(self):
262268
with (
@@ -355,22 +361,26 @@ def test_issue_reboot_fail_issue_reboot_thread(self):
355361

356362
def test_get_reboot_status_active(self):
357363
MSG="testing reboot response"
358-
self.reboot_module.populate_reboot_status_flag(True, TIME, MSG)
364+
self.reboot_module.populate_reboot_status_flag(True, TIME, MSG, REBOOT_METHOD_COLD_BOOT_ENUM, RebootStatus.STATUS_SUCCESS.name)
359365
result = self.reboot_module.get_reboot_status()
360366
assert result[0] == 0
361367
response_data = json.loads(result[1])
362368
assert response_data["active"] == True
363369
assert response_data["when"] == TIME
364370
assert response_data["reason"] == MSG
371+
assert response_data["method"] == REBOOT_METHOD_COLD_BOOT_ENUM
372+
assert response_data["status"] == RebootStatus.STATUS_SUCCESS.name
365373

366374
def test_get_reboot_status_inactive(self):
367-
self.reboot_module.populate_reboot_status_flag(False, 0, "")
375+
self.reboot_module.populate_reboot_status_flag(False, 0, "", REBOOT_METHOD_COLD_BOOT_ENUM, RebootStatus.STATUS_SUCCESS.name)
368376
result = self.reboot_module.get_reboot_status()
369377
assert result[0] == 0
370378
response_data = json.loads(result[1])
371379
assert response_data["active"] == False
372380
assert response_data["when"] == 0
373381
assert response_data["reason"] == ""
382+
assert response_data["method"] == REBOOT_METHOD_COLD_BOOT_ENUM
383+
assert response_data["status"] == RebootStatus.STATUS_SUCCESS.name
374384

375385
# assert result[1] == TEST_INACTIVE_RESPONSE_DATA
376386

0 commit comments

Comments
 (0)