Skip to content

Commit ac7064c

Browse files
Merge branch 'master' into healthz_BE
2 parents 05d6ff3 + 286827f commit ac7064c

15 files changed

+939
-30
lines changed

host_modules/debug_service.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import pty
2+
import subprocess
3+
import os
4+
import select
5+
import errno
6+
import logging
7+
8+
from concurrent.futures import ThreadPoolExecutor, TimeoutError
9+
from threading import Event
10+
11+
from host_modules import host_service
12+
13+
# Timeout should be slightly less than default DBUS timeout (25 sec)
14+
TIMEOUT = 20
15+
MOD_NAME = 'DebugExecutor'
16+
INTERFACE = host_service.bus_name(MOD_NAME)
17+
logger = logging.getLogger(__name__)
18+
19+
20+
class DebugExecutor(host_service.HostModule):
    """
    Debug container command handler.

    Runs a command received over DBus on the device and streams its output
    back to the client through the ``Stdout`` / ``Stderr`` signals.

    NOTE(review): commands are described as having been validated against a
    whitelist before reaching this module; no validation is performed in this
    class itself -- confirm that the caller does it.
    """

    # Longest time (seconds) select() may block before the cancellation event
    # is checked again. Without a bound, a silent command could never be
    # cancelled and would occupy the single worker thread indefinitely.
    _POLL_INTERVAL = 0.5
    # Maximum number of bytes fetched per os.read() call.
    _READ_SIZE = 4096
    # Seconds to wait after SIGTERM before escalating to SIGKILL.
    _TERMINATE_GRACE = 5

    def __init__(self, mod_name):
        super().__init__(mod_name)
        # A single worker: only one command may run at a time.
        self.executor = ThreadPoolExecutor(max_workers=1)

    def _stop_process(self, p, argv):
        """
        Make sure the subprocess has exited and return its exit code.

        A process that is still running is sent SIGTERM, then SIGKILL if it has
        not exited within ``_TERMINATE_GRACE`` seconds.
        """
        rc = p.poll()
        if rc is not None:
            return rc

        logger.info(f"Terminating subprocess (PID: {p.pid}) for command '{argv}'...")
        p.terminate()
        try:
            rc = p.wait(timeout=self._TERMINATE_GRACE)
            logger.info(f"Subprocess for '{argv}' terminated gracefully with code: {rc}")
        except subprocess.TimeoutExpired:
            logger.warning(f"Process for '{argv}' did not terminate gracefully. Forcing kill...")
            p.kill()
            rc = p.wait()
            logger.info(f"Subprocess for '{argv}' was forcefully killed, exited with code: {rc}")
        return rc

    def _run_and_stream(self, argv, cancellation_event):
        """
        Run a command and stream its stdout/stderr to the requesting client.

        stdin/stdout of the command are attached to a PTY, stderr to a pipe.
        Output is forwarded through the ``Stdout`` / ``Stderr`` signals until
        both descriptors are closed or ``cancellation_event`` is set.

        Args:
            argv: Command and arguments, passed to ``subprocess.Popen``.
            cancellation_event: ``threading.Event`` set by the caller when the
                result is no longer wanted.

        Returns:
            The exit code of the subprocess, or ``None`` when the request was
            cancelled before the subprocess was started.

        Raises:
            Whatever ``subprocess.Popen`` raises when the command cannot be
            started, and any unexpected ``OSError`` while reading output.
        """
        # Local import so that this block does not depend on a new file-level import.
        import codecs

        # The request may already have timed out while waiting in the queue.
        if cancellation_event.is_set():
            return None

        rc = None
        p = None
        master_fd, slave_fd = pty.openpty()
        try:
            # Populate an environment for interactive commands (i.e. 'top')
            env = os.environ.copy()
            env['TERM'] = 'xterm'

            try:
                p = subprocess.Popen(
                    argv,
                    stdin=slave_fd,
                    stdout=slave_fd,
                    stderr=subprocess.PIPE,
                    close_fds=True,
                    bufsize=0,
                    universal_newlines=False,
                    env=env,
                )
            finally:
                # The parent never uses the slave end; close it even when
                # Popen fails so the descriptor is not leaked.
                os.close(slave_fd)

            if p.stderr is None:
                raise Exception("Could not open pipe for stderr")

            stderr_fd = p.stderr.fileno()
            fds = [master_fd, stderr_fd]

            # Incremental decoders keep a multi-byte character that is split
            # across two reads instead of silently dropping it.
            make_decoder = codecs.getincrementaldecoder('utf-8')
            out_decoder = make_decoder(errors='ignore')
            err_decoder = make_decoder(errors='ignore')

            while fds:
                ready, _, _ = select.select(fds, [], [], self._POLL_INTERVAL)

                # Terminate this process if calling thread exits
                if cancellation_event.is_set():
                    break

                if master_fd in ready:
                    # Master FD is a PTY, will throw EIO when the slave side is closed
                    try:
                        data = os.read(master_fd, self._READ_SIZE)
                    except OSError as e:
                        if e.errno != errno.EIO:
                            raise
                        data = b''

                    if data:
                        text = out_decoder.decode(data)
                        if text:
                            self.Stdout(text)
                    else:
                        fds.remove(master_fd)

                if stderr_fd in ready:
                    # Stderr FD is a normal fd, will be empty when closed
                    data = os.read(stderr_fd, self._READ_SIZE)
                    if data:
                        text = err_decoder.decode(data)
                        if text:
                            self.Stderr(text)
                    else:
                        fds.remove(stderr_fd)
        finally:
            os.close(master_fd)
            if p is not None:
                if p.stderr is not None:
                    # Close through the file object that owns the descriptor;
                    # closing the raw fd would leave the object to close the
                    # same number a second time later.
                    p.stderr.close()
                rc = self._stop_process(p, argv)

        return rc

    @host_service.signal(INTERFACE, signature='s')
    def Stdout(self, data):
        """
        Signal carrying a chunk of decoded stdout (PTY) output of the running command.
        """
        pass

    @host_service.signal(INTERFACE, signature='s')
    def Stderr(self, data):
        """
        Signal carrying a chunk of decoded stderr output of the running command.
        """
        pass

    @host_service.method(INTERFACE, in_signature='as', out_signature='is')
    def RunCommand(self, argv):
        """
        DBus endpoint - receives a command, and streams the response data back to the client.

        The command is submitted to a single-worker thread pool, so only one
        command executes at a time. The call waits at most TIMEOUT seconds,
        counted from submission (time spent queued behind another command is
        included), so that it returns before the default DBUS timeout.

        On timeout or error the cancellation event is set so that a running
        command is stopped, and a request that has not started yet is removed
        from the queue.

        Returns a tuple, consisting of (int_return_code, string_details)
        """
        logger.info(f"Running command: '{argv}'")
        cancellation_event = Event()
        future = self.executor.submit(self._run_and_stream, argv, cancellation_event)
        try:
            rc = future.result(timeout=TIMEOUT)
        except TimeoutError:
            err_msg = f"TimeoutError: Command '{argv}' took longer than {TIMEOUT} sec to complete"
            logger.error(err_msg)

            cancellation_event.set()
            # No effect when the command is already running; drops it when it
            # is still queued so it is not executed after the client gave up.
            future.cancel()

            return (errno.ETIMEDOUT, err_msg)
        except Exception as e:
            exception_type = type(e).__name__
            err_details = str(e) if str(e) else 'No details within error message'

            err_msg = f"{exception_type}: Command '{argv}' caused exception to be thrown: {err_details}"
            logger.error(err_msg)

            cancellation_event.set()

            return (errno.EIO, err_msg)

        logger.info(f"Command '{argv}' exited with code: {rc}")
        return (rc, f"Command exited with {rc}")

host_modules/gnoi_reset.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""gNOI reset module which performs factory reset."""
2+
3+
import json
4+
import logging
5+
import threading
6+
import time
7+
from host_modules import host_service
8+
from host_modules.reboot import Reboot
9+
10+
MOD_NAME = "gnoi_reset"
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
class GnoiReset(host_service.HostModule):
    """DBus endpoint that executes the factory reset and returns the reset status and response."""

    def __init__(self, mod_name):
        # Guards updates of reset_response.
        self.lock = threading.Lock()
        self.is_reset_ongoing = False
        # Normalized options of the most recent request.
        self.reset_request = {}
        # Full response state of the most recent request.
        self.reset_response = {}
        super(GnoiReset, self).__init__(mod_name)

    def populate_reset_response(
        self,
        reset_success=True,
        factory_os_unsupported=False,
        zero_fill_unsupported=False,
        detail="",
    ) -> tuple[int, str]:
        """Populate the factory reset response.

        Stores the full response (including the error category flag) in
        ``self.reset_response`` and builds the JSON document handed back to
        the caller, which carries only the ``detail`` text on error.

        Args:
          reset_success: Whether the reset request succeeded.
          factory_os_unsupported: On error, mark the factory OS as unsupported.
          zero_fill_unsupported: On error, mark zero fill as unsupported (only
            considered when ``factory_os_unsupported`` is false).
          detail: Human readable error detail.

        Returns:
          A tuple of (0, JSON encoded response).
        """
        with self.lock:
            self.reset_response = {}
            response = {}
            if reset_success:
                self.reset_response["reset_success"] = {}
                response["reset_success"] = {}
            else:
                self.reset_response["reset_error"] = {}
                response["reset_error"] = {}
                if factory_os_unsupported:
                    self.reset_response["reset_error"]["factory_os_unsupported"] = True
                elif zero_fill_unsupported:
                    self.reset_response["reset_error"]["zero_fill_unsupported"] = True
                else:
                    self.reset_response["reset_error"]["other"] = True
                response["reset_error"]["detail"] = detail
            response_data = json.dumps(response)
        return 0, response_data

    def _check_reboot_in_progress(self) -> int:
        """Return 1 if a reset is already in progress, otherwise 0."""
        return 1 if self.is_reset_ongoing else 0

    def _parse_arguments(self, options) -> tuple[int, str]:
        """Parses and validates the given arguments into a reset request.

        Every path currently ends in an error response, because
        FactoryReset.Start is not supported yet.

        Returns:
          The (rc, JSON response) tuple produced by populate_reset_response.
        """
        parse_error_detail = (
            "Failed to parse json formatted factory reset request into python dict."
        )
        try:
            raw = json.loads(options)
        except (ValueError, TypeError) as e:
            # TypeError: json.loads was handed something that is not str/bytes.
            logger.error("[%s]:Failed to parse factory reset request: %s", MOD_NAME, str(e))
            return self.populate_reset_response(
                reset_success=False,
                detail=parse_error_detail,
            )

        if not isinstance(raw, dict):
            # Valid JSON, but not an object (e.g. a list or a number).
            logger.error(
                "[%s]:Failed to parse factory reset request: %s",
                MOD_NAME,
                "top-level JSON value is not an object",
            )
            return self.populate_reset_response(
                reset_success=False,
                detail=parse_error_detail,
            )

        # Normalize: support both camelCase and snake_case
        self.reset_request = {
            "factoryOs": raw.get("factoryOs", raw.get("factory_os", False)),
            "zeroFill": raw.get("zeroFill", raw.get("zero_fill", False)),
            "retainCerts": raw.get("retainCerts", raw.get("retain_certs", False)),
        }

        # Reject the request if zero_fill is set together with factory_os.
        if self.reset_request["factoryOs"] and self.reset_request["zeroFill"]:
            return self.populate_reset_response(
                reset_success=False,
                zero_fill_unsupported=True,
                detail="zero_fill operation is currently unsupported.",
            )

        # Issue a warning if retain_certs is set.
        if self.reset_request["factoryOs"] and self.reset_request["retainCerts"]:
            logger.warning("%s: retain_certs is currently ignored.", MOD_NAME)

        # With or without factoryOs, the method itself is currently unsupported.
        return self.populate_reset_response(
            reset_success=False,
            detail="Method FactoryReset.Start is currently unsupported."
        )

    def _execute_reboot(self) -> int:
        """Start a cold reboot in a background thread.

        Returns:
          0 when the thread was started, 1 when starting it raised RuntimeError.
        """
        try:
            r = Reboot("reboot")
            t = threading.Thread(target=r.execute_reboot, args=("COLD",))
            t.start()
        except RuntimeError:
            self.is_reset_ongoing = False
            return 1

        return 0

    # NOTE(review): in_signature "as" declares an array of strings, while the
    # payload is parsed with json.loads -- confirm the intended signature.
    @host_service.method(
        host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is"
    )
    def issue_reset(self, options) -> tuple[int, str]:
        """Issues the factory reset.

        Returns:
          A tuple of (rc, JSON encoded response).
        """
        logger.info("Issuing reset from back end")

        rc, resp = self._parse_arguments(options)
        # _parse_arguments currently always reports rc == 0, so every request
        # returns here and the steps below are not reached.
        if not rc:
            return rc, resp

        if self._check_reboot_in_progress():
            return self.populate_reset_response(reset_success=False, detail="Previous reset is ongoing.")

        self.is_reset_ongoing = True

        # _execute_reboot returns a plain int status, not a tuple.
        rc = self._execute_reboot()
        if rc:
            return self.populate_reset_response(reset_success=False, detail="Failed to start thread to execute reboot.")

        # Default fallback if no valid options triggered any action
        return self.populate_reset_response(
            reset_success=False,
            detail="Method FactoryReset.Start is currently unsupported."
        )
143+
144+
145+
def register():
    """Return the GnoiReset class together with its module name."""
    module_entry = (GnoiReset, MOD_NAME)
    return module_entry

host_modules/host_service.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def bus_path(mod_name):
1515
return BUS_PATH + '/' + mod_name
1616

1717
method = dbus.service.method
18+
signal = dbus.service.signal
1819

1920
class HostService(dbus.service.Object):
2021
"""Service class for top level DBus endpoint"""

host_modules/reboot.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def populate_reboot_status_flag(self, active = False, when = 0, reason = "", met
6161
self.reboot_status_flag["reason"] = reason
6262
self.reboot_status_flag["count"] = self.reboot_count
6363
self.reboot_status_flag["method"] = method
64-
self.reboot_status_flag["status"] = status
64+
self.reboot_status_flag["status"] = status.value
6565
self.lock.release()
6666
return
6767

scripts/determine-reboot-cause

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ REBOOT_TYPE_KEXEC_PATTERN_EXPRESS = ".*SONIC_BOOT_TYPE=(express).*"
4444
REBOOT_CAUSE_UNKNOWN = "Unknown"
4545
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"
4646
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
47+
REBOOT_CAUSE_HEARTBEAT_LOSS = "Heartbeat with the Supervisor card lost"
48+
REBOOT_CAUSE_KERNEL_PANIC = "Kernel Panic"
4749

4850
# Global logger class instance
4951
sonic_logger = syslogger.SysLogger(SYSLOG_IDENTIFIER)
@@ -165,6 +167,8 @@ def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
165167
if match is not None:
166168
reboot_cause_dict['cause'] = "Kernel Panic"
167169
reboot_cause_dict['time'] = match.group(1)
170+
elif re.search(r'Heartbeat with the Supervisor card lost', previous_reboot_cause):
171+
reboot_cause_dict['cause'] = 'Heartbeat with the Supervisor card lost'
168172

169173
return reboot_cause_dict
170174

@@ -185,7 +189,7 @@ def determine_reboot_cause():
185189
software_reboot_cause = find_software_reboot_cause()
186190

187191
# The main decision logic of the reboot cause:
188-
# If there is a valid hardware reboot cause indicated by platform API,
192+
# If software reboot cause is not Kernel Panic or heartbeat loss and there is a valid hardware reboot cause indicated by platform API,
189193
# check the software reboot cause to add additional reboot cause.
190194
# If there is a reboot cause indicated by /proc/cmdline, and/or warmreboot/fastreboot/softreboot
191195
# the software_reboot_cause which is the content of /hosts/reboot-cause/reboot-cause.txt
@@ -194,7 +198,9 @@ def determine_reboot_cause():
194198
# the software_reboot_cause will be treated as the reboot cause if it's not unknown
195199
# otherwise, the cmdline_reboot_cause will be treated as the reboot cause if it's not none
196200
# Else the software_reboot_cause will be treated as the reboot cause
197-
if REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause:
201+
if (REBOOT_CAUSE_KERNEL_PANIC not in software_reboot_cause and
202+
REBOOT_CAUSE_HEARTBEAT_LOSS not in software_reboot_cause and
203+
REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause):
198204
previous_reboot_cause = hardware_reboot_cause
199205
# Check if any software reboot was issued before this hardware reboot happened
200206
if software_reboot_cause is not REBOOT_CAUSE_UNKNOWN:

0 commit comments

Comments
 (0)