Skip to content

Commit 48c8300

Browse files
Merge branch 'master' into healthz_BE
2 parents 5c6567a + 0028383 commit 48c8300

14 files changed

+906
-19
lines changed

host_modules/debug_service.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import pty
2+
import subprocess
3+
import os
4+
import select
5+
import errno
6+
import logging
7+
8+
from concurrent.futures import ThreadPoolExecutor, TimeoutError
9+
from threading import Event
10+
11+
from host_modules import host_service
12+
13+
# Timeout should be slightly less than default DBUS timeout (25 sec)
14+
TIMEOUT = 20
15+
MOD_NAME = 'DebugExecutor'
16+
INTERFACE = host_service.bus_name(MOD_NAME)
17+
logger = logging.getLogger(__name__)
18+
19+
20+
class DebugExecutor(host_service.HostModule):
    """
    Debug container command handler.

    Runs a command on the device on behalf of the debug container and streams
    its output back to the caller through the ``Stdout`` / ``Stderr`` DBus
    signals.

    NOTE(review): no whitelist validation is performed in this class; the
    command is presumably validated by the caller before it reaches
    ``RunCommand`` -- confirm.
    """

    # Upper bound (seconds) on how long the streaming loop blocks in select()
    # before re-checking the cancellation event. Without it, a command that
    # produces no output could never be cancelled.
    _POLL_INTERVAL = 0.5

    def __init__(self, mod_name):
        super().__init__(mod_name)
        # A single worker serialises command execution: at most one command
        # runs at a time, later requests queue behind it.
        self.executor = ThreadPoolExecutor(max_workers=1)

    def _run_and_stream(self, argv, cancellation_event):
        """
        Run a command and stream its stdout/stderr to the requesting client.

        stdin/stdout of the child are attached to a PTY so interactive
        commands (e.g. 'top') behave as on a terminal; stderr uses a pipe.

        Args:
            argv: Command and arguments, passed to ``subprocess.Popen``.
            cancellation_event: ``threading.Event``; once set, streaming stops
                and the child process is terminated.

        Returns:
            The exit code of the child process, or ``errno.ECANCELED`` if the
            request was cancelled before the command was started.
        """
        # The caller may have timed out while this job was still queued
        # behind another command; do not start a command nobody waits for.
        if cancellation_event.is_set():
            logger.info("Command '%s' cancelled before it was started", argv)
            return errno.ECANCELED

        master_fd, slave_fd = pty.openpty()

        # Populate an environment for interactive commands (i.e. 'top')
        env = os.environ.copy()
        env['TERM'] = 'xterm'

        try:
            p = subprocess.Popen(
                argv,
                stdin=slave_fd,
                stdout=slave_fd,
                stderr=subprocess.PIPE,
                close_fds=True,
                bufsize=0,
                universal_newlines=False,
                env=env,
            )
        except Exception:
            # Popen failed (e.g. command not found): do not leak the PTY master
            os.close(master_fd)
            raise
        finally:
            # The child holds its own copy of the slave end; ours must be
            # closed so that the master sees EOF/EIO when the child exits.
            os.close(slave_fd)

        try:
            if p.stderr is None:
                raise Exception("Could not open pipe for stderr")

            stderr_fd = p.stderr.fileno()
            fds = [master_fd, stderr_fd]

            while fds:
                ready, _, _ = select.select(fds, [], [], self._POLL_INTERVAL)

                # Terminate this process if calling thread exits
                if cancellation_event.is_set():
                    break

                if master_fd in ready:
                    # Master FD is a PTY: reading raises EIO once the slave
                    # side is closed; an empty read is treated the same way.
                    try:
                        data = os.read(master_fd, 4096)
                    except OSError as e:
                        if e.errno != errno.EIO:
                            raise
                        data = b''
                    if data:
                        self.Stdout(data.decode(errors='ignore'))
                    else:
                        fds.remove(master_fd)

                if stderr_fd in ready:
                    # Stderr FD is a normal fd, will be empty when closed
                    data = os.read(stderr_fd, 4096)
                    if data:
                        self.Stderr(data.decode(errors='ignore'))
                    else:
                        fds.remove(stderr_fd)
        finally:
            os.close(master_fd)
            # Close through the file object that owns the descriptor; closing
            # the raw fd would let the object close it a second time later.
            if p.stderr is not None:
                p.stderr.close()
            rc = self._stop_process(p, argv)

        return rc

    def _stop_process(self, p, argv):
        """Ensure the child has exited (terminate, then kill) and return its exit code."""
        # Check if the process is still running before trying to stop it
        rc = p.poll()
        if rc is not None:
            return rc

        logger.info("Terminating subprocess (PID: %s) for command '%s'...", p.pid, argv)
        p.terminate()
        try:
            rc = p.wait(timeout=5)
            logger.info("Subprocess for '%s' terminated gracefully with code: %s", argv, rc)
        except subprocess.TimeoutExpired:
            logger.warning("Process for '%s' did not terminate gracefully. Forcing kill...", argv)
            p.kill()
            rc = p.wait()
            logger.info("Subprocess for '%s' was forcefully killed, exited with code: %s", argv, rc)
        return rc

    @host_service.signal(INTERFACE, signature='s')
    def Stdout(self, data):
        """
        Signal carrying a decoded chunk of the running command's stdout.
        """
        pass

    @host_service.signal(INTERFACE, signature='s')
    def Stderr(self, data):
        """
        Signal carrying a decoded chunk of the running command's stderr.
        """
        pass

    @host_service.method(INTERFACE, in_signature='as', out_signature='is')
    def RunCommand(self, argv):
        """
        DBus endpoint - receives a command, and streams the response data back to the client.

        The command is submitted to a single-worker thread pool, so only one
        command executes on the device at a time. This call waits at most
        TIMEOUT seconds for the result, measured from submission (time spent
        queued behind another command counts), so that it returns before the
        default DBUS timeout is reached. On timeout or error the command is
        cancelled and its process is stopped by the worker thread.

        Returns a tuple, consisting of (int_return_code, string_details)
        """
        logger.info("Running command: '%s'", argv)
        cancellation_event = Event()
        future = self.executor.submit(self._run_and_stream, argv, cancellation_event)
        try:
            rc = future.result(timeout=TIMEOUT)
            logger.info("Command '%s' exited with code: %s", argv, rc)

            return (rc, f"Command exited with {rc}")
        except TimeoutError:
            err_msg = f"TimeoutError: Command '{argv}' took longer than {TIMEOUT} sec to complete"
            logger.error(err_msg)

            cancellation_event.set()
            # Drops the job if it is still queued; no effect once it is running
            future.cancel()

            return (errno.ETIMEDOUT, err_msg)
        except Exception as e:
            exception_type = type(e).__name__
            err_details = str(e) if str(e) else 'No details within error message'

            err_msg = f"{exception_type}: Command '{argv}' caused exception to be thrown: {err_details}"
            logger.error(err_msg)

            cancellation_event.set()

            return (errno.EIO, err_msg)

host_modules/gnoi_reset.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""gNOI reset module which performs factory reset."""
2+
3+
import json
4+
import logging
5+
import threading
6+
import time
7+
from host_modules import host_service
8+
from host_modules.reboot import Reboot
9+
10+
MOD_NAME = "gnoi_reset"
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
class GnoiReset(host_service.HostModule):
    """DBus endpoint that executes the factory reset and returns the reset status and response."""

    def __init__(self, mod_name):
        # Guards updates of self.reset_response
        self.lock = threading.Lock()
        self.is_reset_ongoing = False
        # Last normalized request, keyed by "factoryOs"/"zeroFill"/"retainCerts"
        self.reset_request = {}
        self.reset_response = {}
        super(GnoiReset, self).__init__(mod_name)

    def populate_reset_response(
        self,
        reset_success=True,
        factory_os_unsupported=False,
        zero_fill_unsupported=False,
        detail="",
    ) -> tuple[int, str]:
        """Populate the factory reset response.

        Records the outcome in ``self.reset_response`` (for an error, one of
        the ``factory_os_unsupported`` / ``zero_fill_unsupported`` / ``other``
        flags) and builds the JSON string returned to the caller, which holds
        either an empty ``reset_success`` object or a ``reset_error`` object
        carrying only ``detail``.

        Returns:
            A tuple ``(0, json_response)``; the code is always 0.
        """
        with self.lock:
            self.reset_response = {}
            response = {}
            if reset_success:
                self.reset_response["reset_success"] = {}
                response["reset_success"] = {}
            else:
                self.reset_response["reset_error"] = {}
                response["reset_error"] = {}
                if factory_os_unsupported:
                    self.reset_response["reset_error"]["factory_os_unsupported"] = True
                elif zero_fill_unsupported:
                    self.reset_response["reset_error"]["zero_fill_unsupported"] = True
                else:
                    self.reset_response["reset_error"]["other"] = True
                response["reset_error"]["detail"] = detail
        response_data = json.dumps(response)
        return 0, response_data

    def _check_reboot_in_progress(self) -> int:
        """Checks if reboot is already in progress (1 if so, otherwise 0)."""
        return 1 if self.is_reset_ongoing else 0

    def _parse_arguments(self, options) -> tuple[int, str]:
        """Parses and validates the given arguments into a reset request.

        ``options`` is expected to be a JSON object encoded as a string; both
        camelCase and snake_case keys are accepted. Every outcome is currently
        an error response, because FactoryReset.Start is unsupported.

        Returns:
            The ``(code, json_response)`` tuple from ``populate_reset_response``.
        """
        try:
            raw = json.loads(options)
        except (ValueError, TypeError) as e:
            # TypeError: options is not a str/bytes at all.
            # NOTE(review): the DBus in_signature is "as" (array of strings),
            # which json.loads cannot take -- confirm the intended signature.
            logger.error("[%s]:Failed to parse factory reset request: %s", MOD_NAME, str(e))
            return self.populate_reset_response(
                reset_success=False,
                detail="Failed to parse json formatted factory reset request into python dict.",
            )

        if not isinstance(raw, dict):
            # Valid JSON, but not an object (e.g. a list or a number)
            logger.error(
                "[%s]:Failed to parse factory reset request: expected a JSON object, got %s",
                MOD_NAME,
                type(raw).__name__,
            )
            return self.populate_reset_response(
                reset_success=False,
                detail="Failed to parse json formatted factory reset request into python dict.",
            )

        # Normalize: support both camelCase and snake_case
        self.reset_request = {
            "factoryOs": raw.get("factoryOs", raw.get("factory_os", False)),
            "zeroFill": raw.get("zeroFill", raw.get("zero_fill", False)),
            "retainCerts": raw.get("retainCerts", raw.get("retain_certs", False)),
        }
        factory_os = self.reset_request["factoryOs"]

        # Reject the request if zero_fill is set together with factory_os.
        if factory_os and self.reset_request["zeroFill"]:
            return self.populate_reset_response(
                reset_success=False,
                zero_fill_unsupported=True,
                detail="zero_fill operation is currently unsupported.",
            )

        # Issue a warning if retain_certs is set.
        if factory_os and self.reset_request["retainCerts"]:
            logger.warning("%s: retain_certs is currently ignored.", MOD_NAME)

        # With or without factoryOs, the method itself is currently unsupported.
        return self.populate_reset_response(
            reset_success=False,
            detail="Method FactoryReset.Start is currently unsupported.",
        )

    def _execute_reboot(self) -> int:
        """Starts a COLD reboot in a background thread; returns 0 on success, 1 on failure."""
        try:
            r = Reboot("reboot")
            t = threading.Thread(target=r.execute_reboot, args=("COLD",))
            t.start()
        except RuntimeError:
            self.is_reset_ongoing = False
            return 1

        return 0

    @host_service.method(
        host_service.bus_name(MOD_NAME), in_signature="as", out_signature="is"
    )
    def issue_reset(self, options) -> tuple[int, str]:
        """Issues the factory reset.

        Returns:
            A tuple ``(code, json_response)`` describing the outcome.
        """
        logger.info("%s: Issuing reset from back end", MOD_NAME)

        rc, resp = self._parse_arguments(options)
        # populate_reset_response currently always yields code 0, so the
        # parse result is returned here and the steps below are not reached.
        if not rc:
            return rc, resp

        rc = self._check_reboot_in_progress()
        if rc:
            return self.populate_reset_response(reset_success=False, detail="Previous reset is ongoing.")

        self.is_reset_ongoing = True

        # _execute_reboot returns a single int status, not a tuple
        rc = self._execute_reboot()
        if rc:
            return self.populate_reset_response(reset_success=False, detail="Failed to start thread to execute reboot.")

        # Default fallback if no valid options triggered any action
        return self.populate_reset_response(
            reset_success=False,
            detail="Method FactoryReset.Start is currently unsupported.",
        )
143+
144+
145+
def register():
    """Return the (handler class, module name) pair for this host module."""
    registration = (GnoiReset, MOD_NAME)
    return registration

host_modules/host_service.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def bus_path(mod_name):
1515
return BUS_PATH + '/' + mod_name
1616

1717
method = dbus.service.method
18+
signal = dbus.service.signal
1819

1920
class HostService(dbus.service.Object):
2021
"""Service class for top level DBus endpoint"""

scripts/determine-reboot-cause

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ REBOOT_TYPE_KEXEC_PATTERN_EXPRESS = ".*SONIC_BOOT_TYPE=(express).*"
4444
REBOOT_CAUSE_UNKNOWN = "Unknown"
4545
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"
4646
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
47+
REBOOT_CAUSE_HEARTBEAT_LOSS = "Heartbeat with the Supervisor card lost"
4748

4849
# Global logger class instance
4950
sonic_logger = syslogger.SysLogger(SYSLOG_IDENTIFIER)
@@ -165,6 +166,8 @@ def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
165166
if match is not None:
166167
reboot_cause_dict['cause'] = "Kernel Panic"
167168
reboot_cause_dict['time'] = match.group(1)
169+
elif re.search(r'Heartbeat with the Supervisor card lost', previous_reboot_cause):
170+
reboot_cause_dict['cause'] = 'Heartbeat with the Supervisor card lost'
168171

169172
return reboot_cause_dict
170173

@@ -185,7 +188,7 @@ def determine_reboot_cause():
185188
software_reboot_cause = find_software_reboot_cause()
186189

187190
# The main decision logic of the reboot cause:
188-
# If there is a valid hardware reboot cause indicated by platform API,
191+
# If software reboot cause is not heartbeat loss and there is a valid hardware reboot cause indicated by platform API,
189192
# check the software reboot cause to add additional reboot cause.
190193
# If there is a reboot cause indicated by /proc/cmdline, and/or warmreboot/fastreboot/softreboot
191194
# the software_reboot_cause which is the content of /hosts/reboot-cause/reboot-cause.txt
@@ -194,7 +197,7 @@ def determine_reboot_cause():
194197
# the software_reboot_cause will be treated as the reboot cause if it's not unknown
195198
# otherwise, the cmdline_reboot_cause will be treated as the reboot cause if it's not none
196199
# Else the software_reboot_cause will be treated as the reboot cause
197-
if REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause:
200+
if REBOOT_CAUSE_HEARTBEAT_LOSS not in software_reboot_cause and REBOOT_CAUSE_NON_HARDWARE not in hardware_reboot_cause:
198201
previous_reboot_cause = hardware_reboot_cause
199202
# Check if any software reboot was issued before this hardware reboot happened
200203
if software_reboot_cause is not REBOOT_CAUSE_UNKNOWN:

scripts/hostcfgd

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2177,14 +2177,27 @@ class HostConfigDaemon:
21772177
# Initialize LoggingCfg
21782178
self.loggingcfg = LoggingCfg()
21792179

2180-
def load(self, init_data):
2180+
def load_independent_config(self, init_data):
2181+
# Load config that does not rely on any services
21812182
aaa = init_data['AAA']
21822183
tacacs_global = init_data['TACPLUS']
21832184
tacacs_server = init_data['TACPLUS_SERVER']
21842185
radius_global = init_data['RADIUS']
21852186
radius_server = init_data['RADIUS_SERVER']
21862187
ldap_global = init_data['LDAP']
21872188
ldap_server = init_data['LDAP_SERVER']
2189+
self.aaacfg.load(aaa, tacacs_global, tacacs_server, radius_global, radius_server, ldap_global, ldap_server)
2190+
2191+
def load(self, init_data):
2192+
self.load_independent_config(init_data)
2193+
2194+
syslog.syslog(syslog.LOG_INFO,
2195+
"Waiting for systemctl to finish initialization")
2196+
self.wait_till_system_init_done()
2197+
syslog.syslog(syslog.LOG_INFO,
2198+
"systemctl has finished initialization -- proceeding ...")
2199+
2200+
# Load configuration that depends on initialized services
21882201
lpbk_table = init_data['LOOPBACK_INTERFACE']
21892202
kdump = init_data['KDUMP']
21902203
passwh = init_data['PASSW_HARDENING']
@@ -2204,7 +2217,6 @@ class HostConfigDaemon:
22042217
banner_messages = init_data.get(swsscommon.CFG_BANNER_MESSAGE_TABLE_NAME)
22052218
logging = init_data.get(swsscommon.CFG_LOGGING_TABLE_NAME, {})
22062219

2207-
self.aaacfg.load(aaa, tacacs_global, tacacs_server, radius_global, radius_server, ldap_global, ldap_server)
22082220
self.iptables.load(lpbk_table)
22092221
self.kdumpCfg.load(kdump)
22102222
self.passwcfg.load(passwh)
@@ -2465,12 +2477,6 @@ class HostConfigDaemon:
24652477
self.config_db.subscribe(swsscommon.CFG_LOGGING_TABLE_NAME,
24662478
make_callback(self.logging_handler))
24672479

2468-
syslog.syslog(syslog.LOG_INFO,
2469-
"Waiting for systemctl to finish initialization")
2470-
self.wait_till_system_init_done()
2471-
syslog.syslog(syslog.LOG_INFO,
2472-
"systemctl has finished initialization -- proceeding ...")
2473-
24742480
def start(self):
24752481
self.config_db.listen(init_data_handler=self.load)
24762482

0 commit comments

Comments
 (0)