Skip to content

Commit b2139f5

Browse files
authored
[Mellanox][Smartswitch]Added debug logs to reboot and admin state change (#24263)
[Mellanox][Smartswitch]Added debug logs to reboot and admin state change
1 parent eccc743 commit b2139f5

File tree

2 files changed, with 42 additions and 14 deletions.

platform/mellanox/mlnx-platform-api/sonic_platform/dpuctlplat.py

Lines changed: 31 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
from contextlib import contextmanager
2525
from select import poll, POLLPRI, POLLIN
2626
from enum import Enum
27+
import signal
2728

2829
try:
2930
from .inotify_helper import InotifyHelper
@@ -107,7 +108,7 @@ def __init__(self, dpu_name):
107108
self.pci_dev_path = []
108109
self.verbosity = False
109110

110-
def setup_logger(self, use_print=False):
111+
def setup_logger(self, use_print=False, use_notice_level=False):
111112
def print_with_time(msg):
112113
timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
113114
print(f"[{timestamp}] {msg}")
@@ -118,8 +119,12 @@ def print_with_time(msg):
118119
self.logger_warning = print_with_time
119120
self.logger_debug = print_with_time
120121
return
121-
self.logger_debug = logger.log_debug
122-
self.logger_info = logger.log_info
122+
if use_notice_level:
123+
self.logger_debug = logger.log_notice
124+
self.logger_info = logger.log_notice
125+
else:
126+
self.logger_debug = logger.log_debug
127+
self.logger_info = logger.log_info
123128
self.logger_error = logger.log_error
124129
self.logger_warning = logger.log_warning
125130

@@ -414,20 +419,32 @@ def update_boot_prog_once(self, poll_var):
414419
read_value = self.read_boot_prog()
415420
if read_value != self.boot_prog_state:
416421
self.dpu_boot_prog_update(read_value)
417-
self.log_error(f"The boot_progress status is changed to = {self.boot_prog_indication}")
422+
self.log_info(f"The boot_progress status is changed to = {self.boot_prog_indication}")
418423

419424
def watch_boot_prog(self):
420425
"""Read boot_progress and update the value in an infinite loop"""
426+
def signal_handler(signum, frame):
427+
self.log_info("Received termination signal, shutting down...")
428+
raise SystemExit("Terminated by signal")
429+
430+
# Register signal handler for SIGTERM
431+
signal.signal(signal.SIGTERM, signal_handler)
432+
433+
file = None
434+
file = open(self.boot_prog_path, "r")
435+
p = poll()
436+
p.register(file.fileno(), POLLPRI)
421437
try:
422-
self.dpu_boot_prog_update()
423-
self.log_info(f"The initial boot_progress status is = {self.boot_prog_indication}")
424-
file = open(self.boot_prog_path, "r")
425-
p = poll()
426-
p.register(file.fileno(), POLLPRI)
427438
while True:
428-
self.update_boot_prog_once(p)
429-
except Exception:
430-
self.log_error(f"Exception occured during watch_boot_progress!")
439+
try:
440+
self.update_boot_prog_once(p)
441+
except SystemExit:
442+
break # Exit on termination signal
443+
except Exception as e:
444+
self.log_error(f"Error during watch_boot_progress: {e}")
445+
finally:
446+
if file:
447+
file.close()
431448

432449
@contextmanager
433450
def boot_prog_context(self):
@@ -444,6 +461,8 @@ def boot_prog_context(self):
444461
finally:
445462
if self.boot_prog_proc and self.boot_prog_proc.is_alive():
446463
self.boot_prog_proc.terminate()
464+
self.boot_prog_proc.join(timeout=3)
465+
self.boot_prog_proc.kill()
447466
self.boot_prog_proc.join()
448467
else:
449468
yield

platform/mellanox/mlnx-platform-api/sonic_platform/module.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -262,6 +262,8 @@ def __init__(self, dpu_id):
262262
self.dpu_id = dpu_id
263263
self._name = f"DPU{self.dpu_id}"
264264
self.dpuctl_obj = DpuCtlPlat(self._name.lower())
265+
self.dpuctl_obj.setup_logger(use_notice_level=True)
266+
self.dpuctl_obj.verbosity = True
265267
self.fault_state = False
266268
self.dpu_vpd_parser = DpuVpdParser('/var/run/hw-management/eeprom/vpd_data', self.dpuctl_obj._name.upper())
267269
self.CONFIG_DB_NAME = "CONFIG_DB"
@@ -334,8 +336,11 @@ def reboot(self, reboot_type=ModuleBase.MODULE_REBOOT_DPU):
334336
Returns:
335337
bool: True if the request has been issued successfully, False if not
336338
"""
339+
logger.log_notice(f"Rebooting {self._name} with type {reboot_type}")
337340
# no_wait=True is not supported at this point, because of race conditions with other drivers
338-
return self.dpuctl_obj.dpu_reboot(skip_pre_post=True)
341+
return_value = self.dpuctl_obj.dpu_reboot(skip_pre_post=True)
342+
logger.log_notice(f"Rebooted {self._name} with type {reboot_type} and return value {return_value}")
343+
return return_value
339344

340345
def set_admin_state(self, up):
341346
"""
@@ -352,12 +357,16 @@ def set_admin_state(self, up):
352357
Returns:
353358
bool: True if the request has been issued successfully, False if not
354359
"""
360+
logger.log_notice(f"Setting the admin state for {self._name} to {up}")
355361
if up:
356362
if self.dpuctl_obj.dpu_power_on(skip_pre_post=True):
363+
logger.log_notice(f"Completed the admin state change for {self._name} to {up}")
357364
return True
358365
logger.log_error(f"Failed to set the admin state for {self._name}")
359366
return False
360-
return self.dpuctl_obj.dpu_power_off(skip_pre_post=True)
367+
return_value = self.dpuctl_obj.dpu_power_off(skip_pre_post=True)
368+
logger.log_notice(f"Completed the admin state change for {self._name} to {up}")
369+
return return_value
361370

362371
def get_type(self):
363372
"""

0 commit comments

Comments
 (0)