Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 9 additions & 62 deletions config/chassis_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
import utilities_common.cli as clicommon
from utilities_common.chassis import is_smartswitch, get_all_dpus
from utilities_common.module import ModuleHelper
from datetime import datetime, timedelta, timezone

TIMEOUT_SECS = 10
Expand Down Expand Up @@ -64,50 +65,6 @@ def get_config_module_state(db, chassis_module_name):
else:
return fvs['admin_status']


def get_state_transition_in_progress(db, chassis_module_name):
    """Read the 'state_transition_in_progress' field for a chassis module.

    The value is looked up in the CHASSIS_MODULE_TABLE entry of
    ``db.statedb``. It is returned exactly as stored (a string); when the
    entry is missing/empty or has no such field, the string 'False' is
    returned.
    """
    ensure_statedb_connected(db)
    entry = db.statedb.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)
    if not entry:
        return 'False'
    return entry.get('state_transition_in_progress', 'False')


def set_state_transition_in_progress(db, chassis_module_name, value):
    """Write the 'state_transition_in_progress' field for a chassis module.

    When ``value`` is the string 'True', the current UTC time (ISO 8601)
    is also recorded as 'transition_start_time'. For any other value the
    start time is dropped, both from the entry being written and from the
    stored row in ``db.statedb``.
    """
    ensure_statedb_connected(db)
    table = 'CHASSIS_MODULE_TABLE'
    statedb = db.statedb

    # Start from whatever is already stored so unrelated fields survive.
    fields = statedb.get_entry(table, chassis_module_name) or {}
    fields['state_transition_in_progress'] = value

    if value != 'True':
        # Remove the start time locally and in the database.
        fields.pop('transition_start_time', None)
        statedb.delete_field(table, chassis_module_name, 'transition_start_time')
    else:
        fields['transition_start_time'] = datetime.now(timezone.utc).isoformat()

    statedb.set_entry(table, chassis_module_name, fields)


def is_transition_timed_out(db, chassis_module_name):
    """Tell whether a module's recorded transition has exceeded the timeout.

    Returns True only when the CHASSIS_MODULE_TABLE entry in ``db.statedb``
    carries a parseable 'transition_start_time' that is more than
    TRANSITION_TIMEOUT in the past. A missing entry, a missing or empty
    timestamp, or a timestamp that is not valid ISO format yields False.
    """
    ensure_statedb_connected(db)
    entry = db.statedb.get_entry('CHASSIS_MODULE_TABLE', chassis_module_name)

    started_raw = entry.get('transition_start_time') if entry else None
    if not started_raw:
        return False

    try:
        started = datetime.fromisoformat(started_raw)
    except ValueError:
        return False

    # Compare in UTC; a naive stored timestamp is treated as UTC.
    now = datetime.now(timezone.utc)
    if started.tzinfo is None:
        started = started.replace(tzinfo=timezone.utc)

    elapsed = now - started
    return elapsed > TRANSITION_TIMEOUT

#
# Name: check_config_module_state_with_timeout
# return: True: timeout, False: not timeout
Expand Down Expand Up @@ -196,15 +153,10 @@ def shutdown_chassis_module(db, chassis_module_name):
return

if is_smartswitch():
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
if is_transition_timed_out(db, chassis_module_name):
set_state_transition_in_progress(db, chassis_module_name, 'False')
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with shutdown.")
else:
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return
else:
set_state_transition_in_progress(db, chassis_module_name, 'True')
module_helper = ModuleHelper()
if module_helper.get_module_state_transition(chassis_module_name):
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return

click.echo(f"Shutting down chassis module {chassis_module_name}")
fvs = {
Expand Down Expand Up @@ -243,15 +195,10 @@ def startup_chassis_module(db, chassis_module_name):
return

if is_smartswitch():
if get_state_transition_in_progress(db, chassis_module_name) == 'True':
if is_transition_timed_out(db, chassis_module_name):
set_state_transition_in_progress(db, chassis_module_name, 'False')
click.echo(f"Previous transition for module {chassis_module_name} timed out. Proceeding with startup.")
else:
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return
else:
set_state_transition_in_progress(db, chassis_module_name, 'True')
module_helper = ModuleHelper()
if module_helper.get_module_state_transition(chassis_module_name):
click.echo(f"Module {chassis_module_name} state transition is already in progress")
return

click.echo(f"Starting up chassis module {chassis_module_name}")
fvs = {
Expand Down
48 changes: 47 additions & 1 deletion scripts/reboot_smartswitch_helper
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ declare -r MODULE_REBOOT_SMARTSWITCH="SMARTSWITCH"

declare -r EXIT_DPU_DOWN=2

# Set default platform JSON path if not already set
PLATFORM_JSON_PATH=${PLATFORM_JSON_PATH:-"/usr/share/sonic/platform/platform.json"}

# Function to print debug message
function log_message() {
local message=$1
Expand Down Expand Up @@ -89,6 +92,34 @@ function module_post_startup()
fi
}

# Function to set state_transition_in_progress flag
#   $1 - DPU/module name handed to ModuleHelper.set_module_state_transition
#   $2 - flag value; shell-style "true"/"false" are converted to the Python
#        literals True/False, any other value is passed through unchanged
# A failure of the Python call is logged; it is not propagated to the caller.
function set_module_state_transition_flag()
{
    local DPU_NAME=$1
    local FLAG_VALUE=$2

    # FLAG_VALUE is spliced verbatim into Python source below. A bare
    # lowercase `true`/`false` is an undefined name in Python (NameError),
    # so translate shell booleans into Python boolean literals first.
    case "${FLAG_VALUE}" in
        true)  FLAG_VALUE="True" ;;
        false) FLAG_VALUE="False" ;;
    esac

    python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.set_module_state_transition('${DPU_NAME}', ${FLAG_VALUE})"
    if [ $? -ne 0 ]; then
        log_message "ERROR: Setting module state transition flag failed"
    fi
}

# Function to clear state_transition_in_progress flag
#   $1 - DPU/module name handed to ModuleHelper.clear_module_state_transition
# A failure of the Python call is logged; it is not propagated to the caller.
function clear_module_state_transition_flag()
{
    local DPU_NAME=$1

    python3 -c "from utilities_common.module import ModuleHelper; helper = ModuleHelper(); helper.clear_module_state_transition('${DPU_NAME}')"
    local rc=$?

    if [ "${rc}" -ne 0 ]; then
        log_message "ERROR: Clearing module state transition flag failed"
    fi
}

# Function to get state_transition_in_progress flag
#   $1 - DPU/module name handed to ModuleHelper.get_module_state_transition
# Exit status: 0 when the helper reports a truthy value (a transition is in
# progress), non-zero otherwise. An uncaught Python error (e.g. import
# failure) also yields a non-zero status, i.e. it reads as "not set".
function get_module_state_transition_flag()
{
    local DPU_NAME=$1

    # `python3 -c` exits 0 regardless of what an expression evaluates to, so
    # the helper's result has to be turned into the exit status explicitly;
    # otherwise callers testing this function would always see "flag set".
    python3 -c "import sys; from utilities_common.module import ModuleHelper; helper = ModuleHelper(); sys.exit(0 if helper.get_module_state_transition('${DPU_NAME}') else 1)"
}

# Function to reboot DPU
function reboot_dpu_platform()
{
Expand Down Expand Up @@ -171,7 +202,7 @@ function reboot_dpu()
local REBOOT_TYPE=$2
local DPU_INDEX=${DPU_NAME//[!0-9]/}

debug "User requested rebooting device ${DPU_NAME} ..."
log_message "User requested rebooting device ${DPU_NAME} ..."

# Check if the DPU operation status is online before rebooting
local oper_status
Expand All @@ -187,6 +218,16 @@ function reboot_dpu()
fi
fi

if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
# get and set the state_transition_in_progress flag before reboot
if ! get_module_state_transition_flag "${DPU_NAME}"; then
set_module_state_transition_flag "${DPU_NAME}" true
else
log_message "ERROR: state_transition_in_progress flag is already set for ${DPU_NAME}"
return ${EXIT_ERROR}
fi
fi

# Send reboot command to DPU
gnmi_reboot_dpu "${DPU_NAME}"
if [ $? -ne 0 ]; then
Expand All @@ -213,6 +254,11 @@ function reboot_dpu()

if [[ "$REBOOT_TYPE" != $MODULE_REBOOT_SMARTSWITCH ]]; then
module_post_startup ${DPU_NAME} ${DPU_BUS_INFO}
if [ $? -ne 0 ]; then
log_message "ERROR: Failed to rescan PCI module for ${DPU_NAME}"
return ${EXIT_ERROR}
fi
clear_module_state_transition_flag ${DPU_NAME}
fi
}

Expand Down
Loading
Loading