Skip to content
Open
Show file tree
Hide file tree
Changes from 54 commits
Commits
Show all changes
114 commits
Select commit Hold shift + click to select a range
4e3a096
Did the instrumentation for gnoi-reboot.service
rameshraghupathy May 13, 2025
4a7e6bf
Modified based on the Redis based IPC
rameshraghupathy May 21, 2025
c2f9cb8
Modified based on the Redis based IPC
rameshraghupathy May 21, 2025
db7848f
made check_platform.sh executable
rameshraghupathy May 21, 2025
f946e72
Did some cleanup
rameshraghupathy May 21, 2025
4434463
Draft version. Need to test again
rameshraghupathy Jul 7, 2025
91897ed
Fixing test failure
rameshraghupathy Jul 10, 2025
118a27a
Working on coverage
rameshraghupathy Jul 10, 2025
1654d44
Working on coverage
rameshraghupathy Jul 10, 2025
b1ca2a3
Merge branch 'sonic-net:master' into graceful-shutdown
rameshraghupathy Aug 12, 2025
f6936e5
refactored based on the revised HLD
rameshraghupathy Aug 12, 2025
4b709ea
refactored based on the revised HLD
rameshraghupathy Aug 14, 2025
d510290
Fixing ut
rameshraghupathy Aug 20, 2025
dfa9761
Fixing ut
rameshraghupathy Aug 20, 2025
380b5f9
Improving coverage
rameshraghupathy Aug 20, 2025
62450d6
Refactored for graceful shutdown
rameshraghupathy Aug 24, 2025
a7f1a39
Refactored for graceful shutdown
rameshraghupathy Aug 25, 2025
f45358a
Fixing ut
rameshraghupathy Aug 26, 2025
14f20e6
Fixing ut
rameshraghupathy Aug 26, 2025
8d647fa
Fixing ut
rameshraghupathy Aug 26, 2025
e2c2a71
Fixing ut
rameshraghupathy Aug 26, 2025
ada6883
Fixing ut
rameshraghupathy Aug 26, 2025
ca6d463
Fixing ut
rameshraghupathy Aug 26, 2025
e2bbe5f
Fixing ut
rameshraghupathy Aug 26, 2025
28bc69b
Fixing ut
rameshraghupathy Aug 26, 2025
29183bd
Fixing ut
rameshraghupathy Aug 26, 2025
e228ffb
workign on coverage
rameshraghupathy Aug 26, 2025
37d73ce
workign on coverage
rameshraghupathy Aug 26, 2025
601cb90
workign on coverage
rameshraghupathy Aug 26, 2025
dfda223
workign on coverage
rameshraghupathy Aug 26, 2025
fb51c33
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 8, 2025
4650d23
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 8, 2025
dece2a0
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
6a8524f
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
a381400
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 9, 2025
da39422
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
d5ab77b
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
78de30a
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
39db631
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
ee497b9
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
e5558b6
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
05571bb
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
7285eda
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
2009207
Refactored for graceful shutdown, fixing UT - Final round of tweaks
rameshraghupathy Sep 10, 2025
2470888
Addressed copilot PR comments
rameshraghupathy Sep 15, 2025
c62e79f
Made the timeout logic common
rameshraghupathy Sep 20, 2025
2106099
working on coverage
rameshraghupathy Sep 20, 2025
ffe85ec
working on coverage
rameshraghupathy Sep 20, 2025
22654c8
working on coverage
rameshraghupathy Sep 20, 2025
cac4b67
Addressed PR comments
rameshraghupathy Sep 26, 2025
6d46f60
Addressed review comments related to refactoring
rameshraghupathy Oct 1, 2025
4b092dc
Fixing test failures
rameshraghupathy Oct 1, 2025
b0bfd18
Fixing test failures
rameshraghupathy Oct 1, 2025
aeac810
Addressed review comments related to refactoring
rameshraghupathy Oct 1, 2025
5c98c46
Addressing review comments
rameshraghupathy Oct 21, 2025
8d829cc
Addressing review comments
rameshraghupathy Oct 21, 2025
942874c
Addressing review comments
rameshraghupathy Oct 21, 2025
d1533a8
Addressing review comments
rameshraghupathy Oct 21, 2025
8454a37
Addressing review comments
rameshraghupathy Oct 21, 2025
7e3bf57
Addressing review comments
rameshraghupathy Oct 21, 2025
3c93891
Addressing review comments
rameshraghupathy Oct 21, 2025
b1f6139
Update scripts/wait-for-sonic-core.sh
rameshraghupathy Oct 21, 2025
6a76f95
Update scripts/wait-for-sonic-core.sh
rameshraghupathy Oct 21, 2025
6005650
Merge branch 'sonic-net:master' into graceful-shutdown
rameshraghupathy Nov 7, 2025
39c5889
Aligning with the new changes in module_base.py PR:#608
rameshraghupathy Nov 7, 2025
4e46ef1
Fixing imports in test
rameshraghupathy Nov 7, 2025
8fa0d79
Fixing test issue
rameshraghupathy Nov 7, 2025
dd66d4c
Cleaned up the _handle_successful_reboot function, as the current imp…
rameshraghupathy Nov 7, 2025
74dfe3d
Increasing coverage
rameshraghupathy Nov 7, 2025
aa03811
Increasing coverage
rameshraghupathy Nov 7, 2025
e379e9e
Doing UT
rameshraghupathy Nov 11, 2025
e5a564e
Tested version with the recent module_base changes
rameshraghupathy Nov 12, 2025
693f3a5
Fixed test issue
rameshraghupathy Nov 12, 2025
7b658d2
Fixed test issue
rameshraghupathy Nov 12, 2025
ddff999
Fixed test issue
rameshraghupathy Nov 12, 2025
68ce97a
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 12, 2025
ba36f56
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 12, 2025
96f8d99
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 12, 2025
4bd5631
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 12, 2025
a04ccc7
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 12, 2025
3a22b62
Update setup.py
rameshraghupathy Nov 12, 2025
22e5684
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 12, 2025
6660cc8
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 12, 2025
0ef829c
Merge branch 'sonic-net:master' into graceful-shutdown
rameshraghupathy Nov 12, 2025
83ca4a1
Addressed review comments
rameshraghupathy Nov 12, 2025
8da027b
Aligning tests with ddressed review comments
rameshraghupathy Nov 12, 2025
e326d70
Fixing a syntax issue
rameshraghupathy Nov 12, 2025
42d3d49
Fixing tests and coverage
rameshraghupathy Nov 13, 2025
dc9ad31
Fixing tests and coverage
rameshraghupathy Nov 13, 2025
74125be
Fixing tests and coverage
rameshraghupathy Nov 13, 2025
0f50662
testing the import approach in ut
rameshraghupathy Nov 13, 2025
50fe6ea
added tests
rameshraghupathy Nov 13, 2025
8d2b58f
fixing test issue
rameshraghupathy Nov 13, 2025
8a3bfa3
Update scripts/wait-for-sonic-core.sh
rameshraghupathy Nov 13, 2025
755c8b9
Update scripts/wait-for-sonic-core.sh
rameshraghupathy Nov 13, 2025
910f19d
Update setup.py
rameshraghupathy Nov 13, 2025
e146d52
addressed some cosmetic changes suggested by copilot
rameshraghupathy Nov 13, 2025
a8567b8
improving coverage
rameshraghupathy Nov 13, 2025
8dbad2e
improving coverage
rameshraghupathy Nov 13, 2025
5e6ccbb
improving coverage
rameshraghupathy Nov 13, 2025
6ab850d
triggering a run
rameshraghupathy Nov 13, 2025
f438f3d
triggering a run
rameshraghupathy Nov 13, 2025
a20a9f4
triggering a run
rameshraghupathy Nov 13, 2025
5e52daa
cleaning up
rameshraghupathy Nov 13, 2025
c6013f3
adding one more test
rameshraghupathy Nov 13, 2025
59521cf
adding one more test
rameshraghupathy Nov 13, 2025
d25029a
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 13, 2025
0410028
Update scripts/gnoi_shutdown_daemon.py
rameshraghupathy Nov 13, 2025
e6c71c4
Update scripts/check_platform.py
rameshraghupathy Nov 13, 2025
7f86b98
addressed review comments
rameshraghupathy Nov 13, 2025
183c337
Fix test failures - add returncode mocking and fix exception test
rameshraghupathy Nov 13, 2025
6ff92ad
addressed review comments
rameshraghupathy Nov 14, 2025
fd0037b
added gnoi_shutdown_daemon.py to setup
rameshraghupathy Nov 15, 2025
565294c
Addressed review comments
rameshraghupathy Nov 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions data/debian/rules
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ override_dh_installsystemd:
dh_installsystemd --no-start --name=procdockerstatsd
dh_installsystemd --no-start --name=determine-reboot-cause
dh_installsystemd --no-start --name=process-reboot-cause
dh_installsystemd --no-start --name=gnoi-shutdown
dh_installsystemd $(HOST_SERVICE_OPTS) --name=sonic-hostservice

16 changes: 16 additions & 0 deletions data/debian/sonic-host-services-data.gnoi-shutdown.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# gnoi-shutdown.service
# Runs /usr/local/bin/gnoi-shutdown-daemon after two pre-start checks.
[Unit]
Description=gNOI based DPU Graceful Shutdown Daemon
# database.service is a hard requirement; network-online.target is only
# wanted. Both are ordered before this unit.
Requires=database.service
Wants=network-online.target
After=network-online.target database.service

[Service]
Type=simple
# Pre-start checks run in the order listed, before ExecStart.
# NOTE(review): with Restart=always, a non-zero exit from check_platform.sh
# (its result on anything that is not a SmartSwitch NPU) looks like it would
# be retried every RestartSec seconds indefinitely - confirm this is intended.
ExecStartPre=/usr/local/bin/check_platform.sh
ExecStartPre=/usr/local/bin/wait-for-sonic-core.sh
ExecStart=/usr/local/bin/gnoi-shutdown-daemon
Restart=always
RestartSec=5

[Install]
WantedBy=multi-user.target
14 changes: 14 additions & 0 deletions scripts/check_platform.sh
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this need to be bash? Can it be python? Inline script is difficult to maintain.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hdwhdw Fixed

Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
#
# Platform gate used as an ExecStartPre step of gnoi-shutdown.service.
# Exits 0 when the device subtype is "SmartSwitch" and this host is not a
# DPU; exits 1 in every other case.

# DEVICE_METADATA.localhost.subtype as rendered by sonic-cfggen from the DB.
subtype=$(sonic-cfggen -d -v DEVICE_METADATA.localhost.subtype)

# Prints the result of utilities_common.chassis.is_dpu(); any exception
# (including a failed import) is reported as 'False'. The Python body has to
# keep its indentation, otherwise the snippet fails to parse, nothing is
# printed, and the DPU check below is silently bypassed.
is_dpu=$(python3 -c "try:
    from utilities_common.chassis import is_dpu
    print(is_dpu())
except Exception:
    print('False')")

if [[ "$subtype" == "SmartSwitch" && "$is_dpu" != "True" ]]; then
  exit 0
else
  exit 1
fi
297 changes: 297 additions & 0 deletions scripts/gnoi_shutdown_daemon.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
gnoi-shutdown-daemon

Listens for CHASSIS_MODULE_TABLE state changes in STATE_DB and, when a
SmartSwitch DPU module enters a "shutdown" transition, issues a gNOI Reboot
(method HALT) toward that DPU and polls RebootStatus until complete or timeout.

Additionally, a lightweight background thread periodically enforces timeout
clearing of stuck transitions (startup/shutdown/reboot) using ModuleBase’s
common APIs, so all code paths (CLI, chassisd, platform, gNOI) benefit.
"""

import json
import time
import subprocess
import socket
import os
import threading

REBOOT_RPC_TIMEOUT_SEC = 60 # gNOI System.Reboot call timeout
STATUS_POLL_TIMEOUT_SEC = 60 # overall time - polling RebootStatus
STATUS_POLL_INTERVAL_SEC = 5 # delay between polls
STATUS_RPC_TIMEOUT_SEC = 10 # per RebootStatus RPC timeout
REBOOT_METHOD_HALT = 3 # gNOI System.Reboot method: HALT

from swsscommon.swsscommon import SonicV2Connector
from sonic_py_common import syslogger
# Centralized transition API on ModuleBase
from sonic_platform_base.module_base import ModuleBase

_v2 = None
SYSLOG_IDENTIFIER = "gnoi-shutdown-daemon"
logger = syslogger.SysLogger(SYSLOG_IDENTIFIER)

# ##########
# helper
# ##########
def is_tcp_open(host: str, port: int, timeout: float = None) -> bool:
    """Return True if a TCP connection to (host, port) can be established.

    The probe connection is closed as soon as it is opened; no data is sent.

    Args:
        host: Hostname or IP address to dial.
        port: TCP port to dial.
        timeout: Connect timeout in seconds. When None, the value is taken
            from the GNOI_DIAL_TIMEOUT environment variable, defaulting to
            1.0 when the variable is unset or unusable.

    Returns:
        True when the connection succeeds, False on any OSError.
    """
    if timeout is None:
        default_timeout = 1.0
        try:
            timeout = float(os.getenv("GNOI_DIAL_TIMEOUT", str(default_timeout)))
        except ValueError:
            # A malformed override must not crash the caller.
            timeout = default_timeout
        if not timeout > 0:
            # Rejects zero, negative and NaN overrides; the socket layer
            # would either refuse them or turn the probe non-blocking.
            timeout = default_timeout
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False

# ##########
# DB helpers
# ##########

def _get_dbid_state(db) -> int:
"""Resolve STATE_DB numeric ID across connector implementations."""
try:
return db.get_dbid(db.STATE_DB)
except Exception:
# Default STATE_DB index in SONiC redis instances
return 6

def _get_pubsub(db):
"""Return a pubsub object for keyspace notifications.

Prefer a direct pubsub() if the connector exposes one; otherwise,
fall back to the raw redis client's pubsub().
"""
try:
return db.pubsub() # some connectors expose pubsub()
except AttributeError:
client = db.get_redis_client(db.STATE_DB)
return client.pubsub()

def _cfg_get_entry(table, key):
    """Read one CONFIG_DB row and return it as a dict of str -> str.

    The row is looked up as ``"<table>|<key>"``. A missing row yields an
    empty dict. Keys and values that arrive as bytes are decoded as UTF-8
    (undecodable bytes are dropped); other values are passed through.

    The CONFIG_DB connector is created on first use and cached in the
    module-level ``_v2``.
    """
    global _v2
    if _v2 is None:
        from swsscommon import swsscommon
        # NOTE(review): a reviewer asked for the existing helper
        # sonic_py_common.daemon_base.db_connect() to be used here instead
        # of building the connector by hand.
        connector = swsscommon.SonicV2Connector(use_unix_socket_path=True)
        connector.connect(connector.CONFIG_DB)
        # Publish the connector only after connect() succeeded, so a failed
        # attempt is retried on the next call rather than leaving an
        # unconnected connector cached.
        _v2 = connector

    def _s(x):
        return x.decode("utf-8", "ignore") if isinstance(x, (bytes, bytearray)) else x

    raw = _v2.get_all(_v2.CONFIG_DB, f"{table}|{key}") or {}
    return {_s(k): _s(v) for k, v in raw.items()}

# ############
# gNOI helpers
# ############

def execute_gnoi_command(command_args, timeout_sec=REBOOT_RPC_TIMEOUT_SEC):
    """Run a command with a timeout and return ``(rc, stdout, stderr)``.

    stdout and stderr are captured as text and stripped of surrounding
    whitespace. Failures are reported through the tuple, never raised:
      * ``-1`` when the command exceeds ``timeout_sec``;
      * ``-2`` for any other exception while launching or running it.
    """
    # NOTE(review): the function runs whatever argv it is given. Reviewers
    # suggested either baking in "docker exec ... gnoi_client" or renaming it
    # to execute_command; the author deferred this to a follow-up.
    try:
        proc = subprocess.run(
            command_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=timeout_sec,
        )
        captured_out = proc.stdout.strip()
        captured_err = proc.stderr.strip()
        return proc.returncode, captured_out, captured_err
    except subprocess.TimeoutExpired as exc:
        return -1, "", f"Command timed out after {int(exc.timeout)}s."
    except Exception as exc:
        return -2, "", f"Command failed: {exc}"

def get_dpu_ip(dpu_name: str):
    """Look up the DPU's address in CONFIG_DB.

    Reads the DHCP_SERVER_IPV4_PORT row keyed by
    ``bridge-midplane|<dpu_name lower-cased>`` and returns its ``ips@``
    field, or None when the row or the field is absent.
    """
    # NOTE(review): "ips@" presumably holds a list-type field; the raw value
    # is returned unsplit - confirm a single address is always stored.
    row_key = f"bridge-midplane|{dpu_name.lower()}"
    row = _cfg_get_entry("DHCP_SERVER_IPV4_PORT", row_key)
    return row.get("ips@")

def get_gnmi_port(dpu_name: str):
    """Return the DPU's gNMI port as a string.

    The DPU_PORT table in CONFIG_DB is probed with the name as given, then
    lower-cased, then upper-cased; the first row carrying a non-empty
    ``gnmi_port`` wins. Falls back to "8080" when none is found.
    """
    # NOTE(review): a reviewer suggested the name get_dpu_gnmi_port so this
    # is not confused with the local NPU gNMI port.
    for candidate in (dpu_name, dpu_name.lower(), dpu_name.upper()):
        row = _cfg_get_entry("DPU_PORT", candidate)
        configured = row.get("gnmi_port") if row else None
        if configured:
            return str(configured)
    return "8080"

# ###############
# Timeout Enforcer
# ###############
class TimeoutEnforcer(threading.Thread):
    """Daemon thread that clears timed-out CHASSIS_MODULE_TABLE transitions.

    Every ``interval_sec`` seconds (minimum 1) it lists the
    CHASSIS_MODULE_TABLE keys in STATE_DB and, for each module whose
    transition is flagged in progress, asks the supplied ModuleBase whether
    the transition has timed out and clears it if so.

    NOTE(review): a reviewer considered this thread unnecessary overhead, on
    the grounds that setting state_transition_in_progress already clears a
    timed-out transition.
    """

    def __init__(self, db, module_base: "ModuleBase", interval_sec: int = 5):
        """
        Args:
            db: STATE_DB connector; must provide get_redis_client() and STATE_DB.
            module_base: Object providing the ModuleBase transition helpers.
            interval_sec: Seconds between scans; values below 1 are raised to 1.
        """
        super().__init__(daemon=True, name="timeout-enforcer")
        self._db = db
        self._mb = module_base
        self._interval = max(1, int(interval_sec))
        # Deliberately not named `_stop`: threading.Thread uses a private
        # _stop() method internally on some CPython versions, and shadowing
        # it with an Event makes join()/is_alive() fail once the thread ends.
        self._stop_event = threading.Event()

    def stop(self):
        """Ask the thread to exit; it wakes from its interval wait immediately."""
        self._stop_event.set()

    def _list_modules(self):
        """Return the sorted module names found under CHASSIS_MODULE_TABLE.

        Keys may arrive as bytes or str. Any failure while listing is logged
        at debug level and reported as an empty list.
        """
        try:
            client = self._db.get_redis_client(self._db.STATE_DB)
            keys = client.keys("CHASSIS_MODULE_TABLE|*")
            out = []
            for k in keys or []:
                if isinstance(k, (bytes, bytearray)):
                    k = k.decode("utf-8", "ignore")
                _, _, name = k.partition("|")
                if name:
                    out.append(name)
            return sorted(out)
        except Exception as e:
            logger.log_debug(f"Failed to list CHASSIS_MODULE_TABLE modules: {e}")
            return []

    def _enforce(self, name):
        """Clear the transition of one module if it is in progress and timed out.

        Errors are logged at debug level and never propagate, so one bad
        entry cannot stop the scan of the remaining modules.
        """
        try:
            entry = self._mb.get_module_state_transition(self._db, name) or {}
            inprog = str(entry.get("state_transition_in_progress", "")).lower() in ("1", "true", "yes", "on")
            if not inprog:
                return
            op = entry.get("transition_type", "startup")
            timeouts = self._mb._load_transition_timeouts()
            # Fallback safely to defaults if key missing/unknown
            timeout_sec = int(timeouts.get(op, ModuleBase._TRANSITION_TIMEOUT_DEFAULTS.get(op, 300)))
            if self._mb.is_module_state_transition_timed_out(self._db, name, timeout_sec):
                if self._mb.clear_module_state_transition(self._db, name):
                    logger.log_info(f"Cleared transition after timeout for {name}")
                else:
                    logger.log_warning(f"Failed to clear transition timeout for {name}")
        except Exception as e:
            # Keep loop resilient; log at debug noise level
            logger.log_debug(f"Timeout enforce error for {name}: {e}")

    def run(self):
        """Scan all modules once per interval until stop() is called."""
        while not self._stop_event.is_set():
            try:
                for name in self._list_modules():
                    self._enforce(name)
            except Exception as e:
                logger.log_debug(f"TimeoutEnforcer loop error: {e}")
            # Doubles as the sleep and as the prompt wake-up on stop().
            self._stop_event.wait(self._interval)

# #########
# Main loop
# #########

# Pause between polls when the pubsub has nothing queued, so the main loop
# does not spin.
_IDLE_POLL_SLEEP_SEC = 0.5


def _as_text(value):
    """Decode bytes coming from the redis client; pass other values through."""
    if isinstance(value, (bytes, bytearray)):
        return value.decode("utf-8", "ignore")
    return value


def _gnoi_system_cmd(dpu_ip, port, rpc, json_in=None):
    """Build the `docker exec gnmi gnoi_client` argv for a System.<rpc> call."""
    cmd = [
        "docker", "exec", "gnmi", "gnoi_client",
        f"-target={dpu_ip}:{port}",
        "-logtostderr", "-notls",
        "-module", "System",
        "-rpc", rpc,
    ]
    if json_in is not None:
        cmd += ["-jsonin", json.dumps(json_in)]
    return cmd


def _process_module_event(db, module_base, dpu_name):
    """Handle one CHASSIS_MODULE_TABLE notification for ``dpu_name``.

    When the module has a "shutdown" or "reboot" transition in progress,
    sends gNOI System.Reboot (method HALT) to the DPU and polls RebootStatus
    until it reports completion or STATUS_POLL_TIMEOUT_SEC elapses.

    Returns:
        False when the event was abandoned on an error path (the caller backs
        off before the next event), True otherwise.
    """
    try:
        entry = module_base.get_module_state_transition(db, dpu_name) or {}
    except Exception as e:
        logger.log_error(f"Failed reading transition state for {dpu_name}: {e}")
        return False

    transition_type = entry.get("transition_type")
    in_progress = entry.get("state_transition_in_progress", "False") == "True"
    if not in_progress or transition_type not in ("shutdown", "reboot"):
        return True

    logger.log_info(f"{transition_type} request detected for {dpu_name}. Initiating gNOI reboot.")
    try:
        dpu_ip = get_dpu_ip(dpu_name)
        port = get_gnmi_port(dpu_name)
        if not dpu_ip:
            raise RuntimeError("DPU IP not found")
        # Converted inside the try so a malformed configured port is logged
        # instead of terminating the daemon.
        port_num = int(port)
    except Exception as e:
        logger.log_error(f"Error getting DPU IP or port for {dpu_name}: {e}")
        return False

    # skip if TCP is not reachable
    if not is_tcp_open(dpu_ip, port_num):
        logger.log_info(f"Skipping {dpu_name}: {dpu_ip}:{port} unreachable (offline/down)")
        return False

    # 1) Send Reboot HALT
    logger.log_notice(f"Issuing gNOI Reboot to {dpu_ip}:{port}")
    reboot_cmd = _gnoi_system_cmd(
        dpu_ip, port, "Reboot",
        {"method": REBOOT_METHOD_HALT, "message": "Triggered by SmartSwitch graceful shutdown"},
    )
    rc, out, err = execute_gnoi_command(reboot_cmd, timeout_sec=REBOOT_RPC_TIMEOUT_SEC)
    if rc != 0:
        logger.log_error(f"gNOI Reboot command failed for {dpu_name}: {err or out}")
        # As per HLD, daemon just logs and returns.
        return False

    # 2) Poll RebootStatus with a real deadline
    logger.log_notice(
        f"Polling RebootStatus for {dpu_name} at {dpu_ip}:{port} "
        f"(timeout {STATUS_POLL_TIMEOUT_SEC}s, interval {STATUS_POLL_INTERVAL_SEC}s)"
    )
    status_cmd = _gnoi_system_cmd(dpu_ip, port, "RebootStatus")
    deadline = time.monotonic() + STATUS_POLL_TIMEOUT_SEC
    reboot_successful = False
    while time.monotonic() < deadline:
        rc_s, out_s, _err_s = execute_gnoi_command(status_cmd, timeout_sec=STATUS_RPC_TIMEOUT_SEC)
        if rc_s == 0 and out_s and ("reboot complete" in out_s.lower()):
            reboot_successful = True
            break
        time.sleep(STATUS_POLL_INTERVAL_SEC)

    if reboot_successful:
        # Only "reboot" transitions are cleared here, and only on success.
        # "shutdown" transitions are left for module_base.py to clear, and
        # anything that stays stuck is left to the TimeoutEnforcer thread.
        if transition_type == "reboot":
            if module_base.clear_module_state_transition(db, dpu_name):
                logger.log_info(f"Cleared transition for {dpu_name}")
            else:
                logger.log_warning(f"Failed to clear transition for {dpu_name}")
        logger.log_info(f"Halting the services on DPU is successful for {dpu_name}.")
    else:
        logger.log_warning(f"Status polling of halting the services on DPU timed out for {dpu_name}.")
    return True


def main():
    """Daemon entry point: subscribe to CHASSIS_MODULE_TABLE keyspace events
    in STATE_DB and process them one at a time, forever.

    Also starts the TimeoutEnforcer background thread before entering the
    loop. This function does not return.
    """
    # Connect for STATE_DB pubsub + reads
    # NOTE(review): reviewers asked for the existing helper
    # sonic_py_common.daemon_base.db_connect("STATE_DB") to be used here.
    db = SonicV2Connector()
    db.connect(db.STATE_DB)

    # Centralized transition reader
    module_base = ModuleBase()

    pubsub = _get_pubsub(db)
    state_dbid = _get_dbid_state(db)

    # Listen to keyspace notifications for CHASSIS_MODULE_TABLE keys
    topic = f"__keyspace@{state_dbid}__:CHASSIS_MODULE_TABLE|*"
    pubsub.psubscribe(topic)

    logger.log_info("gnoi-shutdown-daemon started and listening for shutdown events.")

    # Start background timeout enforcement so stuck transitions auto-clear
    enforcer = TimeoutEnforcer(db, module_base, interval_sec=5)
    enforcer.start()

    key_prefix = "CHASSIS_MODULE_TABLE|"
    while True:
        message = pubsub.get_message()
        if not message:
            # Nothing queued: yield the CPU instead of polling in a tight loop.
            time.sleep(_IDLE_POLL_SLEEP_SEC)
            continue
        if _as_text(message.get("type")) != "pmessage":
            continue

        # channel format: "__keyspace@N__:CHASSIS_MODULE_TABLE|DPU0"
        # The client may deliver it as bytes, so normalise before parsing.
        channel = _as_text(message.get("channel", "")) or ""
        key = channel.split(":", 1)[-1]
        if not key.startswith(key_prefix):
            time.sleep(1)
            continue

        dpu_name = key[len(key_prefix):]
        if not _process_module_event(db, module_base, dpu_name):
            time.sleep(1)

if __name__ == "__main__":
main()

54 changes: 54 additions & 0 deletions scripts/wait-for-sonic-core.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env bash
#
# Pre-start gate for gnoi-shutdown.service (run as ExecStartPre).
# Requires swss.service and gnmi.service to be active, reports on
# pmon.service, then continues with the CHASSIS_MODULE_TABLE wait below.
set -euo pipefail

log() { echo "[wait-for-sonic-core] $*"; }

#######################################
# Check that a hard-dependency unit is active.
# Arguments: $1 - systemd unit name
# Outputs:   progress line; full unit status when the unit is not active
# Returns:   0 if the unit is active, 1 otherwise
#######################################
require_active() {
  local unit=$1
  if systemctl is-active --quiet "$unit"; then
    log "Service $unit is active"
    return 0
  fi
  log "$unit is not active yet; failing pre-start so systemd retries"
  # Status output is diagnostic only; its exit code must not trip `set -e`.
  systemctl --no-pager --full status "$unit" || true
  return 1
}

# Hard deps we expect to be up before we start: swss and gnmi.
# ExecStartPre must be quick, so do not block here. Exit non-zero instead:
# exiting 0 would let ExecStart run without the dependency, whereas a failed
# pre-start step leaves the retry to the unit's Restart= policy.
require_active swss.service || exit 1
require_active gnmi.service || exit 1

# pmon is advisory: proceed even if it's not active yet
if systemctl is-active --quiet pmon.service; then
  log "Service pmon.service is active"
else
  log "pmon.service not active yet (advisory)"
fi

# Wait for CHASSIS_MODULE_TABLE to exist (best-effort, bounded time)
# MAX_WAIT: upper bound in seconds; WAIT_CORE_MAX_SECONDS overrides the
# default of 60 when set and non-empty.
MAX_WAIT=${WAIT_CORE_MAX_SECONDS:-60}
# INTERVAL: seconds slept between two checks.
INTERVAL=2
# ELAPSED: seconds slept so far; compared against MAX_WAIT by the wait loop.
ELAPSED=0

#######################################
# Report whether any CHASSIS_MODULE_TABLE|* key exists in redis DB 6.
# Outputs:   nothing on stdout
# Returns:   0 if redis-cli succeeded and printed at least one key,
#            non-zero otherwise
#######################################
has_chassis_table() {
  local keys
  # Capture the output rather than piping into `grep -q`: under
  # `set -o pipefail` an early-exiting grep can SIGPIPE the producer and
  # turn a successful match into a failure.
  keys=$(redis-cli -n 6 KEYS 'CHASSIS_MODULE_TABLE|*') || return 1
  [[ -n "$keys" ]]
}

# Poll for CHASSIS_MODULE_TABLE keys every INTERVAL seconds. Once ELAPSED
# reaches MAX_WAIT the wait is abandoned and the script still exits 0, so
# this step is best-effort only.
log "Waiting for CHASSIS_MODULE_TABLE keys…"
until has_chassis_table; do
  if (( ELAPSED < MAX_WAIT )); then
    sleep "$INTERVAL"
    ELAPSED=$((ELAPSED + INTERVAL))
    continue
  fi
  log "Timed out waiting for CHASSIS_MODULE_TABLE; proceeding anyway."
  exit 0
done

log "CHASSIS_MODULE_TABLE present."
log "SONiC core is ready."
exit 0
Loading
Loading