Skip to content

Commit 437dc69

Browse files
louis-nexthop, lotus-nexthop, domingo-nexthop
authored
[Nexthop] NH-4010 & NH-4020 Watchdog support (sonic-net#24225)
* Add watchdog support
* Bring back empty line
* Add missing comma in pddf-device.json.j2
* no-op commit to retrigger workflow
* no-op commit to retrigger workflow
* Enable watchdog counter before setting countdown value

---------

Co-authored-by: Lotus Fenn <[email protected]>
Co-authored-by: domingo-nexthop <[email protected]>
1 parent 65eaee1 commit 437dc69

File tree

6 files changed

+470
-50
lines changed

6 files changed

+470
-50
lines changed

device/nexthop/x86_64-nexthop_4010-r0/pddf/pddf-device.json.j2

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13191,6 +13191,16 @@
1319113191
"drv_attr_name": "curr4_crit"
1319213192
}
1319313193
]
13194+
}
13195+
},
13196+
"WATCHDOG": {
13197+
"dev_info": {
13198+
"device_type": "WATCHDOG",
13199+
"device_parent": "MULTIFPGAPCIE1"
13200+
},
13201+
"dev_attr": {
13202+
"event_driven_power_cycle_control_reg_offset": "0x28",
13203+
"watchdog_counter_reg_offset": "0x1E0"
1319413204
}
1319513205
}
1319613206
}

platform/broadcom/sonic-platform-modules-nexthop/common/sonic_platform/chassis.py

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,14 @@
99
#
1010
#############################################################################
1111

12+
import os
13+
import re
1214
import sys
1315
import time
1416

1517
from sonic_platform.thermal import NexthopFpgaAsicThermal
18+
from sonic_platform.watchdog import Watchdog
19+
1620
try:
1721
from sonic_platform_pddf_base.pddf_chassis import PddfChassis
1822
except ImportError as e:
@@ -25,6 +29,7 @@
2529
# Sleep duration waiting for change events
2630
CHANGE_EVENT_SLEEP_SECONDS = 1
2731

32+
2833
class Chassis(PddfChassis):
2934
"""
3035
PDDF Platform-specific Chassis class
@@ -35,9 +40,11 @@ def __init__(self, pddf_data=None, pddf_plugin_data=None):
3540

3641
# {'port': 'presence'}
3742
self._xcvr_presence = {}
43+
self._watchdog: Watchdog | None = None
44+
self._pddf_data = pddf_data
3845

39-
if pddf_data:
40-
num_asic_thermals = pddf_data.data.get('PLATFORM', {}).get('num_nexthop_fpga_asic_temp_sensors', 0)
46+
if self._pddf_data:
47+
num_asic_thermals = self._pddf_data.data.get("PLATFORM", {}).get("num_nexthop_fpga_asic_temp_sensors", 0)
4148
else:
4249
num_asic_thermals = 0
4350
for index in range(num_asic_thermals):
@@ -84,10 +91,9 @@ def get_sfp(self, index):
8491

8592
try:
8693
# The index starts from 1
87-
sfp = self._sfp_list[index-1]
94+
sfp = self._sfp_list[index - 1]
8895
except IndexError:
89-
sys.stderr.write("SFP index {} out of range (1-{})\n".format(
90-
index, len(self._sfp_list)))
96+
sys.stderr.write("SFP index {} out of range (1-{})\n".format(index, len(self._sfp_list)))
9197
return sfp
9298

9399
def _get_xcvr_change_event(self):
@@ -126,15 +132,15 @@ def get_change_event(self, timeout=0):
126132
indicates that fan 0 has been removed, fan 2
127133
has been inserted and sfp 11 has been removed.
128134
"""
129-
end_time = time.monotonic() + timeout/1000 if timeout > 0 else None
135+
end_time = time.monotonic() + timeout / 1000 if timeout > 0 else None
130136
change_events = {}
131137
while True:
132-
change_events['sfp'] = self._get_xcvr_change_event()
133-
if bool(change_events['sfp']):
138+
change_events["sfp"] = self._get_xcvr_change_event()
139+
if bool(change_events["sfp"]):
134140
break
135141
if end_time is not None and time.monotonic() > end_time:
136142
break
137-
time.sleep(min(timeout/1000, CHANGE_EVENT_SLEEP_SECONDS))
143+
time.sleep(min(timeout / 1000, CHANGE_EVENT_SLEEP_SECONDS))
138144
return True, change_events
139145

140146
# sonic-utilities/show/system_health.py calls this
@@ -169,17 +175,32 @@ def get_reboot_cause(self):
169175
sw_reboot_cause = "Unknown"
170176

171177
return ('REBOOT_CAUSE_NON_HARDWARE', sw_reboot_cause)
172-
173-
def get_watchdog(self):
178+
179+
def get_watchdog(self) -> Watchdog | None:
174180
"""
175181
Retrieves hardware watchdog device on this chassis
176182
Returns:
177183
An object derived from WatchdogBase representing the hardware
178-
watchdog device
184+
watchdog device. None if no watchdog is present as defined in pddf_data.
179185
"""
180186
if self._watchdog is None:
181-
from sonic_platform.watchdog import Watchdog
182-
self._watchdog = Watchdog()
187+
if not self._pddf_data:
188+
return None
189+
watchdog_pddf_obj_data = self._pddf_data.data.get("WATCHDOG")
190+
if watchdog_pddf_obj_data is None:
191+
return None
192+
device_parent_name = watchdog_pddf_obj_data["dev_info"]["device_parent"]
193+
fpga_pci_addr = self._pddf_data.data[device_parent_name]["dev_info"]["device_bdf"]
194+
watchdog_dev_attr = watchdog_pddf_obj_data["dev_attr"]
195+
event_driven_power_cycle_control_reg_offset = int(
196+
watchdog_dev_attr["event_driven_power_cycle_control_reg_offset"], 16
197+
)
198+
watchdog_counter_reg_offset = int(watchdog_dev_attr["watchdog_counter_reg_offset"], 16)
199+
self._watchdog = Watchdog(
200+
fpga_pci_addr=fpga_pci_addr,
201+
event_driven_power_cycle_control_reg_offset=event_driven_power_cycle_control_reg_offset,
202+
watchdog_counter_reg_offset=watchdog_counter_reg_offset,
203+
)
183204

184205
return self._watchdog
185206

platform/broadcom/sonic-platform-modules-nexthop/common/sonic_platform/watchdog.py

Lines changed: 109 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,93 @@
44
# SPDX-License-Identifier: Apache-2.0
55

66
from sonic_platform_base.watchdog_base import WatchdogBase
7+
from nexthop import fpga_lib
8+
from sonic_py_common import syslogger
9+
10+
_SYSLOG_IDENTIFIER = "sonic_platform.watchdog"
11+
_logger = syslogger.SysLogger(_SYSLOG_IDENTIFIER)
12+
713

814
class Watchdog(WatchdogBase):
915
"""
1016
Nexthop platform-specific placeholder Watchdog class
11-
This class isn't implemented yet.
1217
"""
1318

14-
def __init__(self):
19+
# Counter is 24 bits and should be interpreted as milliseconds
20+
_MAX_WATCHDOG_COUNTER_MILLISECONDS = 0xFFFFFF
21+
22+
def __init__(
23+
self,
24+
fpga_pci_addr: str,
25+
event_driven_power_cycle_control_reg_offset: int,
26+
watchdog_counter_reg_offset: int,
27+
):
1528
"""
1629
Initialize the Watchdog class
1730
"""
18-
self.armed = False
19-
self.timeout = 0
20-
# Add any platform-specific initialization here
31+
super().__init__()
32+
self.fpga_pci_addr: str = fpga_pci_addr
33+
self.event_driven_power_cycle_control_reg_offset: int = (
34+
event_driven_power_cycle_control_reg_offset
35+
)
36+
self.watchdog_counter_reg_offset: int = watchdog_counter_reg_offset
37+
38+
def _read_watchdog_counter_register(self) -> int:
    """Fetch the current contents of the watchdog counter register.

    Returns:
        The value returned by ``fpga_lib.read_32`` for the watchdog
        counter register offset on this watchdog's FPGA PCI address.
    """
    raw_register_value = fpga_lib.read_32(
        pci_address=self.fpga_pci_addr,
        offset=self.watchdog_counter_reg_offset,
    )
    return raw_register_value
43+
44+
def _read_watchdog_countdown_value_milliseconds(self) -> int:
    """Extract the countdown field from the watchdog counter register.

    The countdown occupies bit range (0, 23) of the register; the counter
    is 24 bits wide and is interpreted as milliseconds.

    Returns:
        The countdown field value, in milliseconds.
    """
    return fpga_lib.get_field(
        reg_val=self._read_watchdog_counter_register(),
        bit_range=(0, 23),
    )
48+
49+
def _update_watchdog_countdown_value(self, milliseconds: int) -> None:
    """Store a new countdown value in the watchdog counter register.

    Performs a read-modify-write: the register is read, the field at bit
    range (0, 23) is overwritten with ``milliseconds``, and the resulting
    word is written back to the same offset.

    Args:
        milliseconds: New countdown value. No range check is done here;
            the caller is expected to keep it within the 24-bit field.
    """
    current_word = self._read_watchdog_counter_register()
    updated_word = fpga_lib.overwrite_field(
        reg_val=current_word,
        bit_range=(0, 23),
        field_val=milliseconds,
    )
    fpga_lib.write_32(
        pci_address=self.fpga_pci_addr,
        offset=self.watchdog_counter_reg_offset,
        val=updated_word,
    )
60+
61+
def _read_watchdog_counter_enable(self) -> int:
    """Return the counter-enable bit of the watchdog counter register.

    The enable flag is the single-bit field at bit range (31, 31).

    Returns:
        The field value as returned by ``fpga_lib.get_field``.
    """
    return fpga_lib.get_field(
        reg_val=self._read_watchdog_counter_register(),
        bit_range=(31, 31),
    )
2165

22-
def arm(self, seconds):
66+
def _toggle_watchdog_counter_enable(self, enable: bool) -> None:
    """Set or clear the counter-enable bit of the watchdog counter register.

    Performs a read-modify-write on the watchdog counter register,
    overwriting the field at bit range (31, 31) with ``int(enable)``.

    Args:
        enable: True to enable the watchdog counter, False to disable it.
    """
    enable_bit = int(enable)
    current_word = self._read_watchdog_counter_register()
    updated_word = fpga_lib.overwrite_field(
        reg_val=current_word,
        bit_range=(31, 31),
        field_val=enable_bit,
    )
    fpga_lib.write_32(
        pci_address=self.fpga_pci_addr,
        offset=self.watchdog_counter_reg_offset,
        val=updated_word,
    )
77+
78+
def _toggle_watchdog_reboot(self, enable: bool) -> None:
    """Allow or forbid a reboot induced by the watchdog.

    Performs a read-modify-write on the event-driven power cycle control
    register, overwriting the field at bit range (4, 4) with
    ``int(enable)``.

    Args:
        enable: True to enable watchdog-induced reboot, False to disable it.
    """
    control_offset = self.event_driven_power_cycle_control_reg_offset
    current_word = fpga_lib.read_32(
        pci_address=self.fpga_pci_addr,
        offset=control_offset,
    )
    updated_word = fpga_lib.overwrite_field(
        reg_val=current_word,
        bit_range=(4, 4),
        field_val=int(enable),
    )
    fpga_lib.write_32(
        pci_address=self.fpga_pci_addr,
        offset=control_offset,
        val=updated_word,
    )
92+
93+
def arm(self, seconds: int) -> int:
2394
"""
2495
Arm the hardware watchdog with a timeout of <seconds> seconds.
2596
If the watchdog is currently armed, calling this function will
@@ -32,38 +103,50 @@ def arm(self, seconds):
32103
An integer specifying the *actual* number of seconds the watchdog
33104
was armed with. On failure returns -1.
34105
"""
35-
# Implement platform-specific arming logic here
36-
# For now, just simulate successful arming
37-
if seconds < 0:
106+
milliseconds = seconds * 1_000
107+
108+
if milliseconds < 0 or milliseconds > self._MAX_WATCHDOG_COUNTER_MILLISECONDS:
109+
_logger.log_error(
110+
f"cannot arm watchdog with {milliseconds} ms. should be within 0 and {self._MAX_WATCHDOG_COUNTER_MILLISECONDS} ms"
111+
)
38112
return -1
39-
40-
self.timeout = seconds
41-
self.armed = True
42-
return self.timeout
43113

44-
def disarm(self):
114+
try:
115+
self._toggle_watchdog_counter_enable(True)
116+
self._toggle_watchdog_reboot(True)
117+
self._update_watchdog_countdown_value(milliseconds=milliseconds)
118+
except Exception as e:
119+
_logger.log_error(f"cannot arm watchdog: {e}")
120+
return -1
121+
else:
122+
return seconds
123+
124+
def disarm(self) -> bool:
45125
"""
46126
Disarm the hardware watchdog
47127
48128
Returns:
49129
A boolean, True if watchdog is disarmed successfully, False if not
50130
"""
51-
# Implement platform-specific disarming logic here
52-
# For now, just simulate successful disarming
53-
self.armed = False
54-
self.timeout = 0
55-
return True
131+
try:
132+
self._toggle_watchdog_counter_enable(False)
133+
self._toggle_watchdog_reboot(False)
134+
except Exception as e:
135+
_logger.log_error(f"cannot disarm watchdog: {e}")
136+
return False
137+
else:
138+
return True
56139

57-
def is_armed(self):
140+
def is_armed(self) -> bool:
58141
"""
59142
Retrieves the armed state of the hardware watchdog.
60143
61144
Returns:
62145
A boolean, True if watchdog is armed, False if not
63146
"""
64-
return self.armed
147+
return bool(self._read_watchdog_counter_enable())
65148

66-
def get_remaining_time(self):
149+
def get_remaining_time(self) -> int:
67150
"""
68151
If the watchdog is armed, retrieve the number of seconds remaining on
69152
the watchdog timer
@@ -72,10 +155,8 @@ def get_remaining_time(self):
72155
An integer specifying the number of seconds remaining on the
73156
watchdog timer. If the watchdog is not armed, returns -1.
74157
"""
75-
# Implement platform-specific logic to get remaining time
76-
# For now, just return the timeout if armed
77-
if not self.armed:
158+
if not self.is_armed():
78159
return -1
79-
80-
# In a real implementation, you would calculate the actual remaining time
81-
return self.timeout
160+
161+
countdown_milliseconds = self._read_watchdog_countdown_value_milliseconds()
162+
return int(countdown_milliseconds / 1_000)

0 commit comments

Comments
 (0)