Skip to content

Commit 95f317a

Browse files
[Mellanox] Fix issue: watchdogutil command does not work (#16091)
- Why I did it
  watchdogutil uses the platform API watchdog instance to control and query the watchdog status. The Nvidia watchdog implementation caches the "armed" status in an object member, "WatchdogImplBase.armed". This does not work for the CLI infrastructure, because each CLI invocation creates a new watchdog instance, so the status cached in the previous instance is completely lost. Consider the following commands:
    admin@sonic:~$ sudo watchdogutil arm -s 100   =====> watchdog instance1, armed=True
    Watchdog armed for 100 seconds
    admin@sonic:~$ sudo watchdogutil status       =====> watchdog instance2, armed=False
    Status: Unarmed
    admin@sonic:~$ sudo watchdogutil disarm       =====> watchdog instance3, armed=False
    Failed to disarm Watchdog
- How I did it
  Use sysfs to query the watchdog status.
- How to verify it
  Manual test and unit test.
1 parent d42066c commit 95f317a

File tree

2 files changed

+46
-38
lines changed

2 files changed

+46
-38
lines changed

platform/mellanox/mlnx-platform-api/sonic_platform/watchdog.py

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
import time
2828

2929
from sonic_platform_base.watchdog_base import WatchdogBase
30+
from . import utils
3031

3132
""" ioctl constants """
3233
IO_WRITE = 0x40000000
@@ -80,16 +81,15 @@ def __init__(self, wd_device_path):
8081
super(WatchdogImplBase, self).__init__()
8182

8283
self.watchdog_path = wd_device_path
83-
self.watchdog = self.open_handle()
84-
85-
# Opening a watchdog descriptor starts
86-
# watchdog timer;
87-
# by default it should be stopped
88-
self._disablecard()
89-
self.armed = False
90-
84+
self._watchdog = None
9185
self.timeout = self._gettimeout()
9286

87+
@property
88+
def watchdog(self):
89+
if self._watchdog is None:
90+
self._watchdog = self.open_handle()
91+
return self._watchdog
92+
9393
def open_handle(self):
9494
return os.open(self.watchdog_path, os.O_WRONLY)
9595

@@ -134,21 +134,15 @@ def _gettimeout(self):
134134
@return watchdog timeout
135135
"""
136136

137-
req = array.array('I', [0])
138-
fcntl.ioctl(self.watchdog, WDIOC_GETTIMEOUT, req, True)
139-
140-
return int(req[0])
137+
return utils.read_int_from_file('/run/hw-management/watchdog/main/timeout')
141138

142139
def _gettimeleft(self):
143140
"""
144141
Get time left before watchdog timer expires
145142
@return time left in seconds
146143
"""
147144

148-
req = array.array('I', [0])
149-
fcntl.ioctl(self.watchdog, WDIOC_GETTIMELEFT, req, True)
150-
151-
return int(req[0])
145+
return utils.read_int_from_file('/run/hw-management/watchdog/main/timeleft')
152146

153147
def arm(self, seconds):
154148
"""
@@ -162,11 +156,10 @@ def arm(self, seconds):
162156
try:
163157
if self.timeout != seconds:
164158
self.timeout = self._settimeout(seconds)
165-
if self.armed:
159+
if self.is_armed():
166160
self._keepalive()
167161
else:
168162
self._enablecard()
169-
self.armed = True
170163
ret = self.timeout
171164
except IOError:
172165
pass
@@ -179,10 +172,9 @@ def disarm(self):
179172
"""
180173

181174
disarmed = False
182-
if self.armed:
175+
if self.is_armed():
183176
try:
184177
self._disablecard()
185-
self.armed = False
186178
disarmed = True
187179
except IOError:
188180
pass
@@ -194,7 +186,7 @@ def is_armed(self):
194186
Implements is_armed WatchdogBase API
195187
"""
196188

197-
return self.armed
189+
return utils.read_str_from_file('/run/hw-management/watchdog/main/state') == 'active'
198190

199191
def get_remaining_time(self):
200192
"""
@@ -203,7 +195,7 @@ def get_remaining_time(self):
203195

204196
timeleft = WD_COMMON_ERROR
205197

206-
if self.armed:
198+
if self.is_armed():
207199
try:
208200
timeleft = self._gettimeleft()
209201
except IOError:
@@ -216,13 +208,15 @@ def __del__(self):
216208
Close watchdog
217209
"""
218210

219-
os.close(self.watchdog)
211+
if self._watchdog is not None:
212+
os.close(self._watchdog)
220213

221214

222215
class WatchdogType1(WatchdogImplBase):
223216
"""
224217
Watchdog type 1
225218
"""
219+
TIMESTAMP_FILE = '/tmp/nvidia/watchdog_timestamp'
226220

227221
def arm(self, seconds):
228222
"""
@@ -233,7 +227,8 @@ def arm(self, seconds):
233227
ret = WatchdogImplBase.arm(self, seconds)
234228
# Save the watchdog arm timestamp
235229
# required for get_remaining_time()
236-
self.arm_timestamp = time.time()
230+
os.makedirs('/tmp/nvidia', exist_ok=True)
231+
utils.write_file(self.TIMESTAMP_FILE, str(time.time()))
237232

238233
return ret
239234

@@ -246,8 +241,9 @@ def get_remaining_time(self):
246241

247242
timeleft = WD_COMMON_ERROR
248243

249-
if self.armed:
250-
timeleft = int(self.timeout - (time.time() - self.arm_timestamp))
244+
if self.is_armed():
245+
arm_timestamp = utils.read_float_from_file(self.TIMESTAMP_FILE)
246+
timeleft = int(self.timeout - (time.time() - arm_timestamp))
251247

252248
return timeleft
253249

platform/mellanox/mlnx-platform-api/tests/test_watchdog.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -84,46 +84,58 @@ def test_is_wd_type2(self, mock_exists, test_para):
8484
mock_exists.return_value = test_para
8585
assert is_wd_type2('') is test_para
8686

87+
@mock.patch('sonic_platform.utils.read_str_from_file')
88+
def test_is_armed(self, mock_read):
89+
watchdog = WatchdogType2('watchdog2')
90+
mock_read.return_value = 'inactive'
91+
assert not watchdog.is_armed()
92+
mock_read.return_value = 'active'
93+
assert watchdog.is_armed()
94+
8795
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
8896
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
89-
def test_arm_disarm_watchdog2(self):
97+
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
98+
def test_arm_disarm_watchdog2(self, mock_is_armed):
9099
watchdog = WatchdogType2('watchdog2')
91100
assert watchdog.arm(-1) == -1
92-
assert not watchdog.is_armed()
101+
mock_is_armed.return_value = False
93102
watchdog.arm(10)
94-
assert watchdog.is_armed()
103+
mock_is_armed.return_value = True
95104
watchdog.arm(5)
96-
assert watchdog.is_armed()
97105
watchdog.disarm()
98-
assert not watchdog.is_armed()
99106

100107
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
101108
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
102-
def test_arm_disarm_watchdog1(self):
109+
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
110+
def test_arm_disarm_watchdog1(self, mock_is_armed):
103111
watchdog = WatchdogType1('watchdog1')
104112
assert watchdog.arm(-1) == -1
105-
assert not watchdog.is_armed()
113+
mock_is_armed.return_value = False
106114
watchdog.arm(10)
107-
assert watchdog.is_armed()
115+
mock_is_armed.return_value = True
108116
watchdog.arm(5)
109-
assert watchdog.is_armed()
110117
watchdog.disarm()
111-
assert not watchdog.is_armed()
112118

113119
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
114120
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
115121
@mock.patch('sonic_platform.watchdog.WatchdogImplBase._gettimeleft', mock.MagicMock(return_value=10))
116-
def test_get_remaining_time_watchdog2(self):
122+
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
123+
def test_get_remaining_time_watchdog2(self, mock_is_armed):
117124
watchdog = WatchdogType2('watchdog2')
125+
mock_is_armed.return_value = False
118126
assert watchdog.get_remaining_time() == -1
119127
watchdog.arm(10)
128+
mock_is_armed.return_value = True
120129
assert watchdog.get_remaining_time() == 10
121130

122131
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.open_handle', mock.MagicMock())
123132
@mock.patch('sonic_platform.watchdog.fcntl.ioctl', mock.MagicMock())
124133
@mock.patch('sonic_platform.watchdog.WatchdogImplBase._gettimeleft', mock.MagicMock(return_value=10))
125-
def test_get_remaining_time_watchdog1(self):
134+
@mock.patch('sonic_platform.watchdog.WatchdogImplBase.is_armed')
135+
def test_get_remaining_time_watchdog1(self, mock_is_armed):
126136
watchdog = WatchdogType1('watchdog2')
137+
mock_is_armed.return_value = False
127138
assert watchdog.get_remaining_time() == -1
128139
watchdog.arm(10)
140+
mock_is_armed.return_value = True
129141
assert watchdog.get_remaining_time() > 0

0 commit comments

Comments
 (0)