Skip to content

Commit 6e975f5

Browse files
[thermalctld] add FAN led management in thermal control daemon (sonic-net#54)
1 parent f1409e0 commit 6e975f5

File tree

3 files changed

+190
-20
lines changed

3 files changed

+190
-20
lines changed

sonic-thermalctld/scripts/thermalctld

Lines changed: 101 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -64,27 +64,61 @@ def log_on_status_changed(normal_status, normal_log, abnormal_log):
6464

6565

6666
class FanStatus(object):
67-
def __init__(self):
67+
# Class-level state shared by every FanStatus instance.
# The two counters tally bad fans seen since the last reset_fan_counter() call;
# set_presence() / set_fault_status() increment them.
absence_fan_count = 0
fault_fan_count = 0
# Starts as True and is set back to True after a fan LED has been programmed;
# FanUpdater._update_led_color() refreshes the cached 'led_status' and clears it.
update_led_color = True
70+
71+
def __init__(self, fan=None, is_psu_fan=False):
6872
"""
6973
Constructor of FanStatus
7074
"""
75+
self.fan = fan
76+
self.is_psu_fan = is_psu_fan
7177
self.presence = True
78+
self.status = True
7279
self.under_speed = False
7380
self.over_speed = False
7481
self.invalid_direction = False
7582

83+
@classmethod
84+
def get_bad_fan_count(cls):
85+
return cls.absence_fan_count + cls.fault_fan_count
86+
87+
@classmethod
88+
def reset_fan_counter(cls):
89+
cls.absence_fan_count = 0
90+
cls.fault_fan_count = 0
91+
7692
def set_presence(self, presence):
7793
"""
7894
Set and cache Fan presence status
7995
:param presence: Fan presence status
8096
:return: True if status changed else False
8197
"""
98+
if not presence and not self.is_psu_fan:
99+
FanStatus.absence_fan_count += 1
100+
82101
if presence == self.presence:
83102
return False
84103

85104
self.presence = presence
86105
return True
87106

107+
def set_fault_status(self, status):
108+
"""
109+
Set and cache Fan fault status
110+
:param status: Fan fault status, False indicate Fault
111+
:return: True if status changed else False
112+
"""
113+
if not status:
114+
FanStatus.fault_fan_count += 1
115+
116+
if status == self.status:
117+
return False
118+
119+
self.status = status
120+
return True
121+
88122
def _check_speed_value_available(self, speed, target_speed, tolerance, current_status):
89123
if speed == NOT_AVAILABLE or target_speed == NOT_AVAILABLE or tolerance == NOT_AVAILABLE:
90124
if tolerance > 100 or tolerance < 0:
@@ -142,7 +176,11 @@ class FanStatus(object):
142176
Indicate the Fan works as expect
143177
:return: True if Fan works normal else False
144178
"""
145-
return self.presence and not self.under_speed and not self.over_speed and not self.invalid_direction
179+
return self.presence and \
180+
self.status and \
181+
not self.under_speed and \
182+
not self.over_speed and \
183+
not self.invalid_direction
146184

147185

148186
#
@@ -176,33 +214,51 @@ class FanUpdater(object):
176214
:return:
177215
"""
178216
logger.log_debug("Start fan updating")
179-
for index, fan in enumerate(self.chassis.get_all_fans()):
180-
try:
181-
self._refresh_fan_status(fan, index)
182-
except Exception as e:
183-
logger.log_warning('Failed to update FAN status - {}'.format(e))
217+
old_bad_fan_count = FanStatus.get_bad_fan_count()
218+
FanStatus.reset_fan_counter()
219+
220+
fan_index = 0
221+
for drawer in self.chassis.get_all_fan_drawers():
222+
for fan in drawer.get_all_fans():
223+
try:
224+
self._refresh_fan_status(drawer, fan, fan_index)
225+
except Exception as e:
226+
logger.log_warning('Failed to update FAN status - {}'.format(e))
227+
fan_index += 1
184228

185229
for psu_index, psu in enumerate(self.chassis.get_all_psus()):
186230
psu_name = try_get(psu.get_name, 'PSU {}'.format(psu_index))
187231
for fan_index, fan in enumerate(psu.get_all_fans()):
188232
try:
189-
self._refresh_fan_status(fan, fan_index, '{} FAN'.format(psu_name))
233+
self._refresh_fan_status(None, fan, fan_index, '{} FAN'.format(psu_name), True)
190234
except Exception as e:
191235
logger.log_warning('Failed to update PSU FAN status - {}'.format(e))
192236

237+
self._update_led_color()
238+
239+
bad_fan_count = FanStatus.get_bad_fan_count()
240+
if bad_fan_count > 0 and old_bad_fan_count != bad_fan_count:
241+
logger.log_warning("Insufficient number of working fans warning: {} fans are not working.".format(
242+
bad_fan_count
243+
))
244+
elif old_bad_fan_count > 0 and bad_fan_count == 0:
245+
logger.log_notice("Insufficient number of working fans warning cleared: all fans are back to normal.")
246+
193247
logger.log_debug("End fan updating")
194248

195-
def _refresh_fan_status(self, fan, index, name_prefix='FAN'):
249+
def _refresh_fan_status(self, fan_drawer, fan, index, name_prefix='FAN', is_psu_fan=False):
196250
"""
197251
Get Fan status by platform API and write to database for a given Fan
252+
:param fan_drawer: Object representing a platform Fan drawer
198253
:param fan: Object representing a platform Fan
199254
:param index: Index of the Fan object in the platform
200255
:param name_prefix: name prefix of Fan object if Fan.get_name not presented
201256
:return:
202257
"""
258+
drawer_name = NOT_AVAILABLE if is_psu_fan else str(try_get(fan_drawer.get_name))
203259
fan_name = try_get(fan.get_name, '{} {}'.format(name_prefix, index + 1))
204260
if fan_name not in self.fan_status_dict:
205-
self.fan_status_dict[fan_name] = FanStatus()
261+
self.fan_status_dict[fan_name] = FanStatus(fan, is_psu_fan)
206262

207263
fan_status = self.fan_status_dict[fan_name]
208264

@@ -228,45 +284,55 @@ class FanUpdater(object):
228284
'the system, potential overheat hazard'.format(fan_name)
229285
)
230286

287+
if presence and fan_status.set_fault_status(fan_fault_status):
288+
set_led = True
289+
log_on_status_changed(fan_status.status,
290+
'Fan fault warning cleared: {} is back to normal.'.format(fan_name),
291+
'Fan fault warning: {} is broken.'.format(fan_name)
292+
)
293+
231294
if presence and fan_status.set_under_speed(speed, speed_target, speed_tolerance):
232295
set_led = True
233296
log_on_status_changed(not fan_status.under_speed,
234-
'Fan under speed warning cleared: {} speed back to normal.'.format(fan_name),
235-
'Fan under speed warning: {} current speed={}, target speed={}, tolerance={}.'.
297+
'Fan low speed warning cleared: {} speed is back to normal.'.format(fan_name),
298+
'Fan low speed warning: {} current speed={}, target speed={}, tolerance={}.'.
236299
format(fan_name, speed, speed_target, speed_tolerance)
237300
)
238301

239302
if presence and fan_status.set_over_speed(speed, speed_target, speed_tolerance):
240303
set_led = True
241304
log_on_status_changed(not fan_status.over_speed,
242-
'Fan over speed warning cleared: {} speed back to normal.'.format(fan_name),
243-
'Fan over speed warning: {} target speed={}, current speed={}, tolerance={}.'.
305+
'Fan high speed warning cleared: {} speed is back to normal.'.format(fan_name),
306+
'Fan high speed warning: {} target speed={}, current speed={}, tolerance={}.'.
244307
format(fan_name, speed_target, speed, speed_tolerance)
245308
)
246309

247310
# TODO: handle invalid fan direction
248311

249-
if set_led:
250-
self._set_fan_led(fan, fan_name, fan_status)
312+
# We don't set PSU led here, PSU led will be handled in psud
313+
if set_led and not is_psu_fan:
314+
self._set_fan_led(fan_drawer, fan, fan_name, fan_status)
315+
FanStatus.update_led_color = True
251316

252317
fvs = swsscommon.FieldValuePairs(
253318
[('presence', str(presence)),
319+
('drawer_name', drawer_name),
254320
('model', str(try_get(fan.get_model))),
255321
('serial', str(try_get(fan.get_serial))),
256322
('status', str(fan_fault_status)),
257323
('direction', str(fan_direction)),
258324
('speed', str(speed)),
259325
('speed_tolerance', str(speed_tolerance)),
260326
('speed_target', str(speed_target)),
261-
('led_status', str(try_get(fan.get_status_led))),
262327
('timestamp', datetime.now().strftime('%Y%m%d %H:%M:%S'))
263328
])
264329

265330
self.table.set(fan_name, fvs)
266331

267-
def _set_fan_led(self, fan, fan_name, fan_status):
332+
def _set_fan_led(self, fan_drawer, fan, fan_name, fan_status):
268333
"""
269334
Set fan led according to current status
335+
:param fan_drawer: Object representing a platform Fan drawer or PSU
270336
:param fan: Object representing a platform Fan
271337
:param fan_name: Name of the Fan object in case any vendor not implement Fan.get_name
272338
:param fan_status: Object representing the FanStatus
@@ -275,13 +341,30 @@ class FanUpdater(object):
275341
try:
276342
if fan_status.is_ok():
277343
fan.set_status_led(fan.STATUS_LED_COLOR_GREEN)
344+
fan_drawer.set_status_led(fan.STATUS_LED_COLOR_GREEN)
278345
else:
279346
# TODO: wait for Kebo to define the mapping of fan status to led color,
280347
# just set it to red so far
281348
fan.set_status_led(fan.STATUS_LED_COLOR_RED)
349+
fan_drawer.set_status_led(fan.STATUS_LED_COLOR_RED)
282350
except NotImplementedError as e:
283351
logger.log_warning('Failed to set led to fan, set_status_led not implemented')
284352

353+
def _update_led_color(self):
    """
    Write the current 'led_status' of every tracked fan to the database table.
    Does nothing unless FanStatus.update_led_color is set; the flag is cleared
    once all fans in fan_status_dict have been written.
    :return:
    """
    if not FanStatus.update_led_color:
        return

    for fan_name, fan_status in self.fan_status_dict.items():
        try:
            fvs = swsscommon.FieldValuePairs([
                ('led_status', str(try_get(fan_status.fan.get_status_led)))
            ])
        except Exception as e:
            # e.g. fan_status.fan is None, so the attribute lookup itself raises.
            # Name the fan and the cause so the warning is actionable, then
            # fall back to publishing NOT_AVAILABLE for this fan.
            logger.log_warning('Failed to get led status for fan {} - {}'.format(fan_name, repr(e)))
            fvs = swsscommon.FieldValuePairs([
                ('led_status', NOT_AVAILABLE)
            ])
        self.table.set(fan_name, fvs)

    FanStatus.update_led_color = False
367+
285368

286369
class TemperatureStatus(object):
287370
TEMPERATURE_DIFF_THRESHOLD = 10

sonic-thermalctld/tests/mock_platform.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,25 @@ def get_all_fans(self):
8181
return self.fan_list
8282

8383

84+
class MockFanDrawer(MockDevice):
    """Test double for a platform fan drawer: a named holder of fans with a status LED."""

    def __init__(self):
        self.name = 'FanDrawer'
        # Fans are attached by the test fixtures after construction
        self.fan_list = []
        # LED colour reported until set_status_led() stores another value
        self.led_status = 'red'

    def get_name(self):
        """Return the drawer name."""
        return self.name

    def get_all_fans(self):
        """Return the list of fans held by this drawer."""
        return self.fan_list

    def get_status_led(self):
        """Return the last LED value stored on this drawer."""
        return self.led_status

    def set_status_led(self, value):
        """Store the given LED value so get_status_led() reports it."""
        self.led_status = value
101+
102+
84103
class MockThermal:
85104
def __init__(self):
86105
self.name = None
@@ -134,6 +153,7 @@ def __init__(self):
134153
self.fan_list = []
135154
self.psu_list = []
136155
self.thermal_list = []
156+
self.fan_drawer_list = []
137157

138158
def get_all_fans(self):
139159
return self.fan_list
@@ -144,24 +164,47 @@ def get_all_psus(self):
144164
def get_all_thermals(self):
145165
return self.thermal_list
146166

167+
def get_all_fan_drawers(self):
    """Return the list of mock fan drawers registered on this chassis."""
    drawers = self.fan_drawer_list
    return drawers
169+
147170
def _register_fan_in_new_drawer(self, fan):
    """
    Put a fan into its own MockFanDrawer and register both on the chassis.
    Shared by the make_*_fan fixtures so the wiring lives in one place.
    :param fan: Mock fan object to register
    :return:
    """
    fan_drawer = MockFanDrawer()
    fan_drawer.fan_list.append(fan)
    self.fan_list.append(fan)
    self.fan_drawer_list.append(fan_drawer)

def make_absence_fan(self):
    """Add a fan, in its own drawer, that reports itself as not present."""
    fan = MockFan()
    fan.presence = False
    self._register_fan_in_new_drawer(fan)

def make_fault_fan(self):
    """Add a fan, in its own drawer, whose status flag is False."""
    fan = MockFan()
    fan.status = False
    self._register_fan_in_new_drawer(fan)

def make_under_speed_fan(self):
    """Add a fan, in its own drawer, after calling its make_under_speed()."""
    fan = MockFan()
    fan.make_under_speed()
    self._register_fan_in_new_drawer(fan)

def make_over_speed_fan(self):
    """Add a fan, in its own drawer, after calling its make_over_speed()."""
    fan = MockFan()
    fan.make_over_speed()
    self._register_fan_in_new_drawer(fan)

def make_error_fan(self):
    """Add a MockErrorFan, in its own drawer."""
    fan = MockErrorFan()
    self._register_fan_in_new_drawer(fan)
165208

166209
def make_over_temper_thermal(self):
167210
thermal = MockThermal()

sonic-thermalctld/tests/test_thermalctld.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,12 +95,27 @@ def test_fanupdater_fan_absence():
9595
fan_updater.update()
9696
fan_list = chassis.get_all_fans()
9797
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
98-
logger.log_warning.assert_called_once()
98+
logger.log_warning.assert_called()
9999

100100
fan_list[0].presence = True
101101
fan_updater.update()
102102
assert fan_list[0].get_status_led() == MockFan.STATUS_LED_COLOR_GREEN
103-
logger.log_notice.assert_called_once()
103+
logger.log_notice.assert_called()
104+
105+
106+
def test_fanupdater_fan_fault():
    """A fan with a False status gets a red LED and a warning; once it recovers, a green LED and a notice."""
    mock_chassis = MockChassis()
    mock_chassis.make_fault_fan()
    updater = FanUpdater(mock_chassis)

    # First pass: the only fan on the chassis is the faulty one
    updater.update()
    fans = mock_chassis.get_all_fans()
    assert fans[0].get_status_led() == MockFan.STATUS_LED_COLOR_RED
    logger.log_warning.assert_called()

    # Second pass: clear the fault and expect the LED to follow
    fans[0].status = True
    updater.update()
    assert fans[0].get_status_led() == MockFan.STATUS_LED_COLOR_GREEN
    logger.log_notice.assert_called()
104119

105120

106121
def test_fanupdater_fan_under_speed():
@@ -133,6 +148,35 @@ def test_fanupdater_fan_over_speed():
133148
logger.log_notice.assert_called_once()
134149

135150

151+
def test_insufficient_fan_number():
    """Check the bad-fan counters on FanStatus and the 'insufficient fans' log messages from FanUpdater."""
    # Part 1: the class-level counters, exercised directly
    absent_status = FanStatus()
    faulty_status = FanStatus()
    absent_status.set_presence(False)
    faulty_status.set_fault_status(False)
    assert FanStatus.get_bad_fan_count() == 2
    FanStatus.reset_fan_counter()
    assert FanStatus.get_bad_fan_count() == 0

    # Part 2: one absent fan plus one faulty fan on a chassis
    mock_chassis = MockChassis()
    mock_chassis.make_absence_fan()
    mock_chassis.make_fault_fan()
    updater = FanUpdater(mock_chassis)
    updater.update()
    # presumably one warning per bad fan plus the summary warning - verify against the logger fixture
    assert logger.log_warning.call_count == 3
    logger.log_warning.assert_called_with('Insufficient number of working fans warning: 2 fans are not working.')

    # Bring the absent fan back: one bad fan remains
    fans = mock_chassis.get_all_fans()
    fans[0].presence = True
    updater.update()
    assert logger.log_notice.call_count == 1
    logger.log_warning.assert_called_with('Insufficient number of working fans warning: 1 fans are not working.')

    # Clear the fault on the second fan: no bad fans remain
    fans[1].status = True
    updater.update()
    assert logger.log_notice.call_count == 3
    logger.log_notice.assert_called_with('Insufficient number of working fans warning cleared: all fans are back to normal.')
178+
179+
136180
def test_temperature_status_set_over_temper():
137181
temperatue_status = TemperatureStatus()
138182
ret = temperatue_status.set_over_temperature(NOT_AVAILABLE, NOT_AVAILABLE)

0 commit comments

Comments
 (0)