Skip to content

Commit 0be9f15

Browse files
committed
Add support for liquid cooling leakage detection
Signed-off-by: Yuanzhe, Liu <[email protected]>
1 parent c885695 commit 0be9f15

File tree

3 files changed

+141
-2
lines changed

3 files changed

+141
-2
lines changed

sonic-thermalctld/scripts/thermalctld

Lines changed: 105 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import sys
1111
import threading
1212
import time
1313
from datetime import datetime
14+
import argparse
1415

1516
import sonic_platform
1617
from sonic_py_common import daemon_base, logger
@@ -435,6 +436,91 @@ class FanUpdater(logger.Logger):
435436
])
436437
self.drawer_table.set(drawer_name, fvs)
437438

439+
class LiquidCoolingUpdater(threading.Thread, logger.Logger):
    """
    Background thread that periodically reads the leakage sensors of the chassis
    liquid cooling object and writes each sensor's leak status to the
    LIQUID_COOLING_INFO table in STATE_DB.
    """

    LIQUID_COOLING_INFO_TABLE_NAME = 'LIQUID_COOLING_INFO'

    # Polling period (seconds) used when the caller passes None as the interval
    DEFAULT_UPDATE_INTERVAL = 0.5

    def __init__(self, chassis, liquid_cooling_update_interval = 0.5):
        """
        Constructor for LiquidCoolingUpdater
        :param chassis: Object representing a platform chassis
        :param liquid_cooling_update_interval: Seconds to wait between two polls.
               None selects DEFAULT_UPDATE_INTERVAL.
        """
        threading.Thread.__init__(self)
        logger.Logger.__init__(self)
        self.name = "LiquidCoolingUpdater"
        self.exc = None
        self.task_stopping_event = threading.Event()
        self.chassis = chassis
        self.liquid_cooling = self.chassis.get_liquid_cooling()
        self.leaking_sensors = []

        # A command-line option declared without a default is parsed as None when
        # it is omitted; fall back to the default instead of waiting on None.
        if liquid_cooling_update_interval is None:
            liquid_cooling_update_interval = LiquidCoolingUpdater.DEFAULT_UPDATE_INTERVAL
        self.interval = liquid_cooling_update_interval

        state_db = daemon_base.db_connect("STATE_DB")
        self.table = swsscommon.Table(state_db, LiquidCoolingUpdater.LIQUID_COOLING_INFO_TABLE_NAME)

    def __del__(self):
        """
        Remove every key this updater's table holds when the object is destroyed
        """
        # The attribute is missing if the constructor failed before creating the table
        table = getattr(self, 'table', None)
        if not table:
            return

        for table_key in table.getKeys():
            table._del(table_key)

    def _refresh_leak_status(self):
        """
        Read every leakage sensor once, log leak / recovery transitions and
        write the resulting 'leak_status' ("Yes", "No" or "N/A") to the table
        :return:
        """
        for index, sensor in enumerate(self.liquid_cooling.leakage_sensors, start = 1):
            sensor_name = try_get(sensor.get_name, 'leakage{}'.format(index))
            # Read the status through try_get, like the name above, so that one
            # sensor without a usable reading is reported as "N/A".
            sensor_leak_status = try_get(sensor.is_leak, None)
            status_msg = "N/A"

            if sensor_leak_status is True:
                status_msg = "Yes"
                # Log only on the transition into the leaking state
                if sensor_name not in self.leaking_sensors:
                    self.leaking_sensors.append(sensor_name)
                    self.log_error('Liquid cooling leakage sensor {} reported leaking'.format(sensor_name))
            elif sensor_leak_status is False:
                status_msg = "No"
                # Log only on the transition out of the leaking state
                if sensor_name in self.leaking_sensors:
                    self.leaking_sensors.remove(sensor_name)
                    self.log_notice('Liquid cooling leakage sensor {} recovered from leaking'.format(sensor_name))

            fvs = swsscommon.FieldValuePairs([('leak_status', status_msg)])
            self.table.set(sensor_name, fvs)

    def update(self):
        """
        Run one refresh cycle of the liquid cooling information
        :return:
        """
        self._refresh_leak_status()

    def task_worker(self, stopping_event):
        """
        Update all liquid cooling information to database
        :param stopping_event: Event that ends the polling loop once it is set
        :return:
        """
        while not stopping_event.is_set():
            self.log_debug("Start liquid cooling updating")

            self.update()

            self.log_debug("End liquid cooling updating")

            if self.task_stopping_event.is_set():
                return

            # Waiting on the event instead of sleeping lets a stop request end
            # the pause immediately rather than after a full interval.
            stopping_event.wait(self.interval)

    def join(self, timeout=None):
        """
        Request the thread to stop, wait for it to finish and re-raise any
        exception that terminated the worker
        :param timeout: Optional number of seconds to wait, None waits until the thread ends
        :return:
        """
        self.task_stopping_event.set()
        super().join(timeout)
        if self.exc:
            raise self.exc

    def run(self):
        """
        Thread entry point: run the polling loop and record any exception in
        self.exc so that join() can re-raise it
        :return:
        """
        self.thread_id = threading.current_thread().ident
        if self.task_stopping_event.is_set():
            return
        try:
            self.task_worker(self.task_stopping_event)
        except Exception as e:
            # Log through this instance, as the other methods of this class do
            self.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().name, repr(e)))
            # NOTE(review): helper is not visible in this diff; presumably defined elsewhere in this file - confirm
            log_exception_traceback()
            self.exc = e
            self.task_stopping_event.set()
438524

439525
class TemperatureStatus(logger.Logger):
440526
TEMPERATURE_DIFF_THRESHOLD = 10
@@ -808,7 +894,7 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
808894

809895
POLICY_FILE = '/usr/share/sonic/platform/thermal_policy.json'
810896

811-
def __init__(self):
897+
def __init__(self, enable_liquid_cooling=False, liquid_cooling_update_interval=0.5):
812898
"""
813899
Initializer of ThermalControlDaemon
814900
"""
@@ -840,6 +926,12 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
840926
except Exception as e:
841927
self.log_error('Caught exception while initializing thermal manager - {}'.format(repr(e)))
842928

929+
self.liquid_cooling_updater = LiquidCoolingUpdater(self.chassis, liquid_cooling_update_interval) if enable_liquid_cooling else None
930+
931+
if self.liquid_cooling_updater is not None:
932+
self.liquid_cooling_updater.start()
933+
self.log_notice("Started thread for liquid cooling updater")
934+
843935
def deinit(self):
844936
"""
845937
Deinitializer of ThermalControlDaemon
@@ -852,6 +944,11 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
852944

853945
self.thermal_monitor.task_stop()
854946

947+
if self.liquid_cooling_updater is not None:
948+
if self.liquid_cooling_updater.is_alive():
949+
self.liquid_cooling_updater.join()
950+
self.log_notice("Joined thread for liquid cooling updater")
951+
855952
# Override signal handler from DaemonBase
856953
def signal_handler(self, sig, frame):
857954
"""
@@ -912,7 +1009,13 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
9121009
# Main =========================================================================
9131010
#
9141011
def main():
915-
thermal_control = ThermalControlDaemon()
1012+
parser = argparse.ArgumentParser()
1013+
parser.add_argument('--enable_liquid_cooling', action='store_true')
1014+
parser.add_argument('--liquid_cooling_update_interval', type=float)
1015+
1016+
args = parser.parse_args()
1017+
1018+
thermal_control = ThermalControlDaemon(args.enable_liquid_cooling, args.liquid_cooling_update_interval)
9161019

9171020
thermal_control.log_info("Starting up...")
9181021

sonic-thermalctld/tests/mock_platform.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from sonic_platform_base import psu_base
66
from sonic_platform_base import sfp_base
77
from sonic_platform_base import thermal_base
8+
from sonic_platform_base import liquid_cooling_base
89
from sonic_platform_base.sonic_thermal_control import thermal_manager_base
910

1011

@@ -188,6 +189,24 @@ def get_position_in_parent(self):
188189
def is_replaceable(self):
189190
return self._replaceable
190191

192+
class MockLiquidCoolingSensor(liquid_cooling_base.LeakageSensorBase):
    """
    Leakage sensor test double that carries fixed device attributes
    """

    def __init__(self):
        super().__init__()

        # Identification attributes
        self._name = None
        self._model = 'Liquid Cooling Sensor Model'
        self._serial = 'Liquid Cooling Sensor Serial'

        # State and placement attributes
        self._presence = True
        self._status = True
        self._position_in_parent = 1
        self._replaceable = True
202+
203+
class MockLiquidCooling(liquid_cooling_base.LiquidCoolingBase):
    """
    Liquid cooling test double that carries fixed device attributes
    """

    def __init__(self):
        # Base constructor arguments are kept exactly as (1, [])
        super().__init__(1, [])

        self._name = None
        self._model = 'Liquid Cooling Model'
        self._serial = 'Liquid Cooling Serial'
        self._presence = True
191210

192211
class MockSfp(sfp_base.SfpBase):
193212
def __init__(self):
@@ -407,6 +426,9 @@ def make_module_thermal(self):
407426
module._fan_list.append(fan)
408427
module._thermal_list.append(MockThermal())
409428

429+
def get_liquid_cooling(self):
    """
    Build and return a new MockLiquidCooling object on every call
    """
    liquid_cooling = MockLiquidCooling()
    return liquid_cooling
431+
410432
def is_modular_chassis(self):
411433
return self._is_chassis_system
412434

sonic-thermalctld/tests/test_thermalctld.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,18 @@ def test_update_module_fans(self):
288288
else:
289289
fan_updater.log_warning.assert_called_with("Failed to update module fan status - Exception('Test message',)")
290290

291+
class TestLiquidCoolingUpdater(object):
    """
    Test cases to cover functionality in LiquidCoolingUpdater class
    """

    def test_update(self):
        # update() is expected to trigger exactly one leak status refresh
        updater = thermalctld.LiquidCoolingUpdater(MockChassis(), 0.5)
        refresh_stub = mock.MagicMock()
        updater._refresh_leak_status = refresh_stub

        updater.update()

        assert refresh_stub.call_count == 1
301+
302+
291303
class TestThermalMonitor(object):
292304
"""
293305
Test cases to cover functionality in ThermalMonitor class
@@ -794,6 +806,8 @@ def test_update_entity_info():
794806
def test_main(mock_run):
795807
mock_run.return_value = False
796808

809+
sys.argv = ['thermalctld']
810+
797811
ret = thermalctld.main()
798812
assert mock_run.call_count == 1
799813
assert ret != 0

0 commit comments

Comments
 (0)