Skip to content

Commit da7cc34

Browse files
committed
Add support for liquid cooling leakage detection
Signed-off-by: Yuanzhe, Liu <[email protected]>
1 parent 011d949 commit da7cc34

File tree

3 files changed

+141
-1
lines changed

3 files changed

+141
-1
lines changed

sonic-thermalctld/scripts/thermalctld

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import sys
1212
import threading
1313
import time
1414
from datetime import datetime
15+
import argparse
1516

1617
import sonic_platform
1718
from sonic_py_common import daemon_base, logger
@@ -437,6 +438,91 @@ class FanUpdater(logger.Logger):
437438
])
438439
self.drawer_table.set(drawer_name, fvs)
439440

441+
class LiquidCoolingUpdater(threading.Thread, logger.Logger):
    """
    Background thread that periodically polls the leakage sensors of the
    chassis liquid cooling object and writes each sensor's leak status
    ("Yes" / "No" / "N/A") to the LIQUID_COOLING_INFO table in STATE_DB.
    """

    LIQUID_COOLING_INFO_TABLE_NAME = 'LIQUID_COOLING_INFO'
    # Polling period (seconds) used when the caller passes None as interval
    DEFAULT_UPDATE_INTERVAL = 0.5

    def __init__(self, chassis, liquid_cooling_update_interval = 0.5):
        """
        Constructor for LiquidCoolingUpdater
        :param chassis: Object representing a platform chassis
        :param liquid_cooling_update_interval: Seconds to wait between two
               sensor polls. None selects DEFAULT_UPDATE_INTERVAL.
        """
        threading.Thread.__init__(self)
        logger.Logger.__init__(self)
        self.name = "LiquidCoolingUpdater"
        self.exc = None
        self.task_stopping_event = threading.Event()
        self.chassis = chassis
        self.liquid_cooling = self.chassis.get_liquid_cooling()
        # Names of the sensors currently reporting a leak; used to log only
        # on leak / recovery transitions instead of on every poll
        self.leaking_sensors = []

        # A None interval (e.g. an unset command line option forwarded as-is)
        # would make the wait in task_worker() fail or block forever
        if liquid_cooling_update_interval is None:
            liquid_cooling_update_interval = LiquidCoolingUpdater.DEFAULT_UPDATE_INTERVAL
        self.interval = liquid_cooling_update_interval

        state_db = daemon_base.db_connect("STATE_DB")
        self.table = swsscommon.Table(state_db, LiquidCoolingUpdater.LIQUID_COOLING_INFO_TABLE_NAME)

    def __del__(self):
        """
        Remove every key of the liquid cooling table when the updater is
        destroyed.
        """
        # __del__ also runs when __init__ raised before self.table was set
        table = getattr(self, 'table', None)
        if not table:
            return

        for tk in table.getKeys():
            table._del(tk)

    def _refresh_leak_status(self):
        """
        Read every leakage sensor once, log leak / recovery transitions and
        write the resulting 'leak_status' field to the table.
        :return:
        """
        for index, sensor in enumerate(self.liquid_cooling.leakage_sensors, start = 1):
            sensor_name = try_get(sensor.get_name, 'leakage{}'.format(index))
            sensor_leak_status = sensor.is_leak()
            # Anything other than an explicit True / False is published as N/A
            status_msg = "N/A"

            if sensor_leak_status is True:
                status_msg = "Yes"
                if sensor_name not in self.leaking_sensors:
                    self.leaking_sensors.append(sensor_name)
                    self.log_error('Liquid cooling leakage sensor {} reported leaking'.format(sensor_name))
            elif sensor_leak_status is False:
                status_msg = "No"
                if sensor_name in self.leaking_sensors:
                    self.leaking_sensors.remove(sensor_name)
                    self.log_notice('Liquid cooling leakage sensor {} recovered from leaking'.format(sensor_name))

            fvs = swsscommon.FieldValuePairs([('leak_status', status_msg)])
            self.table.set(sensor_name, fvs)

    def update(self):
        """
        Run one polling cycle.
        :return:
        """
        self._refresh_leak_status()

    def task_worker(self, stopping_event):
        """
        Update all liquid cooling information to database
        :param stopping_event: Event that ends the polling loop once set
        :return:
        """
        while not stopping_event.is_set():
            self.log_debug("Start liquid cooling updating")

            self.update()

            self.log_debug("End liquid cooling updating")

            if self.task_stopping_event.is_set():
                return

            # Wait on the event rather than sleeping so that a stop request
            # interrupts the pause instead of waiting out the full interval
            stopping_event.wait(self.interval)

    def join(self, timeout=None):
        """
        Request the worker loop to stop, wait for the thread to finish and
        re-raise any exception captured by run().
        :param timeout: Optional number of seconds to wait for the thread,
               forwarded to threading.Thread.join()
        """
        self.task_stopping_event.set()
        # Thread.join() takes a timeout, not the thread object itself
        super().join(timeout)
        if self.exc:
            raise self.exc

    def run(self):
        """
        Thread entry point: run task_worker() until stopped. Any exception
        is logged, stored in self.exc (re-raised by join()) and the stopping
        event is set.
        """
        self.thread_id = threading.current_thread().ident
        if self.task_stopping_event.is_set():
            return
        try:
            self.task_worker(self.task_stopping_event)
        except Exception as e:
            # Log through the Logger methods this class inherits, as the
            # other methods of this class do
            self.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().name, repr(e)))
            log_exception_traceback()
            self.exc = e
            self.task_stopping_event.set()
440526

441527
class TemperatureStatus(logger.Logger):
442528
TEMPERATURE_DIFF_THRESHOLD = 10
@@ -815,6 +901,8 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
815901
thermal_monitor_initial_interval,
816902
thermal_monitor_update_interval,
817903
thermal_monitor_update_elapsed_threshold,
904+
enable_liquid_cooling=False,
905+
liquid_cooling_update_interval=0.5
818906
):
819907
"""
820908
Initializer of ThermalControlDaemon
@@ -856,6 +944,12 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
856944
except Exception as e:
857945
self.log_error('Caught exception while initializing thermal manager - {}'.format(repr(e)))
858946

947+
self.liquid_cooling_updater = LiquidCoolingUpdater(self.chassis, liquid_cooling_update_interval) if enable_liquid_cooling else None
948+
949+
if self.liquid_cooling_updater is not None:
950+
self.liquid_cooling_updater.start()
951+
self.log_notice("Started thread for liquid cooling updater")
952+
859953
def deinit(self):
860954
"""
861955
Deinitializer of ThermalControlDaemon
@@ -868,6 +962,11 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
868962

869963
self.thermal_monitor.task_stop()
870964

965+
if self.liquid_cooling_updater is not None:
966+
if self.liquid_cooling_updater.is_alive():
967+
self.liquid_cooling_updater.join()
968+
self.log_notice("Joined thread for liquid cooling updater")
969+
871970
# Override signal handler from DaemonBase
872971
def signal_handler(self, sig, frame):
873972
"""
@@ -932,12 +1031,17 @@ def main():
9321031
parser.add_argument('--thermal-monitor-initial-interval', type=int, default=5)
9331032
parser.add_argument('--thermal-monitor-update-interval', type=int, default=60)
9341033
parser.add_argument('--thermal-monitor-update-elapsed-threshold', type=int, default=30)
1034+
parser.add_argument('--enable_liquid_cooling', action='store_true')
1035+
parser.add_argument('--liquid_cooling_update_interval', type=float)
1036+
9351037
args = parser.parse_args()
9361038

9371039
thermal_control = ThermalControlDaemon(
9381040
args.thermal_monitor_initial_interval,
9391041
args.thermal_monitor_update_interval,
940-
args.thermal_monitor_update_elapsed_threshold
1042+
args.thermal_monitor_update_elapsed_threshold,
1043+
args.enable_liquid_cooling,
1044+
args.liquid_cooling_update_interval
9411045
)
9421046

9431047
thermal_control.log_info("Starting up...")

sonic-thermalctld/tests/mock_platform.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from sonic_platform_base import psu_base
66
from sonic_platform_base import sfp_base
77
from sonic_platform_base import thermal_base
8+
from sonic_platform_base import liquid_cooling_base
89
from sonic_platform_base.sonic_thermal_control import thermal_manager_base
910

1011

@@ -188,6 +189,24 @@ def get_position_in_parent(self):
188189
def is_replaceable(self):
189190
return self._replaceable
190191

192+
class MockLiquidCoolingSensor(liquid_cooling_base.LeakageSensorBase):
    """Leakage sensor test double with fixed, hard-coded device attributes."""

    def __init__(self):
        super(MockLiquidCoolingSensor, self).__init__()
        # Identification fields
        self._name = None
        self._model = 'Liquid Cooling Sensor Model'
        self._serial = 'Liquid Cooling Sensor Serial'
        # State flags: present, healthy and replaceable
        self._presence = self._status = self._replaceable = True
        self._position_in_parent = 1
202+
203+
class MockLiquidCooling(liquid_cooling_base.LiquidCoolingBase):
    """Liquid cooling test double with fixed, hard-coded device attributes."""

    def __init__(self):
        # Base constructor arguments are passed through unchanged: 1 and an
        # empty list
        super(MockLiquidCooling, self).__init__(1, [])
        self._model = 'Liquid Cooling Model'
        self._serial = 'Liquid Cooling Serial'
        self._presence = True
        self._name = None
191210

192211
class MockSfp(sfp_base.SfpBase):
193212
def __init__(self):
@@ -407,6 +426,9 @@ def make_module_thermal(self):
407426
module._fan_list.append(fan)
408427
module._thermal_list.append(MockThermal())
409428

429+
def get_liquid_cooling(self):
    """Return a newly constructed MockLiquidCooling on every call."""
    liquid_cooling = MockLiquidCooling()
    return liquid_cooling
431+
410432
def is_modular_chassis(self):
411433
return self._is_chassis_system
412434

sonic-thermalctld/tests/test_thermalctld.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,18 @@ def test_update_module_fans(self):
288288
else:
289289
fan_updater.log_warning.assert_called_with("Failed to update module fan status - Exception('Test message',)")
290290

291+
class TestLiquidCoolingUpdater(object):
    """
    Test cases to cover functionality in LiquidCoolingUpdater class
    """

    def test_update(self):
        """update() must trigger exactly one leak status refresh."""
        updater = thermalctld.LiquidCoolingUpdater(MockChassis(), 0.5)

        # Replace the refresh step with a stub so only the call is observed
        refresh_stub = mock.MagicMock()
        updater._refresh_leak_status = refresh_stub

        updater.update()

        assert refresh_stub.call_count == 1
301+
302+
291303
class TestThermalMonitor(object):
292304
"""
293305
Test cases to cover functionality in ThermalMonitor class
@@ -794,6 +806,8 @@ def test_update_entity_info():
794806
def test_main(mock_run):
795807
mock_run.return_value = False
796808

809+
sys.argv = ['thermalctld']
810+
797811
ret = thermalctld.main()
798812
assert mock_run.call_count == 1
799813
assert ret != 0

0 commit comments

Comments
 (0)