Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 105 additions & 1 deletion sonic-thermalctld/scripts/thermalctld
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import sys
import threading
import time
from datetime import datetime
import argparse

import sonic_platform
from sonic_py_common import daemon_base, logger
Expand Down Expand Up @@ -437,6 +438,91 @@ class FanUpdater(logger.Logger):
])
self.drawer_table.set(drawer_name, fvs)

class LiquidCoolingUpdater(threading.Thread, logger.Logger):

LIQUID_COOLING_INFO_TABLE_NAME = 'LIQUID_COOLING_INFO'

def __init__(self, chassis, liquid_cooling_update_interval):
"""
Constructor for LiquidCoolingUpdater
:param chassis: Object representing a platform chassis
"""
threading.Thread.__init__(self)
logger.Logger.__init__(self)
self.name = "LiquidCoolingUpdater"
self.exc = None
self.task_stopping_event = threading.Event()
self.chassis = chassis
self.liquid_cooling = self.chassis.get_liquid_cooling()
self.leaking_sensors = []
self.interval = liquid_cooling_update_interval

state_db = daemon_base.db_connect("STATE_DB")
self.table = swsscommon.Table(state_db, LiquidCoolingUpdater.LIQUID_COOLING_INFO_TABLE_NAME)

def __del__(self):
if self.table:
table_keys = self.table.getKeys()
for tk in table_keys:
self.table._del(tk)

def _refresh_leak_status(self):
for index, sensor in enumerate(self.liquid_cooling.leakage_sensors, start = 1):
sensor_name = try_get(sensor.get_name, 'leakage{}'.format(index))
sensor_leak_status = sensor.is_leak()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How is the failure condition handled ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the is_leak will simply return the sensor object status, which is a string as an attribute of the sensor object, this operation can not failed, no exception can happen

status_msg = "N/A"

if sensor_leak_status is True:
status_msg = "Yes"
if sensor_name not in self.leaking_sensors:
self.leaking_sensors.append(sensor_name)
self.log_error('Liquid cooling leakage sensor {} reported leaking'.format(sensor_name))
elif sensor_leak_status is False:
status_msg = "No"
if sensor_name in self.leaking_sensors:
self.leaking_sensors.remove(sensor_name)
self.log_notice('Liquid cooling leakage sensor {} recovered from leaking'.format(sensor_name))

fvs = swsscommon.FieldValuePairs([('leak_status', status_msg)])
self.table.set(sensor_name, fvs)

def update(self):
self._refresh_leak_status()

def task_worker(self, stopping_event):
"""
Update all liquid cooling information to database
:return:
"""
while not stopping_event.is_set():
self.log_debug("Start liquid cooling updating")

self.update()

self.log_debug("End liquid cooling updating")

if self.task_stopping_event.is_set():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we check if there is task_stopping_event.is_set() before calling update ?

Copy link
Contributor Author

@yuazhe yuazhe Nov 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand, I check this flag in line 497, only it is not set, will the update be executed.

return

time.sleep(self.interval)

def join(self):
self.task_stopping_event.set()
super().join()
if self.exc:
raise self.exc

def run(self):
self.thread_id = threading.current_thread().ident
if self.task_stopping_event.is_set():
return
try:
self.task_worker(self.task_stopping_event)
except Exception as e:
logger.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
log_exception_traceback()
self.exc = e
self.task_stopping_event.set()

class TemperatureStatus(logger.Logger):
TEMPERATURE_DIFF_THRESHOLD = 10
Expand Down Expand Up @@ -815,6 +901,8 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
thermal_monitor_initial_interval,
thermal_monitor_update_interval,
thermal_monitor_update_elapsed_threshold,
enable_liquid_cooling,
liquid_cooling_update_interval
):
"""
Initializer of ThermalControlDaemon
Expand Down Expand Up @@ -856,6 +944,12 @@ class ThermalControlDaemon(daemon_base.DaemonBase):
except Exception as e:
self.log_error('Caught exception while initializing thermal manager - {}'.format(repr(e)))

self.liquid_cooling_updater = LiquidCoolingUpdater(self.chassis, liquid_cooling_update_interval) if enable_liquid_cooling else None

if self.liquid_cooling_updater is not None:
self.liquid_cooling_updater.start()
self.log_notice("Started thread for liquid cooling updater")

def deinit(self):
"""
Deinitializer of ThermalControlDaemon
Expand All @@ -868,6 +962,11 @@ class ThermalControlDaemon(daemon_base.DaemonBase):

self.thermal_monitor.task_stop()

if self.liquid_cooling_updater is not None:
if self.liquid_cooling_updater.is_alive():
self.liquid_cooling_updater.join()
self.log_notice("Joined thread for liquid cooling updater")

# Override signal handler from DaemonBase
def signal_handler(self, sig, frame):
"""
Expand Down Expand Up @@ -932,12 +1031,17 @@ def main():
parser.add_argument('--thermal-monitor-initial-interval', type=int, default=5)
parser.add_argument('--thermal-monitor-update-interval', type=int, default=60)
parser.add_argument('--thermal-monitor-update-elapsed-threshold', type=int, default=30)
parser.add_argument('--enable_liquid_cooling', action='store_true', default=False)
parser.add_argument('--liquid_cooling_update_interval', type=float, default=0.5)

args = parser.parse_args()

thermal_control = ThermalControlDaemon(
args.thermal_monitor_initial_interval,
args.thermal_monitor_update_interval,
args.thermal_monitor_update_elapsed_threshold
args.thermal_monitor_update_elapsed_threshold,
args.enable_liquid_cooling,
args.liquid_cooling_update_interval
)

thermal_control.log_info("Starting up...")
Expand Down
22 changes: 22 additions & 0 deletions sonic-thermalctld/tests/mock_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from sonic_platform_base import psu_base
from sonic_platform_base import sfp_base
from sonic_platform_base import thermal_base
from sonic_platform_base import liquid_cooling_base
from sonic_platform_base.sonic_thermal_control import thermal_manager_base


Expand Down Expand Up @@ -188,6 +189,24 @@ def get_position_in_parent(self):
def is_replaceable(self):
return self._replaceable

class MockLiquidCoolingSensor(liquid_cooling_base.LeakageSensorBase):
def __init__(self):
super(MockLiquidCoolingSensor, self).__init__()
self._name = None
self._presence = True
self._model = 'Liquid Cooling Sensor Model'
self._serial = 'Liquid Cooling Sensor Serial'
self._status = True
self._position_in_parent = 1
self._replaceable = True

class MockLiquidCooling(liquid_cooling_base.LiquidCoolingBase):
def __init__(self):
super(MockLiquidCooling, self).__init__(1, [])
self._name = None
self._presence = True
self._model = 'Liquid Cooling Model'
self._serial = 'Liquid Cooling Serial'

class MockSfp(sfp_base.SfpBase):
def __init__(self):
Expand Down Expand Up @@ -407,6 +426,9 @@ def make_module_thermal(self):
module._fan_list.append(fan)
module._thermal_list.append(MockThermal())

def get_liquid_cooling(self):
return MockLiquidCooling()

def is_modular_chassis(self):
return self._is_chassis_system

Expand Down
14 changes: 14 additions & 0 deletions sonic-thermalctld/tests/test_thermalctld.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,18 @@ def test_update_module_fans(self):
else:
fan_updater.log_warning.assert_called_with("Failed to update module fan status - Exception('Test message',)")

class TestLiquidCoolingUpdater(object):
def test_update(self):
mock_chassis = MockChassis()
liquid_cooling_updater = thermalctld.LiquidCoolingUpdater(mock_chassis, 0.5)

liquid_cooling_updater._refresh_leak_status = mock.MagicMock()

liquid_cooling_updater.update()

assert liquid_cooling_updater._refresh_leak_status.call_count == 1


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a few more tests to cover the failure and exception cases

class TestThermalMonitor(object):
"""
Test cases to cover functionality in ThermalMonitor class
Expand Down Expand Up @@ -794,6 +806,8 @@ def test_update_entity_info():
def test_main(mock_run):
mock_run.return_value = False

sys.argv = ['thermalctld']

ret = thermalctld.main()
assert mock_run.call_count == 1
assert ret != 0
Expand Down
Loading