Skip to content

Commit c711688

Browse files
committed
Add support for liquid cooling to the Mellanox platform API and system health monitor
Signed-off-by: Yuanzhe, Liu <[email protected]>
1 parent fd49ea4 commit c711688

File tree

9 files changed

+272
-6
lines changed

9 files changed

+272
-6
lines changed

dockers/docker-platform-monitor/docker-pmon.supervisord.conf.j2

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,20 @@ dependent_startup_wait_for=rsyslogd:running {% if delay_non_critical_daemon %}de
199199

200200
{% if not skip_thermalctld %}
201201
[program:thermalctld]
202-
command={% if API_VERSION == 3 and 'thermalctld' not in python2_daemons %}python3 {% else %} python2 {% endif %}/usr/local/bin/thermalctld
202+
{# Build the thermalctld command line. Liquid-cooling flags are appended only when the matching template variables are set. -#}
{% set base_command = "python3 /usr/local/bin/thermalctld" -%}
{% set options = "" -%}
{% if enable_liquid_cooling -%}
{% set options = options ~ " --enable_liquid_cooling" -%}
{% endif -%}
{% if liquid_cooling_update_interval -%}
{# A "{{ ... }}" placed inside a string literal is not rendered, so the value is appended with the ~ operator instead. -#}
{% set options = options ~ " --liquid_cooling_update_interval " ~ liquid_cooling_update_interval -%}
{% endif -%}
command={{ base_command ~ options }}
203216
priority=10
204217
autostart=false
205218
autorestart=unexpected

platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,8 @@ def __init__(self):
120120
self._RJ45_port_inited = False
121121
self._RJ45_port_list = None
122122

123+
self.liquid_cooling = None
124+
123125
Chassis.chassis_instance = self
124126

125127
self.module_host_mgmt_initializer = module_host_mgmt_initializer.ModuleHostMgmtInitializer()
@@ -1085,6 +1087,20 @@ def is_replaceable(self):
10851087
"""
10861088
return False
10871089

1090+
1091+
##############################################
1092+
# LiquidCooling methods
1093+
##############################################
1094+
1095+
def initialize_liquid_cooling(self):
    """Create the LiquidCooling object on first use.

    Does nothing when ``self.liquid_cooling`` already holds a truthy value.
    """
    if self.liquid_cooling:
        return

    # Imported here rather than at module level, as in the original code.
    from .liquid_cooling import LiquidCooling
    self.liquid_cooling = LiquidCooling()
1099+
1100+
def get_liquid_cooling(self):
    """Return the liquid cooling object, initializing it first if needed.

    :return: the value of ``self.liquid_cooling`` after initialization
    """
    self.initialize_liquid_cooling()
    cooling = self.liquid_cooling
    return cooling
1103+
10881104

10891105
class ModularChassis(Chassis):
10901106
def __init__(self):

platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,11 @@ def get_fan_count(cls):
247247
def is_fan_hotswapable(cls):
248248
return utils.read_int_from_file('/run/hw-management/config/hotplug_fans') > 0
249249

250+
@classmethod
@utils.read_only_cache()
def get_leakage_sensor_count(cls):
    """Return how many paths match ``leakage*`` in the hw-management system directory."""
    sensor_paths = glob.glob('/var/run/hw-management/system/leakage*')
    return len(sensor_paths)
254+
250255
@classmethod
251256
@utils.read_only_cache()
252257
def get_psu_count(cls):
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#
2+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
3+
# Copyright (c) 2019-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
# SPDX-License-Identifier: Apache-2.0
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
#
18+
#############################################################################
19+
# Mellanox
20+
#
21+
# Module contains an implementation of SONiC Platform Base API and
22+
# provides the liquid cooling status that is available on the platform
23+
#
24+
#############################################################################
25+
26+
try:
27+
from sonic_platform_base.liquid_cooling_base import LeakageSensorBase,LiquidCoolingBase
28+
from sonic_py_common.logger import Logger
29+
from . import utils
30+
import os
31+
import glob
32+
import logging
33+
except ImportError as e:
34+
raise ImportError(str(e) + "- required module not found")
35+
36+
logging.basicConfig(level=logging.DEBUG)
37+
logger = logging.getLogger(__name__)
38+
39+
LIQUID_COOLING_SENSOR_PATH = "/var/run/hw-management/system/"
40+
41+
class LeakageSensor(LeakageSensorBase):
    """Leakage sensor whose state is read from a single status file.

    The file is interpreted as: 1 -> no leak, 0 -> leak. Any other value is
    treated as a failed read.
    """

    def __init__(self, name, path):
        """
        :param name: sensor name, forwarded to LeakageSensorBase
        :param path: path of the file holding the sensor reading
        """
        super(LeakageSensor, self).__init__(name)
        self.path = path

    def is_leak(self):
        """Read the sensor file and report whether a leak is present.

        ``self.leaking`` is updated on every call so that it always matches
        the returned value.

        :return: False when the file holds 1; True when it holds 0. Any other
                 value is logged as an error and also reported as True.
        """
        content = utils.read_int_from_file(self.path)

        if content == 1:
            self.leaking = False
            return False

        if content != 0:
            logger.error(f"Failed to read leakage sensor {self.name} value: {content}")

        # A failed read is reported as a leak; record it as well so the
        # cached state never disagrees with what callers were told.
        self.leaking = True
        return True
58+
59+
class LiquidCooling(LiquidCoolingBase):
    """Platform-specific Liquid Cooling class"""

    def __init__(self):
        """Discover ``leakage<N>`` files and build one LeakageSensor per file.

        Sensors are ordered by their numeric index N. Entries that match
        ``leakage*`` but do not end in an integer are skipped with a warning
        instead of aborting initialization.
        """
        prefix = "leakage"
        indexed_sensors = []

        for sensor_path in glob.glob(os.path.join(LIQUID_COOLING_SENSOR_PATH, prefix + "*")):
            sensor_name = os.path.basename(sensor_path)
            try:
                index = int(sensor_name[len(prefix):])
            except ValueError:
                logger.warning(f"Ignoring unexpected leakage sensor file {sensor_path}")
                continue
            indexed_sensors.append((index, sensor_name, sensor_path))

        # Numeric ordering, so that leakage10 sorts after leakage2.
        indexed_sensors.sort()

        self.leakage_sensors = [
            LeakageSensor(sensor_name, sensor_path)
            for _, sensor_name, sensor_path in indexed_sensors
        ]
        self.leakage_sensors_num = len(self.leakage_sensors)

        super(LiquidCooling, self).__init__(self.leakage_sensors_num, self.leakage_sensors)
77+
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import os
2+
import sys
3+
from unittest import mock
4+
from sonic_platform.liquid_cooling import LiquidCooling, LeakageSensor
5+
6+
def test_leakage_sensor_init():
    """A new LeakageSensor keeps the name and path it was constructed with."""
    expected_name = "leakage1"
    expected_path = "/test/path"

    sensor = LeakageSensor(expected_name, expected_path)

    assert sensor.name == expected_name
    assert sensor.path == expected_path
10+
11+
def test_leakage_sensor_is_leak():
    """is_leak() maps the raw sensor reading to a boolean.

    The file reader is patched directly so the test does not depend on how
    the helper opens files or which default it returns on failure.
    """
    sensor = LeakageSensor("leakage1", "/test/path")
    read_int = 'sonic_platform.utils.read_int_from_file'

    # Reading 1 means no leak
    with mock.patch(read_int, return_value=1):
        assert sensor.is_leak() is False

    # Reading 0 means a leak is detected
    with mock.patch(read_int, return_value=0):
        assert sensor.is_leak() is True

    # Any other reading is a failed read, which is_leak() reports as a leak;
    # it never returns None.
    with mock.patch(read_int, return_value=-1):
        assert sensor.is_leak() is True
30+
31+
def test_liquid_cooling_init():
    """LiquidCooling creates one sensor per leakage file reported by glob."""
    # Simulated glob result: 4 leakage sensors
    fake_sensor_files = [
        "/var/run/hw-management/system/leakage1",
        "/var/run/hw-management/system/leakage2",
        "/var/run/hw-management/system/leakage3",
        "/var/run/hw-management/system/leakage4"
    ]

    def join_with_slash(*parts):
        return "/".join(parts)

    with mock.patch('os.path.exists') as exists_patch:
        with mock.patch('os.path.join', side_effect=join_with_slash):
            with mock.patch('glob.glob', return_value=fake_sensor_files):
                exists_patch.side_effect = [True, True, True, True]

                cooling = LiquidCooling()

                # Verify the number of sensors initialized
                assert cooling.leakage_sensors_num == 4

                # NOTE(review): the expected empty result depends on what the
                # file reader returns for these paths - confirm.
                reported = cooling.get_leak_sensor_status()
                assert len(reported) == 0

src/system-health/health_checker/hardware_checker.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
from natsort import natsorted
2+
from swsscommon import swsscommon
23
from swsscommon.swsscommon import SonicV2Connector
34

45
from .health_checker import HealthChecker
56

7+
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
8+
EVENTS_PUBLISHER_TAG = "liquid-cooling-leak"
69

710
class HardwareChecker(HealthChecker):
811
"""
@@ -12,6 +15,7 @@ class HardwareChecker(HealthChecker):
1215
ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
1316
FAN_TABLE_NAME = 'FAN_INFO'
1417
PSU_TABLE_NAME = 'PSU_INFO'
18+
LIQUID_COOLING_TABLE_NAME = 'LIQUID_COOLING_INFO'
1519

1620
def __init__(self):
1721
HealthChecker.__init__(self)
@@ -26,6 +30,7 @@ def check(self, config):
2630
self._check_asic_status(config)
2731
self._check_fan_status(config)
2832
self._check_psu_status(config)
33+
self._check_liquid_cooling_status(config)
2934

3035
def _check_asic_status(self, config):
3136
"""
@@ -283,3 +288,58 @@ def _ignore_check(cls, ignore_set, category, object_name, check_point):
283288
elif '{}.{}'.format(object_name, check_point) in ignore_set:
284289
return True
285290
return False
291+
292+
def publish_events(self, leak_sensors):
    """Publish one liquid-cooling leak event per leaking sensor.

    :param leak_sensors: list of leaking sensor names; nothing is published
                         (and no publisher is created) when it is empty
    :return:
    """
    # Nothing to report: skip creating and tearing down a publisher.
    if not leak_sensors:
        return

    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        for sensor in leak_sensors:
            params["leak_sensor"] = sensor
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Release the publisher even if publishing raised.
        swsscommon.events_deinit_publisher(events_handle)
299+
300+
301+
def _check_liquid_cooling_status(self, config):
302+
"""
303+
Check liquid cooling status including:
304+
1. Check all leakage sensors are in good state
305+
:param config: Health checker configuration
306+
:return:
307+
"""
308+
if config.user_defined_checkers and 'liquid_cooling' not in config.user_defined_checkers:
309+
return
310+
311+
keys = self._db.keys(self._db.STATE_DB, HardwareChecker.LIQUID_COOLING_TABLE_NAME + '*')
312+
if not keys:
313+
self.set_object_not_ok('Liquid Cooling', 'Liquid Cooling', 'Failed to get liquid cooling information')
314+
return
315+
316+
leaking_sensors = []
317+
318+
for key in natsorted(keys):
319+
key_list = key.split('|')
320+
if len(key_list) != 2: # error data in DB, log it and ignore
321+
self.set_object_not_ok('Liquid Cooling', key, 'Invalid key for LIQUID_COOLING_INFO: {}'.format(key))
322+
continue
323+
324+
name = key_list[1]
325+
if config.ignore_devices and name in config.ignore_devices:
326+
continue
327+
328+
data_dict = self._db.get_all(self._db.STATE_DB, key)
329+
leak_status = data_dict.get('leak_status', None)
330+
if leak_status is None:
331+
self.set_object_not_ok('Liquid Cooling', name, 'Failed to get leakage sensor status for {}'.format(name))
332+
continue
333+
334+
if leak_status.lower() == 'true':
335+
if name not in leaking_sensors:
336+
leaking_sensors.append(name)
337+
self.set_object_not_ok('Liquid Cooling', name, 'Leakage sensor {} is leaking'.format(name))
338+
continue
339+
340+
if leak_status.lower() == 'false' and name in leaking_sensors:
341+
leaking_sensors.remove(name)
342+
343+
self.set_object_ok('Liquid Cooling', name)
344+
345+
self.publish_events(leaking_sensors)

src/system-health/health_checker/service_checker.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,6 @@ def __init__(self):
7070

7171
self.load_critical_process_cache()
7272

73-
self.events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
74-
7573
def get_expected_running_containers(self, feature_table):
7674
"""Get a set of containers that are expected to running on SONiC
7775
@@ -342,7 +340,6 @@ def check(self, config):
342340
self.reset()
343341
self.check_by_monit(config)
344342
self.check_services(config)
345-
swsscommon.events_deinit_publisher(self.events_handle)
346343

347344
def _parse_supervisorctl_status(self, process_status):
348345
"""Expected input:
@@ -366,9 +363,11 @@ def _parse_supervisorctl_status(self, process_status):
366363
def publish_events(self, container_name, critical_process_list):
    """Publish one event per critical process of the given container.

    :param container_name: container name, sent as "ctr_name"
    :param critical_process_list: process names, each sent as "process_name";
                                  nothing is published when it is empty
    :return:
    """
    # Nothing to report: skip creating and tearing down a publisher.
    if not critical_process_list:
        return

    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        params["ctr_name"] = container_name
        for process_name in critical_process_list:
            params["process_name"] = process_name
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Release the publisher even if publishing raised.
        swsscommon.events_deinit_publisher(events_handle)
372371

373372
def check_process_existence(self, container_name, critical_process_list, config, feature_table):
374373
"""Check whether the process in the specified container is running or not.

src/system-health/health_checker/utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import subprocess
22

3-
43
def run_command(command):
54
"""
65
Utility function to run an shell command and return the output.

src/system-health/tests/test_system_health.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,33 @@ def test_hardware_checker():
471471
}
472472
})
473473

474+
MockConnector.data.update({
475+
'LIQUID_COOLING_INFO|liquid_cooling_1': {
476+
'leak_status': 'True',
477+
'leak_sensor_name': 'liquid_cooling_1'
478+
},
479+
'LIQUID_COOLING_INFO|liquid_cooling_2': {
480+
'leak_status': 'False',
481+
'leak_sensor_name': 'liquid_cooling_2'
482+
},
483+
'LIQUID_COOLING_INFO|liquid_cooling_3': {
484+
'leak_status': 'True',
485+
'leak_sensor_name': 'liquid_cooling_3'
486+
},
487+
'LIQUID_COOLING_INFO|liquid_cooling_4': {
488+
'leak_status': 'False',
489+
'leak_sensor_name': 'liquid_cooling_4'
490+
},
491+
'LIQUID_COOLING_INFO|liquid_cooling_5': {
492+
'leak_status': 'True',
493+
'leak_sensor_name': 'liquid_cooling_5'
494+
},
495+
'LIQUID_COOLING_INFO|liquid_cooling_6': {
496+
'leak_status': 'False',
497+
'leak_sensor_name': 'liquid_cooling_6'
498+
}
499+
})
500+
474501
checker = HardwareChecker()
475502
assert checker.get_category() == 'Hardware'
476503
config = Config()
@@ -521,6 +548,24 @@ def test_hardware_checker():
521548
assert checker._info['PSU 7'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
522549
assert checker._info['PSU 7'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'System power exceeds threshold but power_critical_threshold is invalid'
523550

551+
assert 'liquid_cooling_1' in checker._info
552+
assert checker._info['liquid_cooling_1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
553+
554+
assert 'liquid_cooling_2' in checker._info
555+
assert checker._info['liquid_cooling_2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
556+
557+
assert 'liquid_cooling_3' in checker._info
558+
assert checker._info['liquid_cooling_3'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
559+
560+
assert 'liquid_cooling_4' in checker._info
561+
assert checker._info['liquid_cooling_4'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
562+
563+
assert 'liquid_cooling_5' in checker._info
564+
assert checker._info['liquid_cooling_5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
565+
566+
assert 'liquid_cooling_6' in checker._info
567+
assert checker._info['liquid_cooling_6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
568+
524569

525570
def test_config():
526571
config = Config()

0 commit comments

Comments
 (0)