Skip to content

Commit 9ca9219

Browse files
Yuan Liuyuazhe
authored and committed
Add support for liquid cooling inside hardware checker
Signed-off-by: Yuanzhe, Liu <[email protected]>
1 parent fd49ea4 commit 9ca9219

File tree

7 files changed

+118
-6
lines changed

7 files changed

+118
-6
lines changed

src/system-health/health_checker/config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def __init__(self):
4444
self.ignore_services = None
4545
self.ignore_devices = None
4646
self.user_defined_checkers = None
47-
47+
self.include_devices = None
4848
def config_file_exists(self):
    """
    Tell whether the configuration file path is present on the filesystem.
    :return: True if the path stored in self._config_file exists, else False
    """
    config_path = self._config_file
    return os.path.exists(config_path)
5050

@@ -72,6 +72,7 @@ def load_config(self):
7272
self.ignore_services = self._get_list_data('services_to_ignore')
7373
self.ignore_devices = self._get_list_data('devices_to_ignore')
7474
self.user_defined_checkers = self._get_list_data('user_defined_checkers')
75+
self.include_devices = self._get_list_data('include_devices')
7576
except Exception as e:
7677
self._reset()
7778

@@ -86,6 +87,7 @@ def _reset(self):
8687
self.ignore_services = None
8788
self.ignore_devices = None
8889
self.user_defined_checkers = None
90+
self.include_devices = None
8991

9092
def get_led_color(self, status):
9193
"""

src/system-health/health_checker/hardware_checker.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
from natsort import natsorted
2+
from swsscommon import swsscommon
23
from swsscommon.swsscommon import SonicV2Connector
34

45
from .health_checker import HealthChecker
56

7+
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
8+
EVENTS_PUBLISHER_TAG = "liquid-cooling-leak"
69

710
class HardwareChecker(HealthChecker):
811
"""
@@ -12,12 +15,15 @@ class HardwareChecker(HealthChecker):
1215
ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
1316
FAN_TABLE_NAME = 'FAN_INFO'
1417
PSU_TABLE_NAME = 'PSU_INFO'
18+
LIQUID_COOLING_TABLE_NAME = 'LIQUID_COOLING_INFO'
1519

1620
def __init__(self):
1721
HealthChecker.__init__(self)
1822
self._db = SonicV2Connector(use_unix_socket_path=True)
1923
self._db.connect(self._db.STATE_DB)
2024

25+
self.leaking_sensors = []
26+
2127
def get_category(self):
2228
return 'Hardware'
2329

@@ -26,6 +32,7 @@ def check(self, config):
2632
self._check_asic_status(config)
2733
self._check_fan_status(config)
2834
self._check_psu_status(config)
35+
self._check_liquid_cooling_status(config)
2936

3037
def _check_asic_status(self, config):
3138
"""
@@ -283,3 +290,58 @@ def _ignore_check(cls, ignore_set, category, object_name, check_point):
283290
elif '{}.{}'.format(object_name, check_point) in ignore_set:
284291
return True
285292
return False
293+
294+
def publish_events(self, sensors, event_name):
    """
    Publish one event per sensor under EVENTS_PUBLISHER_TAG.
    Each event carries a single field whose key is ``event_name`` and whose
    value is the sensor name.
    :param sensors: Iterable of sensor names to report; nothing is published when empty
    :param event_name: Field name used as the key in the event parameters
    :return:
    """
    if not sensors:
        # Nothing to report: do not create and tear down a publisher for no events
        return

    params = swsscommon.FieldValueMap()
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        for sensor in sensors:
            # The same map is reused; only the value under event_name changes per sensor
            params[event_name] = sensor
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Always release the publisher handle, even if publishing raised
        swsscommon.events_deinit_publisher(events_handle)
301+
302+
303+
def _check_liquid_cooling_status(self, config):
    """
    Check liquid cooling status including:
        1. Check all leakage sensors are in good state

    The check only runs when 'liquid_cooling' is listed in config.include_devices.
    A leaking sensor is marked not OK on every pass while it keeps reporting a leak;
    the leak event is published only the first time the leak is seen, and a recovery
    event is published when a previously leaking sensor reports 'no' again.
    :param config: Health checker configuration
    :return:
    """
    if not config.include_devices or 'liquid_cooling' not in config.include_devices:
        return

    keys = self._db.keys(self._db.STATE_DB, HardwareChecker.LIQUID_COOLING_TABLE_NAME + '*')
    if not keys:
        self.set_object_not_ok('Liquid Cooling', 'Liquid Cooling', 'Failed to get liquid cooling information')
        return

    new_leaking_sensors = []
    for key in natsorted(keys):
        key_list = key.split('|')
        if len(key_list) != 2:  # error data in DB, log it and ignore
            self.set_object_not_ok('Liquid Cooling', key, 'Invalid key for LIQUID_COOLING_INFO: {}'.format(key))
            continue

        name = key_list[1]
        if config.ignore_devices and name in config.ignore_devices:
            continue

        data_dict = self._db.get_all(self._db.STATE_DB, key)
        leak_status = data_dict.get('leak_status', None)
        if leak_status is None or leak_status == 'N/A':
            self.set_object_not_ok('Liquid Cooling', name, 'Failed to get leakage sensor status for {}'.format(name))
            continue

        leak_status = leak_status.lower()
        if leak_status == 'yes':
            # Only a newly detected leak generates an event, but the not-OK status
            # must be recorded on every pass; otherwise an ongoing leak stops being
            # reported after the first detection.
            if name not in self.leaking_sensors:
                self.leaking_sensors.append(name)
                new_leaking_sensors.append(name)
            self.set_object_not_ok('Liquid Cooling', name, 'Leakage sensor {} is leaking'.format(name))
            continue

        if leak_status == 'no':
            self.set_object_ok('Liquid Cooling', name)
            if name in self.leaking_sensors:
                self.leaking_sensors.remove(name)
                self.publish_events([name], "leaking sensor report recovered")
        # NOTE(review): any other leak_status value records no status for the sensor;
        # confirm whether unexpected values should be flagged as not OK.

    if new_leaking_sensors:
        # Skip the publisher round-trip when no new leak was detected in this pass
        self.publish_events(new_leaking_sensors, "sensor report leaking event")

src/system-health/health_checker/service_checker.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,6 @@ def __init__(self):
7070

7171
self.load_critical_process_cache()
7272

73-
self.events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
74-
7573
def get_expected_running_containers(self, feature_table):
7674
"""Get a set of containers that are expected to running on SONiC
7775
@@ -342,7 +340,6 @@ def check(self, config):
342340
self.reset()
343341
self.check_by_monit(config)
344342
self.check_services(config)
345-
swsscommon.events_deinit_publisher(self.events_handle)
346343

347344
def _parse_supervisorctl_status(self, process_status):
348345
"""Expected input:
@@ -366,9 +363,11 @@ def _parse_supervisorctl_status(self, process_status):
366363
def publish_events(self, container_name, critical_process_list):
    """
    Publish one event per process in ``critical_process_list`` under EVENTS_PUBLISHER_TAG.
    Each event carries the container name under "ctr_name" and the process name
    under "process_name".
    :param container_name: Name of the container the processes belong to
    :param critical_process_list: Process names to report; nothing is published when empty
    :return:
    """
    if not critical_process_list:
        # Nothing to report: do not create and tear down a publisher for no events
        return

    params = swsscommon.FieldValueMap()
    params["ctr_name"] = container_name
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        for process_name in critical_process_list:
            params["process_name"] = process_name
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Always release the publisher handle, even if publishing raised
        swsscommon.events_deinit_publisher(events_handle)
372371

373372
def check_process_existence(self, container_name, critical_process_list, config, feature_table):
374373
"""Check whether the process in the specified container is running or not.

src/system-health/health_checker/system_health_monitoring_config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"services_to_ignore": [],
33
"devices_to_ignore": [],
44
"user_defined_checkers": [],
5+
"include_devices": [],
56
"polling_interval": 60,
67
"led_color": {
78
"fault": "amber",

src/system-health/health_checker/utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import subprocess
22

3-
43
def run_command(command):
54
"""
65
Utility function to run an shell command and return the output.

src/system-health/tests/system_health_monitoring_config.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
"services_to_ignore": ["dummy_service"],
33
"devices_to_ignore": ["psu.voltage"],
44
"user_defined_checkers": [],
5+
"include_devices": ["liquid_cooling"],
56
"polling_interval": 60,
67
"led_color": {
78
"fault": "orange",

src/system-health/tests/test_system_health.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,9 +471,37 @@ def test_hardware_checker():
471471
}
472472
})
473473

474+
MockConnector.data.update({
475+
'LIQUID_COOLING_INFO|liquid_cooling_1': {
476+
'leak_status': 'Yes',
477+
'leak_sensor_name': 'liquid_cooling_1'
478+
},
479+
'LIQUID_COOLING_INFO|liquid_cooling_2': {
480+
'leak_status': 'No',
481+
'leak_sensor_name': 'liquid_cooling_2'
482+
},
483+
'LIQUID_COOLING_INFO|liquid_cooling_3': {
484+
'leak_status': 'Yes',
485+
'leak_sensor_name': 'liquid_cooling_3'
486+
},
487+
'LIQUID_COOLING_INFO|liquid_cooling_4': {
488+
'leak_status': 'No',
489+
'leak_sensor_name': 'liquid_cooling_4'
490+
},
491+
'LIQUID_COOLING_INFO|liquid_cooling_5': {
492+
'leak_status': 'Yes',
493+
'leak_sensor_name': 'liquid_cooling_5'
494+
},
495+
'LIQUID_COOLING_INFO|liquid_cooling_6': {
496+
'leak_status': 'No',
497+
'leak_sensor_name': 'liquid_cooling_6'
498+
}
499+
})
500+
474501
checker = HardwareChecker()
475502
assert checker.get_category() == 'Hardware'
476503
config = Config()
504+
config.include_devices = ['liquid_cooling']
477505
checker.check(config)
478506

479507
assert 'ASIC' in checker._info
@@ -521,6 +549,24 @@ def test_hardware_checker():
521549
assert checker._info['PSU 7'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
522550
assert checker._info['PSU 7'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'System power exceeds threshold but power_critical_threshold is invalid'
523551

552+
assert 'liquid_cooling_1' in checker._info
553+
assert checker._info['liquid_cooling_1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
554+
555+
assert 'liquid_cooling_2' in checker._info
556+
assert checker._info['liquid_cooling_2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
557+
558+
assert 'liquid_cooling_3' in checker._info
559+
assert checker._info['liquid_cooling_3'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
560+
561+
assert 'liquid_cooling_4' in checker._info
562+
assert checker._info['liquid_cooling_4'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
563+
564+
assert 'liquid_cooling_5' in checker._info
565+
assert checker._info['liquid_cooling_5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
566+
567+
assert 'liquid_cooling_6' in checker._info
568+
assert checker._info['liquid_cooling_6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
569+
524570

525571
def test_config():
526572
config = Config()
@@ -532,6 +578,7 @@ def test_config():
532578
assert 'dummy_service' in config.ignore_services
533579
assert 'psu.voltage' in config.ignore_devices
534580
assert len(config.user_defined_checkers) == 0
581+
assert 'liquid_cooling' in config.include_devices
535582

536583
assert config.get_led_color('fault') == 'orange'
537584
assert config.get_led_color('normal') == 'green'
@@ -543,6 +590,7 @@ def test_config():
543590
assert not config.ignore_devices
544591
assert not config.user_defined_checkers
545592
assert not config.config_data
593+
assert not config.include_devices
546594

547595
assert config.get_led_color('fault') == 'red'
548596
assert config.get_led_color('normal') == 'green'

0 commit comments

Comments
 (0)