Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/system-health/health_checker/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def __init__(self):
self.ignore_services = None
self.ignore_devices = None
self.user_defined_checkers = None

self.include_devices = None
def config_file_exists(self):
    """
    Tell whether the configuration file is present on the file system.
    :return: True if the path held in self._config_file exists, False otherwise
    """
    config_path = self._config_file
    return os.path.exists(config_path)

Expand Down Expand Up @@ -72,6 +72,7 @@ def load_config(self):
self.ignore_services = self._get_list_data('services_to_ignore')
self.ignore_devices = self._get_list_data('devices_to_ignore')
self.user_defined_checkers = self._get_list_data('user_defined_checkers')
self.include_devices = self._get_list_data('include_devices')
except Exception as e:
self._reset()

Expand All @@ -86,6 +87,7 @@ def _reset(self):
self.ignore_services = None
self.ignore_devices = None
self.user_defined_checkers = None
self.include_devices = None

def get_led_color(self, status):
"""
Expand Down
62 changes: 62 additions & 0 deletions src/system-health/health_checker/hardware_checker.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from natsort import natsorted
from swsscommon import swsscommon
from swsscommon.swsscommon import SonicV2Connector

from .health_checker import HealthChecker

# Source name handed to swsscommon.events_init_publisher() when a publisher is created.
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Tag handed to swsscommon.event_publish() for the events sent by HardwareChecker.publish_events().
EVENTS_PUBLISHER_TAG = "liquid-cooling-leak"

class HardwareChecker(HealthChecker):
"""
Expand All @@ -12,12 +15,15 @@ class HardwareChecker(HealthChecker):
ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
FAN_TABLE_NAME = 'FAN_INFO'
PSU_TABLE_NAME = 'PSU_INFO'
LIQUID_COOLING_TABLE_NAME = 'LIQUID_COOLING_INFO'

def __init__(self):
    """
    Set up the base checker state, connect to STATE_DB and start with an
    empty record of sensors currently known to be leaking.
    """
    HealthChecker.__init__(self)

    connector = SonicV2Connector(use_unix_socket_path=True)
    self._db = connector
    connector.connect(connector.STATE_DB)

    # Names of leakage sensors that reported a leak and have not recovered yet
    self.leaking_sensors = list()


def get_category(self):
    """
    Report the category this checker belongs to.
    :return: The category name string
    """
    category = 'Hardware'
    return category

Expand All @@ -26,6 +32,7 @@ def check(self, config):
self._check_asic_status(config)
self._check_fan_status(config)
self._check_psu_status(config)
self._check_liquid_cooling_status(config)

def _check_asic_status(self, config):
"""
Expand Down Expand Up @@ -283,3 +290,58 @@ def _ignore_check(cls, ignore_set, category, object_name, check_point):
elif '{}.{}'.format(object_name, check_point) in ignore_set:
return True
return False

def publish_events(self, sensors, event_name):
    """
    Publish one event per sensor through the swsscommon events API.
    :param sensors: List of sensor names; one event is published for each entry
    :param event_name: Key under which the sensor name is stored in the event parameters
    :return:
    """
    if not sensors:
        # Nothing to report: do not create and tear down a publisher for zero events
        return

    params = swsscommon.FieldValueMap()
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        for sensor in sensors:
            params[event_name] = sensor
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Always release the publisher handle, even if a publish call raises
        swsscommon.events_deinit_publisher(events_handle)


def _check_liquid_cooling_status(self, config):
    """
    Check liquid cooling status including:
        1. Check all leakage sensors are in good state
    The check only runs when 'liquid_cooling' is listed in config.include_devices.
    An event is published the first time a sensor is seen leaking, and again when a
    sensor recorded in self.leaking_sensors reports 'no'.
    :param config: Health checker configuration
    :return:
    """
    if not config.include_devices or 'liquid_cooling' not in config.include_devices:
        return

    keys = self._db.keys(self._db.STATE_DB, HardwareChecker.LIQUID_COOLING_TABLE_NAME + '*')
    if not keys:
        self.set_object_not_ok('Liquid Cooling', 'Liquid Cooling', 'Failed to get liquid cooling information')
        return

    new_leaking_sensors = []
    for key in natsorted(keys):
        key_list = key.split('|')
        if len(key_list) != 2:  # error data in DB, log it and ignore
            self.set_object_not_ok('Liquid Cooling', key, 'Invalid key for LIQUID_COOLING_INFO: {}'.format(key))
            continue

        name = key_list[1]
        if config.ignore_devices and name in config.ignore_devices:
            continue

        data_dict = self._db.get_all(self._db.STATE_DB, key)
        leak_status = data_dict.get('leak_status', None)
        if leak_status is None or leak_status == 'N/A':
            self.set_object_not_ok('Liquid Cooling', name, 'Failed to get leakage sensor status for {}'.format(name))
            continue

        normalized_status = leak_status.lower()
        if normalized_status == 'yes':
            # Only the first detection of a leak produces an event ...
            if name not in self.leaking_sensors:
                self.leaking_sensors.append(name)
                new_leaking_sensors.append(name)
            # ... but the sensor is marked not OK on every check while it keeps leaking
            self.set_object_not_ok('Liquid Cooling', name, 'Leakage sensor {} is leaking'.format(name))
        elif normalized_status == 'no':
            self.set_object_ok('Liquid Cooling', name)
            if name in self.leaking_sensors:
                self.leaking_sensors.remove(name)
                self.publish_events([name], "leaking sensor report recovered")
        else:
            # Unrecognized value: report it instead of silently leaving the sensor without a status
            self.set_object_not_ok('Liquid Cooling', name,
                                   'Invalid leakage sensor status {} for {}'.format(leak_status, name))

    # Skip the publisher round trip when no sensor started leaking during this check
    if new_leaking_sensors:
        self.publish_events(new_leaking_sensors, "sensor report leaking event")
7 changes: 3 additions & 4 deletions src/system-health/health_checker/service_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,6 @@ def __init__(self):

self.load_critical_process_cache()

self.events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)

def get_expected_running_containers(self, feature_table):
"""Get a set of containers that are expected to running on SONiC

Expand Down Expand Up @@ -342,7 +340,6 @@ def check(self, config):
self.reset()
self.check_by_monit(config)
self.check_services(config)
swsscommon.events_deinit_publisher(self.events_handle)

def _parse_supervisorctl_status(self, process_status):
"""Expected input:
Expand All @@ -366,9 +363,11 @@ def _parse_supervisorctl_status(self, process_status):
def publish_events(self, container_name, critical_process_list):
    """
    Publish one event per critical process of a container through the swsscommon events API.
    A publisher handle is created for this call only and released before returning.
    :param container_name: Container name, stored under "ctr_name" in the event parameters
    :param critical_process_list: Process names; each is stored under "process_name" and published
    :return:
    """
    if not critical_process_list:
        # Nothing to report: do not create and tear down a publisher for zero events
        return

    params = swsscommon.FieldValueMap()
    params["ctr_name"] = container_name
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        for process_name in critical_process_list:
            params["process_name"] = process_name
            # Publish once, through the handle owned by this call
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Always release the publisher handle, even if a publish call raises
        swsscommon.events_deinit_publisher(events_handle)

def check_process_existence(self, container_name, critical_process_list, config, feature_table):
"""Check whether the process in the specified container is running or not.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"services_to_ignore": [],
"devices_to_ignore": [],
"user_defined_checkers": [],
"include_devices": [],
"polling_interval": 60,
"led_color": {
"fault": "amber",
Expand Down
1 change: 0 additions & 1 deletion src/system-health/health_checker/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import subprocess


def run_command(command):
"""
Utility function to run a shell command and return the output.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"services_to_ignore": ["dummy_service"],
"devices_to_ignore": ["psu.voltage"],
"user_defined_checkers": [],
"include_devices": ["liquid_cooling"],
"polling_interval": 60,
"led_color": {
"fault": "orange",
Expand Down
48 changes: 48 additions & 0 deletions src/system-health/tests/test_system_health.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,9 +471,37 @@ def test_hardware_checker():
}
})

MockConnector.data.update({
'LIQUID_COOLING_INFO|liquid_cooling_1': {
'leak_status': 'Yes',
'leak_sensor_name': 'liquid_cooling_1'
},
'LIQUID_COOLING_INFO|liquid_cooling_2': {
'leak_status': 'No',
'leak_sensor_name': 'liquid_cooling_2'
},
'LIQUID_COOLING_INFO|liquid_cooling_3': {
'leak_status': 'Yes',
'leak_sensor_name': 'liquid_cooling_3'
},
'LIQUID_COOLING_INFO|liquid_cooling_4': {
'leak_status': 'No',
'leak_sensor_name': 'liquid_cooling_4'
},
'LIQUID_COOLING_INFO|liquid_cooling_5': {
'leak_status': 'Yes',
'leak_sensor_name': 'liquid_cooling_5'
},
'LIQUID_COOLING_INFO|liquid_cooling_6': {
'leak_status': 'No',
'leak_sensor_name': 'liquid_cooling_6'
}
})

checker = HardwareChecker()
assert checker.get_category() == 'Hardware'
config = Config()
config.include_devices = ['liquid_cooling']
checker.check(config)

assert 'ASIC' in checker._info
Expand Down Expand Up @@ -521,6 +549,24 @@ def test_hardware_checker():
assert checker._info['PSU 7'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
assert checker._info['PSU 7'][HealthChecker.INFO_FIELD_OBJECT_MSG] == 'System power exceeds threshold but power_critical_threshold is invalid'

assert 'liquid_cooling_1' in checker._info
assert checker._info['liquid_cooling_1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

assert 'liquid_cooling_2' in checker._info
assert checker._info['liquid_cooling_2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK

assert 'liquid_cooling_3' in checker._info
assert checker._info['liquid_cooling_3'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

assert 'liquid_cooling_4' in checker._info
assert checker._info['liquid_cooling_4'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK

assert 'liquid_cooling_5' in checker._info
assert checker._info['liquid_cooling_5'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

assert 'liquid_cooling_6' in checker._info
assert checker._info['liquid_cooling_6'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK


def test_config():
config = Config()
Expand All @@ -532,6 +578,7 @@ def test_config():
assert 'dummy_service' in config.ignore_services
assert 'psu.voltage' in config.ignore_devices
assert len(config.user_defined_checkers) == 0
assert 'liquid_cooling' in config.include_devices

assert config.get_led_color('fault') == 'orange'
assert config.get_led_color('normal') == 'green'
Expand All @@ -543,6 +590,7 @@ def test_config():
assert not config.ignore_devices
assert not config.user_defined_checkers
assert not config.config_data
assert not config.include_devices

assert config.get_led_color('fault') == 'red'
assert config.get_led_color('normal') == 'green'
Expand Down
Loading