11from natsort import natsorted
2+ from swsscommon import swsscommon
23from swsscommon .swsscommon import SonicV2Connector
34
45from .health_checker import HealthChecker
56
# Source name handed to swsscommon.events_init_publisher() when a publisher
# handle is created in HardwareChecker.publish_events().
EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
# Tag handed to swsscommon.event_publish() for every event sent by
# HardwareChecker.publish_events() (both leak and recovery reports).
EVENTS_PUBLISHER_TAG = "liquid-cooling-leak"
69
710class HardwareChecker (HealthChecker ):
811 """
@@ -12,12 +15,15 @@ class HardwareChecker(HealthChecker):
# NOTE(review): presumably the STATE_DB key read by the ASIC temperature
# check; its use is outside this excerpt -- confirm.
ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
# NOTE(review): presumably STATE_DB table prefixes for the fan and PSU
# checks; their use is outside this excerpt -- confirm.
FAN_TABLE_NAME = 'FAN_INFO'
PSU_TABLE_NAME = 'PSU_INFO'
# Key prefix of the leak sensor entries; _check_liquid_cooling_status()
# appends '*' to it to list the matching STATE_DB keys.
LIQUID_COOLING_TABLE_NAME = 'LIQUID_COOLING_INFO'
1519
def __init__(self):
    """
    Initialise the base checker, the leak bookkeeping and the STATE_DB connection.
    """
    HealthChecker.__init__(self)

    # Names of the sensors already reported as leaking; consulted by the
    # liquid cooling check so each leak / recovery is published only once.
    self.leaking_sensors = []

    # Connect to STATE_DB through the unix socket and keep the connector
    # for the individual hardware checks.
    state_db = SonicV2Connector(use_unix_socket_path=True)
    state_db.connect(state_db.STATE_DB)
    self._db = state_db
26+
def get_category(self):
    """
    Return the category label of this checker.

    :return: The fixed string 'Hardware'
    """
    category = 'Hardware'
    return category
2329
@@ -26,6 +32,7 @@ def check(self, config):
2632 self ._check_asic_status (config )
2733 self ._check_fan_status (config )
2834 self ._check_psu_status (config )
35+ self ._check_liquid_cooling_status (config )
2936
3037 def _check_asic_status (self , config ):
3138 """
@@ -283,3 +290,58 @@ def _ignore_check(cls, ignore_set, category, object_name, check_point):
283290 elif '{}.{}' .format (object_name , check_point ) in ignore_set :
284291 return True
285292 return False
293+
def publish_events(self, sensors, event_name):
    """
    Publish one event per sensor through the swsscommon event publisher.

    Every event is sent with the EVENTS_PUBLISHER_TAG tag and carries a
    single parameter: ``event_name`` mapped to the sensor name.

    :param sensors: Collection of sensor names to report; when it is empty
                    nothing is published and no publisher is created
    :param event_name: Parameter key under which the sensor name is stored
    :return:
    """
    if not sensors:
        # Callers invoke this on every check cycle, usually with nothing to
        # report; skip the publisher setup / teardown in that case.
        return

    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        # The same map is reused; only the value changes between events.
        params = swsscommon.FieldValueMap()
        for sensor in sensors:
            params[event_name] = sensor
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Always release the publisher handle, even if publishing failed.
        swsscommon.events_deinit_publisher(events_handle)
301+
302+
303+ def _check_liquid_cooling_status (self , config ):
304+ """
305+ Check liquid cooling status including:
306+ 1. Check all leakage sensors are in good state
307+ :param config: Health checker configuration
308+ :return:
309+ """
310+ if not config .include_devices or 'liquid_cooling' not in config .include_devices :
311+ return
312+
313+ keys = self ._db .keys (self ._db .STATE_DB , HardwareChecker .LIQUID_COOLING_TABLE_NAME + '*' )
314+ if not keys :
315+ self .set_object_not_ok ('Liquid Cooling' , 'Liquid Cooling' , 'Failed to get liquid cooling information' )
316+ return
317+
318+ new_leaking_sensors = []
319+ for key in natsorted (keys ):
320+ key_list = key .split ('|' )
321+ if len (key_list ) != 2 : # error data in DB, log it and ignore
322+ self .set_object_not_ok ('Liquid Cooling' , key , 'Invalid key for LIQUID_COOLING_INFO: {}' .format (key ))
323+ continue
324+
325+ name = key_list [1 ]
326+ if config .ignore_devices and name in config .ignore_devices :
327+ continue
328+
329+ data_dict = self ._db .get_all (self ._db .STATE_DB , key )
330+ leak_status = data_dict .get ('leak_status' , None )
331+ if leak_status is None or leak_status == 'N/A' :
332+ self .set_object_not_ok ('Liquid Cooling' , name , 'Failed to get leakage sensor status for {}' .format (name ))
333+ continue
334+
335+ if leak_status .lower () == 'yes' and name not in self .leaking_sensors :
336+ self .leaking_sensors .append (name )
337+ new_leaking_sensors .append (name )
338+ self .set_object_not_ok ('Liquid Cooling' , name , 'Leakage sensor {} is leaking' .format (name ))
339+ continue
340+
341+ if leak_status .lower () == 'no' :
342+ self .set_object_ok ('Liquid Cooling' , name )
343+ if name in self .leaking_sensors :
344+ self .leaking_sensors .remove (name )
345+ self .publish_events ([name ], "leaking sensor report recovered" )
346+
347+ self .publish_events (new_leaking_sensors , "sensor report leaking event" )
0 commit comments