Skip to content

Commit 875f690

Browse files
committed
xcvrd: Add new thread to poll temperatures aggressively
For high power modules it is important to query the temperature often as they can heat up quickly and the cooling algorithm needs to react quickly to avoid damaging the transceiver modules. Introducing a new DomThermalUpdateInfoTask thread which is solely polling the temperature information out of the xcvrs and publishing these in a new TRANSCEIVER_DOM_TEMPERATURE table in STATE_DB. This is mostly additional code being added. To maximize code reuse, the existing DomUpdateInfoTask has been refactored so that it now share a base with the newly introduced DomThermalInfoUpdateTask. Benchmarking shows that polling the temperature out of a xcvrand publishing it inr the databasse takes around 0.0016 for 800G optics. Considering a worst case of 0.002 and considering a device with 64 ports, this amounts to around 0.128s for every loop. With this loop running every 5s this means this represent a ~2.5% of busyness time.
1 parent 999f165 commit 875f690

File tree

6 files changed

+225
-43
lines changed

6 files changed

+225
-43
lines changed

sonic-xcvrd/tests/test_xcvrd.py

Lines changed: 67 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -394,33 +394,38 @@ def test_SfpStateUpdateTask_task_run_with_exception(self):
394394

395395
@patch('xcvrd.xcvrd.SfpStateUpdateTask.is_alive', MagicMock(return_value = False))
396396
@patch('xcvrd.xcvrd.DomInfoUpdateTask.is_alive', MagicMock(return_value = False))
397+
@patch('xcvrd.xcvrd.DomThermalInfoUpdateTask.is_alive', MagicMock(return_value = False))
397398
@patch('xcvrd.xcvrd.CmisManagerTask.is_alive', MagicMock(return_value = False))
398399
@patch('xcvrd.xcvrd.SffManagerTask.is_alive', MagicMock(return_value=False))
399400
@patch('xcvrd.xcvrd.CmisManagerTask.join', MagicMock(side_effect=NotImplementedError))
400401
@patch('xcvrd.xcvrd.CmisManagerTask.start', MagicMock())
401402
@patch('xcvrd.xcvrd.SffManagerTask.start', MagicMock())
402403
@patch('xcvrd.xcvrd.DomInfoUpdateTask.start', MagicMock())
404+
@patch('xcvrd.xcvrd.DomThermalInfoUpdateTask.start', MagicMock())
403405
@patch('xcvrd.xcvrd.SfpStateUpdateTask.start', MagicMock())
404406
@patch('xcvrd.xcvrd.DaemonXcvrd.deinit', MagicMock())
405407
@patch('os.kill')
406408
@patch('xcvrd.xcvrd.DaemonXcvrd.init')
407409
@patch('xcvrd.xcvrd.DomInfoUpdateTask.join')
410+
@patch('xcvrd.xcvrd.DomThermalInfoUpdateTask.join')
408411
@patch('xcvrd.xcvrd.SfpStateUpdateTask.join')
409412
@patch('xcvrd.xcvrd.SffManagerTask.join')
410413
def test_DaemonXcvrd_run_with_exception(self, mock_task_join_sff, mock_task_join_sfp,
411-
mock_task_join_dom, mock_init, mock_os_kill):
414+
mock_task_join_dom, mock_task_join_dom_thermal,
415+
mock_init, mock_os_kill):
412416
mock_init.return_value = PortMapping()
413417
xcvrd = DaemonXcvrd(SYSLOG_IDENTIFIER)
414418
xcvrd.enable_sff_mgr = True
415419
xcvrd.load_feature_flags = MagicMock()
416420
xcvrd.stop_event.wait = MagicMock()
417421
xcvrd.run()
418422

419-
assert len(xcvrd.threads) == 4
423+
assert len(xcvrd.threads) == 5
420424
assert mock_init.call_count == 1
421425
assert mock_task_join_sff.call_count == 1
422426
assert mock_task_join_sfp.call_count == 1
423427
assert mock_task_join_dom.call_count == 1
428+
assert mock_task_join_dom_thermal.call_count == 1
424429
assert mock_os_kill.call_count == 1
425430

426431
class TestXcvrdScript(object):
@@ -644,6 +649,64 @@ def mock_get_transceiver_dom_sensor_real_value(physical_port):
644649
dom_db_utils.post_port_dom_sensor_info_to_db(logical_port_name, db_cache=db_cache)
645650
assert dom_tbl.get_size_for_key(logical_port_name) == 27
646651

652+
def test_post_port_dom_temperature_info_to_db(self):
653+
def mock_get_transceiver_dom_temperature(physical_port):
654+
return {
655+
'temperature': '68.75',
656+
}
657+
658+
logical_port_name = "Ethernet0"
659+
port_mapping = PortMapping()
660+
port_mapping.get_logical_to_physical = MagicMock(return_value=[0])
661+
xcvr_table_helper = XcvrTableHelper(DEFAULT_NAMESPACE)
662+
stop_event = threading.Event()
663+
mock_sfp_obj_dict = {0 : MagicMock()}
664+
665+
dom_db_utils = DOMDBUtils(mock_sfp_obj_dict, port_mapping, xcvr_table_helper, stop_event, helper_logger)
666+
dom_db_utils.dom_utils = MagicMock()
667+
dom_db_utils.xcvrd_utils.get_transceiver_presence = MagicMock(return_value=False)
668+
dom_db_utils.xcvrd_utils.is_transceiver_flat_memory = MagicMock(return_value=False)
669+
dom_tbl = Table("STATE_DB", TRANSCEIVER_DOM_TEMPERATURE_TABLE)
670+
dom_db_utils.xcvr_table_helper.get_dom_temperature_tbl = MagicMock(return_value=dom_tbl)
671+
dom_db_utils.dom_utils.get_transceiver_dom_temperature = MagicMock(return_value=None)
672+
assert dom_tbl.get_size() == 0
673+
674+
# Ensure table is empty asic_index is None
675+
port_mapping.get_asic_id_for_logical_port = MagicMock(return_value=None)
676+
dom_db_utils.post_port_dom_temperature_info_to_db(logical_port_name)
677+
assert dom_tbl.get_size() == 0
678+
679+
# Set asic_index to 0
680+
port_mapping.get_asic_id_for_logical_port = MagicMock(return_value=0)
681+
682+
# Ensure table is empty if stop_event is set
683+
stop_event.set()
684+
dom_db_utils.post_port_dom_temperature_info_to_db(logical_port_name)
685+
assert dom_tbl.get_size() == 0
686+
stop_event.clear()
687+
688+
# Ensure table is empty if transceiver is not present
689+
dom_db_utils.post_port_dom_temperature_info_to_db(logical_port_name)
690+
assert dom_tbl.get_size() == 0
691+
dom_db_utils.return_value = True
692+
693+
# Ensure table is empty if get_values_func returns None
694+
dom_db_utils.xcvrd_utils.get_transceiver_presence = MagicMock(return_value=True)
695+
dom_db_utils.post_port_dom_temperature_info_to_db(logical_port_name)
696+
assert dom_tbl.get_size() == 0
697+
698+
# Ensure table is populated if get_values_func returns valid values
699+
db_cache = {}
700+
dom_db_utils.dom_utils.get_transceiver_dom_temperature = MagicMock(side_effect=mock_get_transceiver_dom_temperature)
701+
dom_db_utils.post_port_dom_temperature_info_to_db(logical_port_name, db_cache=db_cache)
702+
assert dom_tbl.get_size_for_key(logical_port_name) == 2
703+
704+
# Ensure db_cache is populated correctly
705+
assert db_cache.get(0) is not None
706+
dom_db_utils.dom_utils.get_transceiver_dom_temperature = MagicMock(return_value=None)
707+
dom_db_utils.post_port_dom_temperature_info_to_db(logical_port_name, db_cache=db_cache)
708+
assert dom_tbl.get_size_for_key(logical_port_name) == 2
709+
647710
def test_post_port_dom_flags_to_db(self):
648711
def mock_get_transceiver_dom_flags(physical_port):
649712
return {
@@ -4582,6 +4645,7 @@ def test_DaemonXcvrd_init_deinit_fastboot_enabled(self, mock_del_port_sfp_dom_in
45824645
status_sw_tbl = MagicMock()
45834646
xcvrd.xcvr_table_helper.get_status_sw_tbl = MagicMock(return_value=status_sw_tbl)
45844647
xcvrd.xcvr_table_helper.get_dom_tbl = MagicMock(return_value=MagicMock)
4648+
xcvrd.xcvr_table_helper.get_dom_temperature_tbl = MagicMock(return_value=MagicMock)
45854649
xcvrd.xcvr_table_helper.get_dom_flag_tbl = MagicMock()
45864650
xcvrd.xcvr_table_helper.get_dom_flag_change_count_tbl = MagicMock()
45874651
xcvrd.xcvr_table_helper.get_dom_flag_set_time_tbl = MagicMock()
@@ -4629,6 +4693,7 @@ def test_DaemonXcvrd_init_deinit_cold(self, mock_del_port_sfp_dom_info_from_db):
46294693
status_sw_tbl = MagicMock()
46304694
xcvrdaemon.xcvr_table_helper.get_status_sw_tbl = MagicMock(return_value=status_sw_tbl)
46314695
xcvrdaemon.xcvr_table_helper.get_dom_tbl = MagicMock(return_value=MagicMock)
4696+
xcvrdaemon.xcvr_table_helper.get_dom_temperature_tbl = MagicMock(return_value=MagicMock)
46324697
xcvrdaemon.xcvr_table_helper.get_dom_threshold_tbl = MagicMock(return_value=MagicMock)
46334698
xcvrdaemon.xcvr_table_helper.get_dom_flag_tbl = MagicMock()
46344699
xcvrdaemon.xcvr_table_helper.get_dom_flag_change_count_tbl = MagicMock()

sonic-xcvrd/xcvrd/dom/dom_mgr.py

Lines changed: 116 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import copy
1414
import sys
1515
import re
16+
import time
1617

1718
from natsort import natsorted
1819
from sonic_py_common import syslogger
@@ -33,32 +34,19 @@
3334

3435
SYSLOG_IDENTIFIER_DOMINFOUPDATETASK = "DomInfoUpdateTask"
3536

36-
class DomInfoUpdateTask(threading.Thread):
37-
DOM_INFO_UPDATE_PERIOD_SECS = 60
38-
DIAG_DB_UPDATE_TIME_AFTER_LINK_CHANGE = 1
39-
DOM_PORT_CHG_OBSERVER_TBL_MAP = [
40-
{'APPL_DB': 'PORT_TABLE', 'FILTER': ['flap_count']},
41-
]
37+
class DomInfoUpdateBase(threading.Thread):
4238

43-
def __init__(self, namespaces, port_mapping, sfp_obj_dict, main_thread_stop_event, skip_cmis_mgr):
39+
name = ''
40+
41+
def __init__(self, namespaces, port_mapping, sfp_obj_dict, main_thread_stop_event):
4442
threading.Thread.__init__(self)
45-
self.name = "DomInfoUpdateTask"
4643
self.exc = None
4744
self.task_stopping_event = threading.Event()
4845
self.main_thread_stop_event = main_thread_stop_event
4946
self.helper_logger = syslogger.SysLogger(SYSLOG_IDENTIFIER_DOMINFOUPDATETASK, enable_runtime_config=True)
5047
self.port_mapping = copy.deepcopy(port_mapping)
5148
self.namespaces = namespaces
52-
self.skip_cmis_mgr = skip_cmis_mgr
5349
self.sfp_obj_dict = sfp_obj_dict
54-
self.link_change_affected_ports = {}
55-
self.xcvr_table_helper = XcvrTableHelper(self.namespaces)
56-
self.xcvrd_utils = XCVRDUtils(self.sfp_obj_dict, helper_logger)
57-
self.dom_db_utils = DOMDBUtils(self.sfp_obj_dict, self.port_mapping, self.xcvr_table_helper, self.task_stopping_event, self.helper_logger)
58-
self.db_utils = self.dom_db_utils
59-
self.vdm_utils = VDMUtils(self.sfp_obj_dict, self.helper_logger)
60-
self.vdm_db_utils = VDMDBUtils(self.sfp_obj_dict, self.port_mapping, self.xcvr_table_helper, self.task_stopping_event, self.helper_logger)
61-
self.status_db_utils = StatusDBUtils(self.sfp_obj_dict, self.port_mapping, self.xcvr_table_helper, self.task_stopping_event, self.helper_logger)
6250

6351
def log_debug(self, message):
6452
self.helper_logger.log_debug("{}".format(message))
@@ -75,6 +63,14 @@ def log_warning(self, message):
7563
def log_error(self, message):
7664
self.helper_logger.log_error("{}".format(message))
7765

66+
def on_port_config_change(self, port_change_event):
67+
if port_change_event.event_type == port_event_helper.PortChangeEvent.PORT_REMOVE:
68+
self.on_remove_logical_port(port_change_event)
69+
self.port_mapping.handle_port_change_event(port_change_event)
70+
71+
def on_remove_logical_port(self, port_change_event):
72+
pass
73+
7874
def get_dom_polling_from_config_db(self, lport):
7975
"""
8076
Returns the value of dom_polling field from PORT table in CONFIG_DB
@@ -110,6 +106,51 @@ def get_dom_polling_from_config_db(self, lport):
110106

111107
return dom_polling
112108

109+
110+
def is_port_dom_monitoring_disabled(self, logical_port_name):
111+
return self.get_dom_polling_from_config_db(logical_port_name) == 'disabled'
112+
113+
def task_worker(self):
114+
pass
115+
116+
def run(self):
117+
if self.task_stopping_event.is_set():
118+
return
119+
try:
120+
self.task_worker()
121+
except Exception as e:
122+
self.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
123+
xcvrd.log_exception_traceback()
124+
self.exc = e
125+
self.main_thread_stop_event.set()
126+
127+
def join(self):
128+
self.task_stopping_event.set()
129+
threading.Thread.join(self)
130+
if self.exc:
131+
raise self.exc
132+
133+
class DomInfoUpdateTask(DomInfoUpdateBase):
134+
name = "DomInfoUpdateTask"
135+
136+
DOM_INFO_UPDATE_PERIOD_SECS = 60
137+
DIAG_DB_UPDATE_TIME_AFTER_LINK_CHANGE = 1
138+
DOM_PORT_CHG_OBSERVER_TBL_MAP = [
139+
{'APPL_DB': 'PORT_TABLE', 'FILTER': ['flap_count']},
140+
]
141+
142+
def __init__(self, namespaces, port_mapping, sfp_obj_dict, main_thread_stop_event, skip_cmis_mgr):
143+
super().__init__(namespaces, port_mapping, sfp_obj_dict, main_thread_stop_event)
144+
self.skip_cmis_mgr = skip_cmis_mgr
145+
self.link_change_affected_ports = {}
146+
self.xcvr_table_helper = XcvrTableHelper(self.namespaces)
147+
self.xcvrd_utils = XCVRDUtils(self.sfp_obj_dict, helper_logger)
148+
self.dom_db_utils = DOMDBUtils(self.sfp_obj_dict, self.port_mapping, self.xcvr_table_helper, self.task_stopping_event, self.helper_logger)
149+
self.db_utils = self.dom_db_utils
150+
self.vdm_utils = VDMUtils(self.sfp_obj_dict, self.helper_logger)
151+
self.vdm_db_utils = VDMDBUtils(self.sfp_obj_dict, self.port_mapping, self.xcvr_table_helper, self.task_stopping_event, self.helper_logger)
152+
self.status_db_utils = StatusDBUtils(self.sfp_obj_dict, self.port_mapping, self.xcvr_table_helper, self.task_stopping_event, self.helper_logger)
153+
113154
"""
114155
Checks if the port is going through CMIS initialization process
115156
This API assumes CMIS_STATE_UNKNOWN as a transitional state since it is the
@@ -339,23 +380,6 @@ def task_worker(self):
339380

340381
self.log_notice("Stop DOM monitoring loop")
341382

342-
def run(self):
343-
if self.task_stopping_event.is_set():
344-
return
345-
try:
346-
self.task_worker()
347-
except Exception as e:
348-
self.log_error("Exception occured at {} thread due to {}".format(threading.current_thread().getName(), repr(e)))
349-
common.log_exception_traceback()
350-
self.exc = e
351-
self.main_thread_stop_event.set()
352-
353-
def join(self):
354-
self.task_stopping_event.set()
355-
threading.Thread.join(self)
356-
if self.exc:
357-
raise self.exc
358-
359383
def on_port_update_event(self, port_change_event):
360384
"""Called when a port change event is received
361385
@@ -427,11 +451,6 @@ def update_port_db_diagnostics_on_link_change(self, physical_port):
427451
self.log_warning(f"Update DB diagnostics during link change: Got exception {repr(e)} while processing vdm flags for port {first_logical_port}, ignored")
428452
return
429453

430-
def on_port_config_change(self, port_change_event):
431-
if port_change_event.event_type == port_event_helper.PortChangeEvent.PORT_REMOVE:
432-
self.on_remove_logical_port(port_change_event)
433-
self.port_mapping.handle_port_change_event(port_change_event)
434-
435454
def on_remove_logical_port(self, port_change_event):
436455
"""Called when a logical port is removed from CONFIG_DB
437456
@@ -444,6 +463,7 @@ def on_remove_logical_port(self, port_change_event):
444463
common.del_port_sfp_dom_info_from_db(port_change_event.port_name,
445464
self.port_mapping,
446465
[self.xcvr_table_helper.get_dom_tbl(port_change_event.asic_id),
466+
self.xcvr_table_helper.get_dom_temperature_tbl(port_change_event.asic_id),
447467
self.xcvr_table_helper.get_dom_flag_tbl(port_change_event.asic_id),
448468
self.xcvr_table_helper.get_dom_flag_change_count_tbl(port_change_event.asic_id),
449469
self.xcvr_table_helper.get_dom_flag_set_time_tbl(port_change_event.asic_id),
@@ -461,3 +481,60 @@ def on_remove_logical_port(self, port_change_event):
461481
self.xcvr_table_helper.get_pm_tbl(port_change_event.asic_id),
462482
self.xcvr_table_helper.get_firmware_info_tbl(port_change_event.asic_id)
463483
])
484+
485+
class DomThermalInfoUpdateTask(DomInfoUpdateBase):
486+
name = 'DomThermalInfoUpdateTask'
487+
488+
DOM_INFO_UPDATE_PERIOD_SECS = 5
489+
490+
def __init__(self, namespaces, port_mapping, sfp_obj_dict, main_thread_stop_event):
491+
super().__init__(namespaces, port_mapping, sfp_obj_dict, main_thread_stop_event)
492+
self.xcvr_table_helper = XcvrTableHelper(self.namespaces)
493+
self.dom_db_utils = DOMDBUtils(self.sfp_obj_dict, self.port_mapping, self.xcvr_table_helper, self.task_stopping_event, self.helper_logger)
494+
495+
def task_worker(self):
496+
self.log_notice("Start DOM thermal monitoring loop")
497+
498+
# Set the periodic db update time
499+
dom_info_update_periodic_secs = self.DOM_INFO_UPDATE_PERIOD_SECS
500+
501+
# Poll transceiver temperature as soon as possible
502+
next_periodic_db_update_time = datetime.datetime.now()
503+
504+
# Start loop to update dom info in DB periodically and handle port change events
505+
while not self.task_stopping_event.is_set():
506+
# Check if periodic db update is needed
507+
now = datetime.datetime.now()
508+
if next_periodic_db_update_time > now:
509+
# Sleep for 1 second or less depending on the remaining time
510+
time.sleep(min(1, (next_periodic_db_update_time - now).total_seconds()))
511+
continue
512+
513+
for physical_port, logical_ports in self.port_mapping.physical_to_logical.items():
514+
# Get the first logical port name since it corresponds to the first subport
515+
# of the breakout group
516+
logical_port_name = logical_ports[0]
517+
518+
if self.is_port_dom_monitoring_disabled(logical_port_name):
519+
continue
520+
521+
# Get the asic to which this port belongs
522+
asic_index = self.port_mapping.get_asic_id_for_logical_port(logical_port_name)
523+
if asic_index is None:
524+
self.log_warning("Got invalid asic index for {}, ignored".format(logical_port_name))
525+
continue
526+
527+
if not sfp_status_helper.detect_port_in_error_status(logical_port_name, self.xcvr_table_helper.get_status_sw_tbl(asic_index)):
528+
if not xcvrd._wrapper_get_presence(physical_port):
529+
continue
530+
531+
try:
532+
self.dom_db_utils.post_port_dom_temperature_info_to_db(logical_port_name)
533+
except (KeyError, TypeError) as e:
534+
#continue to process next port since exception could be raised due to port reset, transceiver removal
535+
self.log_warning("Got exception {} while processing dom info for port {}, ignored".format(repr(e), logical_port_name))
536+
537+
# Set the periodic db update time after all the ports are processed
538+
next_periodic_db_update_time = now + datetime.timedelta(seconds=dom_info_update_periodic_secs)
539+
540+
self.log_notice("Stop DOM thermal monitoring loop")

sonic-xcvrd/xcvrd/dom/utilities/dom_sensor/db_utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,19 @@ def __init__(self, sfp_obj_dict, port_mapping, xcvr_table_helper, task_stopping_
2424
self.dom_utils = DOMUtils(self.sfp_obj_dict, logger)
2525
self.logger = logger
2626

27+
def post_port_dom_temperature_info_to_db(self, logical_port_name, db_cache=None):
28+
asic_index = self.port_mapping.get_asic_id_for_logical_port(logical_port_name)
29+
if asic_index is None:
30+
self.logger.log_error(f"Post port dom sensor info to db failed for {logical_port_name} "
31+
"as no asic index found")
32+
return
33+
34+
return self.post_diagnostic_values_to_db(logical_port_name,
35+
self.xcvr_table_helper.get_dom_temperature_tbl(asic_index),
36+
self.dom_utils.get_transceiver_dom_temperature,
37+
db_cache=db_cache,
38+
beautify_func=self._beautify_dom_info_dict)
39+
2740
def post_port_dom_sensor_info_to_db(self, logical_port_name, db_cache=None):
2841
asic_index = self.port_mapping.get_asic_id_for_logical_port(logical_port_name)
2942
if asic_index is None:

sonic-xcvrd/xcvrd/dom/utilities/dom_sensor/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ def __init__(self, sfp_obj_dict, logger):
77
self.sfp_obj_dict = sfp_obj_dict
88
self.logger = logger
99

10+
def get_transceiver_dom_temperature(self, physical_port):
11+
try:
12+
return {
13+
'temperature': self.sfp_obj_dict[physical_port].get_temperature(),
14+
}
15+
except (NotImplementedError):
16+
return {}
17+
1018
def get_transceiver_dom_sensor_real_value(self, physical_port):
1119
try:
1220
return self.sfp_obj_dict[physical_port].get_transceiver_dom_real_value()

0 commit comments

Comments
 (0)