Skip to content

Commit 88d6d5a

Browse files
keboliuqiluo-msft
authored and committed
[xcvrd] Enhance xcvrd to handle new system level event/error (sonic-net#39)
* make xcvrd wait when system not ready * [xcvrd] fix indents and tense error. * [xcvrd] update retry-until-ready logic in state-machine approach * [xcvrd] on receiving a normal event when "SYSTEM_NOT_READY", transition the state to "SYSTEM_READY" * [xcvrd] set timeout = 0 when entering SYSTEM_READY state. * [xcvrd] update the logic of sfp_state_update_task in a state machine approach; the definition of the state machine is in front of sfp_state_update_task.task_worker * [xcvrd] address review comments * [xcvrd] fix merging conflicts * [xcvrd] improve readability.
1 parent be616c8 commit 88d6d5a

File tree

1 file changed

+182
-29
lines changed

1 file changed

+182
-29
lines changed

sonic-xcvrd/scripts/xcvrd

Lines changed: 182 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,27 @@ XCVRD_MAIN_THREAD_SLEEP_SECS = 60
4343
SFP_STATUS_INSERTED = '1'
4444
SFP_STATUS_REMOVED = '0'
4545

46+
EVENT_ON_ALL_SFP = '-1'
47+
# events definition
48+
SYSTEM_NOT_READY = 'system_not_ready'
49+
SYSTEM_BECOME_READY = 'system_become_ready'
50+
SYSTEM_FAIL = 'system_fail'
51+
NORMAL_EVENT = 'normal'
52+
# states definition
53+
STATE_INIT = 0
54+
STATE_NORMAL = 1
55+
STATE_EXIT = 2
56+
4657
PHYSICAL_PORT_NOT_EXIST = -1
4758
SFP_EEPROM_NOT_READY = -2
4859

4960
SFPUTIL_LOAD_ERROR = 1
5061
PORT_CONFIG_LOAD_ERROR = 2
5162
NOT_IMPLEMENTED_ERROR = 3
5263

64+
RETRY_TIMES_FOR_SYSTEM_READY = 24
65+
RETRY_PERIOD_FOR_SYSTEM_READY_MSECS = 5000
66+
5367
TEMP_UNIT = 'C'
5468
VOLT_UNIT = 'Volts'
5569
POWER_UNIT = 'dBm'
@@ -130,15 +144,15 @@ def _wrapper_get_transceiver_dom_threshold_info(physical_port):
130144

131145
return platform_sfputil.get_transceiver_dom_threshold_info_dict(physical_port)
132146

133-
def _wrapper_get_transceiver_change_event():
147+
def _wrapper_get_transceiver_change_event(timeout):
134148
if platform_chassis is not None:
135149
try:
136-
status, events = platform_chassis.get_change_event()
150+
status, events = platform_chassis.get_change_event(timeout)
137151
sfp_events = events['sfp']
138152
return status, sfp_events
139153
except NotImplementedError:
140154
pass
141-
return platform_sfputil.get_transceiver_change_event()
155+
return platform_sfputil.get_transceiver_change_event(timeout)
142156

143157
# Remove unnecessary unit from the raw data
144158
def beautify_dom_info_dict(dom_info_dict):
@@ -657,6 +671,30 @@ class sfp_state_update_task:
657671
self.task_process = None
658672
self.task_stopping_event = multiprocessing.Event()
659673

674+
def _mapping_event_from_change_event(self, status, port_dict):
675+
"""
676+
mapping from what get_transceiver_change_event returns to event defined in the state machine
677+
the logic is pretty straightforward
678+
"""
679+
if status:
680+
if bool(port_dict):
681+
event = NORMAL_EVENT
682+
else:
683+
event = SYSTEM_BECOME_READY
684+
# here, a simple timeout event whose port_dict is empty is mapped
685+
# into a SYSTEM_BECOME_READY event so that it can be handled
686+
port_dict[EVENT_ON_ALL_SFP] = SYSTEM_BECOME_READY
687+
else:
688+
if EVENT_ON_ALL_SFP in port_dict.keys():
689+
event = port_dict[EVENT_ON_ALL_SFP]
690+
else:
691+
# this should not happen. just for protection
692+
event = SYSTEM_FAIL
693+
port_dict[EVENT_ON_ALL_SFP] = SYSTEM_FAIL
694+
695+
logger.log_debug("mapping from {} {} to {}".format(status, port_dict, event))
696+
return event
697+
660698
def task_worker(self, stopping_event):
661699
logger.log_info("Start SFP monitoring loop")
662700

@@ -672,36 +710,151 @@ class sfp_state_update_task:
672710
swsscommon.APP_PORT_TABLE_NAME)
673711

674712
# Start loop to listen to the sfp change event
713+
# The state migrating sequence:
714+
# 1. When the system starts, it is in "INIT" state, calling get_transceiver_change_event
715+
# with RETRY_PERIOD_FOR_SYSTEM_READY_MSECS as timeout for as many as RETRY_TIMES_FOR_SYSTEM_READY
716+
# times
717+
# 2. Once 'system_become_ready' is returned, the system enters the "NORMAL" state and starts to monitor
718+
# the insertion/removal event of all the SFP modules.
719+
# In this state, receiving any system level event will be treated as an unrecoverable error and cause
720+
# the daemon to exit
721+
722+
# states definition
723+
# - Initial state: INIT, before system ready or a normal event has been received
724+
# - Final state: EXIT
725+
# - other state: NORMAL, after system ready or a normal event has been received
726+
727+
# events definition
728+
# - SYSTEM_NOT_READY
729+
# - SYSTEM_BECOME_READY
730+
# -
731+
# - NORMAL_EVENT
732+
# - sfp insertion/removal
733+
# - timeout returned by sfputil.get_change_event with status = true
734+
# - SYSTEM_FAIL
735+
736+
# State transitions:
737+
# 1. SYSTEM_NOT_READY
738+
# - INIT
739+
# - retry < RETRY_TIMES_FOR_SYSTEM_READY
740+
# retry ++
741+
# - else
742+
# max retry reached, treat as fatal, exit
743+
# - NORMAL
744+
# Treat as a fatal error, exit
745+
# 2. SYSTEM_BECOME_READY
746+
# - INIT
747+
# transition to NORMAL
748+
# - NORMAL
749+
# log the event
750+
# nop
751+
# 3. NORMAL_EVENT
752+
# - INIT (for the vendors who don't implement SYSTEM_BECOME_READY)
753+
# transition to NORMAL
754+
# handle the event normally
755+
# - NORMAL
756+
# handle the event normally
757+
# 4. SYSTEM_FAIL
758+
# treat as a fatal error
759+
760+
# State event next state
761+
# INIT SYSTEM NOT READY INIT / EXIT
762+
# INIT SYSTEM BECOME READY NORMAL
763+
# NORMAL SYSTEM BECOME READY NORMAL
764+
# INIT/NORMAL SYSTEM FAIL EXIT
765+
# INIT/NORMAL NORMAL EVENT NORMAL
766+
# NORMAL SYSTEM NOT READY EXIT
767+
# EXIT -
768+
769+
retry = 0
770+
timeout = RETRY_PERIOD_FOR_SYSTEM_READY_MSECS
771+
state = STATE_INIT
675772
while not stopping_event.is_set():
676-
status, port_dict = _wrapper_get_transceiver_change_event()
677-
if status:
678-
for key, value in port_dict.iteritems():
679-
logical_port_list = platform_sfputil.get_physical_to_logical(int(key))
680-
for logical_port in logical_port_list:
681-
if value == SFP_STATUS_INSERTED:
682-
logger.log_info("Got SFP inserted event {}".format(logical_port))
683-
rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict)
684-
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
685-
if rc == SFP_EEPROM_NOT_READY:
686-
logger.log_warning("SFP EEPROM is not ready. One more try...")
687-
time.sleep(TIME_FOR_SFP_READY_SECS)
688-
post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict)
689-
post_port_dom_info_to_db(logical_port, dom_tbl)
690-
post_port_dom_threshold_info_to_db(logical_port, dom_tbl)
691-
notify_media_setting(logical_port, transceiver_dict, app_port_tbl)
692-
transceiver_dict.clear()
693-
elif value == SFP_STATUS_REMOVED:
694-
logger.log_info("Got SFP removed event {}".format(logical_port))
695-
del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl)
696-
else:
697-
# TODO, SFP return error code, need handle accordingly.
698-
continue
773+
next_state = state
774+
time_start = time.time()
775+
status, port_dict = _wrapper_get_transceiver_change_event(timeout)
776+
logger.log_debug("Got event {} {} in state {}".format(status, port_dict, state))
777+
event = self._mapping_event_from_change_event(status, port_dict)
778+
if event == SYSTEM_NOT_READY:
779+
if state == STATE_INIT:
780+
# system not ready, wait and retry
781+
if retry >= RETRY_TIMES_FOR_SYSTEM_READY:
782+
logger.log_error("System failed to get ready in {} secs or received system error. Exiting...".format((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000)*RETRY_TIMES_FOR_SYSTEM_READY))
783+
next_state = STATE_EXIT
784+
else:
785+
retry = retry + 1
786+
787+
# get_transceiver_change_event may return immediately,
788+
# we want the retry expired in expected time period,
789+
# So need to calc the time diff,
790+
# if the time diff is less than the pre-defined waiting time,
791+
# use sleep() to complete the time.
792+
time_now = time.time()
793+
time_diff = time_now - time_start
794+
if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000:
795+
time.sleep(RETRY_PERIOD_FOR_SYSTEM_READY_MSECS/1000 - time_diff)
796+
elif state == STATE_NORMAL:
797+
logger.log_error("Got system_not_ready in normal state, treat as fatal. Exiting...")
798+
next_state = STATE_EXIT
799+
else:
800+
next_state = STATE_EXIT
801+
elif event == SYSTEM_BECOME_READY:
802+
if state == STATE_INIT:
803+
next_state = STATE_NORMAL
804+
logger.log_info("Got system_become_ready in init state, transmit to normal state")
805+
elif state == STATE_NORMAL:
806+
logger.log_info("Got system_become_ready in normal state, ignored")
807+
else:
808+
next_state = STATE_EXIT
809+
elif event == NORMAL_EVENT:
810+
if state == STATE_NORMAL or state == STATE_INIT:
811+
if state == STATE_INIT:
812+
next_state = STATE_NORMAL
813+
# this is the original logic that handles the transceiver change event
814+
# this can be reached in two cases:
815+
# 1. the state has been normal before got the event
816+
# 2. the state was init and transitions to normal after getting the event.
817+
# this is for the vendors who don't implement "system_not_ready/system_become_ready" logic
818+
for key, value in port_dict.iteritems():
819+
logical_port_list = platform_sfputil.get_physical_to_logical(int(key))
820+
for logical_port in logical_port_list:
821+
if value == SFP_STATUS_INSERTED:
822+
logger.log_info("Got SFP inserted event")
823+
rc = post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict)
824+
# If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
825+
if rc == SFP_EEPROM_NOT_READY:
826+
logger.log_warning("SFP EEPROM is not ready. One more try...")
827+
time.sleep(TIME_FOR_SFP_READY_SECS)
828+
post_port_sfp_info_to_db(logical_port, int_tbl, transceiver_dict)
829+
post_port_dom_info_to_db(logical_port, dom_tbl)
830+
post_port_dom_threshold_info_to_db(logical_port, dom_tbl)
831+
notify_media_setting(logical_port, transceiver_dict, app_port_tbl)
832+
transceiver_dict.clear()
833+
elif value == SFP_STATUS_REMOVED:
834+
logger.log_info("Got SFP removed event")
835+
del_port_sfp_dom_info_from_db(logical_port, int_tbl, dom_tbl)
836+
else:
837+
# TODO, SFP return error code, need handle accordingly.
838+
logger.log_warning("Got unknown event {}, ignored".format(value))
839+
continue
840+
else:
841+
next_state = STATE_EXIT
842+
elif event == SYSTEM_FAIL:
843+
# no matter which state it is currently in, it's fatal
844+
next_state = STATE_EXIT
845+
logger.log_error("Got system_fail event on state {}, exiting".format(state))
699846
else:
700-
# If get_transceiver_change_event() return error, will clean up the DB and then exit
701-
# TODO: next step need to define more error types to handle accordingly.
702-
logger.log_error("Failed to get transceiver change event. Exiting...")
847+
logger.log_warning("Got unknown event {} on state {}.".format(event, state))
848+
849+
if next_state != state:
850+
logger.log_debug("State transmitted from {} to {}".format(state, next_state))
851+
state = next_state
852+
853+
if next_state == STATE_EXIT:
703854
os.kill(os.getppid(), signal.SIGTERM)
704855
break
856+
elif next_state == STATE_NORMAL:
857+
timeout = 0
705858

706859
logger.log_info("Stop SFP monitoring loop")
707860

0 commit comments

Comments
 (0)