@@ -43,13 +43,27 @@ XCVRD_MAIN_THREAD_SLEEP_SECS = 60
# Presence values carried per port in a transceiver change event.
SFP_STATUS_INSERTED = '1'
SFP_STATUS_REMOVED = '0'

# Port-dict key under which a system-level event (one of the event names
# below) is stored, as opposed to a per-port presence value.
EVENT_ON_ALL_SFP = '-1'

# Events consumed by the SFP monitoring state machine.
SYSTEM_NOT_READY = 'system_not_ready'
SYSTEM_BECOME_READY = 'system_become_ready'
SYSTEM_FAIL = 'system_fail'
NORMAL_EVENT = 'normal'

# States of the SFP monitoring state machine.
STATE_INIT = 0
STATE_NORMAL = 1
STATE_EXIT = 2

# Negative status codes; SFP_EEPROM_NOT_READY is what the monitoring loop
# compares the result of post_port_sfp_info_to_db against before retrying.
PHYSICAL_PORT_NOT_EXIST = -1
SFP_EEPROM_NOT_READY = -2

SFPUTIL_LOAD_ERROR = 1
PORT_CONFIG_LOAD_ERROR = 2
NOT_IMPLEMENTED_ERROR = 3

# While the state machine is in STATE_INIT it tolerates up to
# RETRY_TIMES_FOR_SYSTEM_READY "system not ready" events, spacing them
# RETRY_PERIOD_FOR_SYSTEM_READY_MSECS milliseconds apart, before giving up.
RETRY_TIMES_FOR_SYSTEM_READY = 24
RETRY_PERIOD_FOR_SYSTEM_READY_MSECS = 5000

TEMP_UNIT = 'C'
VOLT_UNIT = 'Volts'
POWER_UNIT = 'dBm'
@@ -130,15 +144,15 @@ def _wrapper_get_transceiver_dom_threshold_info(physical_port):
130144
131145 return platform_sfputil .get_transceiver_dom_threshold_info_dict (physical_port )
132146
def _wrapper_get_transceiver_change_event(timeout=None):
    """
    Fetch the next transceiver change event from the platform layer.

    The new platform API (platform_chassis) is preferred; when it is not
    loaded, or it raises NotImplementedError, the legacy platform_sfputil
    plugin is used instead.

    :param timeout: value forwarded unchanged to the platform call. When it
                    is None the platform call is made without any argument,
                    which keeps the older zero-argument call form working.
    :return: (status, sfp_events) where sfp_events is the 'sfp' entry of the
             chassis events, or whatever the legacy plugin returns.
    """
    # Only forward a timeout the caller actually supplied.
    call_args = () if timeout is None else (timeout,)

    if platform_chassis is not None:
        try:
            status, events = platform_chassis.get_change_event(*call_args)
        except NotImplementedError:
            # Chassis does not implement change events; use the legacy plugin.
            pass
        else:
            return status, events['sfp']

    return platform_sfputil.get_transceiver_change_event(*call_args)
142156
143157# Remove unnecessary unit from the raw data
144158def beautify_dom_info_dict (dom_info_dict ):
@@ -657,6 +671,30 @@ class sfp_state_update_task:
657671 self .task_process = None
658672 self .task_stopping_event = multiprocessing .Event ()
659673
def _mapping_event_from_change_event(self, status, port_dict):
    """
    Map what get_transceiver_change_event returns to an event defined in
    the state machine (method of sfp_state_update_task).

    Mapping:
      - status true,  non-empty port_dict -> NORMAL_EVENT
      - status true,  empty port_dict     -> SYSTEM_BECOME_READY
      - status false, EVENT_ON_ALL_SFP key present -> the value stored there
      - status false, key absent          -> SYSTEM_FAIL

    Note: port_dict is modified in place. Whenever the event is synthesized
    here (second and fourth case) it is also written back under the
    EVENT_ON_ALL_SFP key.

    :param status: status flag returned by get_transceiver_change_event
    :param port_dict: dict returned by get_transceiver_change_event
    :return: one of the state machine events
    """
    if status:
        if port_dict:
            event = NORMAL_EVENT
        else:
            # here, a simple timeout event whose port_dict is empty is mapped
            # into a SYSTEM_BECOME_READY event so that it can be handled
            event = SYSTEM_BECOME_READY
            port_dict[EVENT_ON_ALL_SFP] = SYSTEM_BECOME_READY
    elif EVENT_ON_ALL_SFP in port_dict:
        event = port_dict[EVENT_ON_ALL_SFP]
    else:
        # this should not happen. just for protection
        event = SYSTEM_FAIL
        port_dict[EVENT_ON_ALL_SFP] = SYSTEM_FAIL

    logger.log_debug("mapping from {} {} to {}".format(status, port_dict, event))
    return event
697+
660698 def task_worker (self , stopping_event ):
661699 logger .log_info ("Start SFP monitoring loop" )
662700
@@ -672,36 +710,151 @@ class sfp_state_update_task:
672710 swsscommon .APP_PORT_TABLE_NAME )
673711
674712 # Start loop to listen to the sfp change event
713+ # The state transition sequence:
714+ # 1. When the system starts, it is in the "INIT" state and calls get_transceiver_change_event
715+ # with RETRY_PERIOD_FOR_SYSTEM_READY_MSECS as the timeout for as many as RETRY_TIMES_FOR_SYSTEM_READY
716+ # times.
717+ # 2. Once 'system_become_ready' is returned, the system enters the "NORMAL" state and starts to monitor
718+ # the insertion/removal events of all the SFP modules.
719+ # In this state, receiving 'system_not_ready' or 'system_fail' is treated as an unrecoverable error
720+ # and causes the daemon to exit; a repeated 'system_become_ready' is only logged.
721+
722+ # states definition
723+ # - Initial state: INIT, before system-ready or a normal event has been received
724+ # - Final state: EXIT
725+ # - Other state: NORMAL, after system-ready or a normal event has been received
726+
727+ # events definition
728+ # - SYSTEM_NOT_READY
729+ # - SYSTEM_BECOME_READY
730+ # - also produced by a timeout: status = true with an empty port dict
731+ # - NORMAL_EVENT
732+ # - sfp insertion/removal
733+ # - i.e. status = true with a non-empty port dict
734+ # - SYSTEM_FAIL
735+
736+ # State transitions:
737+ # 1. SYSTEM_NOT_READY
738+ # - INIT
739+ # - retry < RETRY_TIMES_FOR_SYSTEM_READY
740+ # retry ++
741+ # - else
742+ # max retry reached, treat as fatal, exit
743+ # - NORMAL
744+ # Treat as a fatal error, exit
745+ # 2. SYSTEM_BECOME_READY
746+ # - INIT
747+ # transition to NORMAL
748+ # - NORMAL
749+ # log the event
750+ # nop
751+ # 3. NORMAL_EVENT
752+ # - INIT (for the vendors who don't implement SYSTEM_BECOME_READY)
753+ # transition to NORMAL
754+ # handle the event normally
755+ # - NORMAL
756+ # handle the event normally
757+ # 4. SYSTEM_FAIL
758+ # treat as a fatal error
759+
760+ # State event next state
761+ # INIT SYSTEM NOT READY INIT / EXIT
762+ # INIT SYSTEM BECOME READY NORMAL
763+ # NORMAL SYSTEM BECOME READY NORMAL
764+ # INIT/NORMAL SYSTEM FAIL EXIT
765+ # INIT/NORMAL NORMAL EVENT NORMAL
766+ # NORMAL SYSTEM NOT READY EXIT
767+ # EXIT -
768+
769+ retry = 0
770+ timeout = RETRY_PERIOD_FOR_SYSTEM_READY_MSECS
771+ state = STATE_INIT
675772 while not stopping_event .is_set ():
676- status , port_dict = _wrapper_get_transceiver_change_event ()
677- if status :
678- for key , value in port_dict .iteritems ():
679- logical_port_list = platform_sfputil .get_physical_to_logical (int (key ))
680- for logical_port in logical_port_list :
681- if value == SFP_STATUS_INSERTED :
682- logger .log_info ("Got SFP inserted event {}" .format (logical_port ))
683- rc = post_port_sfp_info_to_db (logical_port , int_tbl , transceiver_dict )
684- # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
685- if rc == SFP_EEPROM_NOT_READY :
686- logger .log_warning ("SFP EEPROM is not ready. One more try..." )
687- time .sleep (TIME_FOR_SFP_READY_SECS )
688- post_port_sfp_info_to_db (logical_port , int_tbl , transceiver_dict )
689- post_port_dom_info_to_db (logical_port , dom_tbl )
690- post_port_dom_threshold_info_to_db (logical_port , dom_tbl )
691- notify_media_setting (logical_port , transceiver_dict , app_port_tbl )
692- transceiver_dict .clear ()
693- elif value == SFP_STATUS_REMOVED :
694- logger .log_info ("Got SFP removed event {}" .format (logical_port ))
695- del_port_sfp_dom_info_from_db (logical_port , int_tbl , dom_tbl )
696- else :
697- # TODO, SFP return error code, need handle accordingly.
698- continue
773+ next_state = state
774+ time_start = time .time ()
775+ status , port_dict = _wrapper_get_transceiver_change_event (timeout )
776+ logger .log_debug ("Got event {} {} in state {}" .format (status , port_dict , state ))
777+ event = self ._mapping_event_from_change_event (status , port_dict )
778+ if event == SYSTEM_NOT_READY :
779+ if state == STATE_INIT :
780+ # system not ready, wait and retry
781+ if retry >= RETRY_TIMES_FOR_SYSTEM_READY :
782+ logger .log_error ("System failed to get ready in {} secs or received system error. Exiting..." .format ((RETRY_PERIOD_FOR_SYSTEM_READY_MSECS / 1000 )* RETRY_TIMES_FOR_SYSTEM_READY ))
783+ next_state = STATE_EXIT
784+ else :
785+ retry = retry + 1
786+
787+ # get_transceiver_change_event may return immediately, but
788+ # each retry should still take the expected time period.
789+ # So calculate the elapsed time and,
790+ # if it is less than the pre-defined waiting time,
791+ # use sleep() to make up the difference.
792+ time_now = time .time ()
793+ time_diff = time_now - time_start
794+ if time_diff < RETRY_PERIOD_FOR_SYSTEM_READY_MSECS / 1000 :
795+ time .sleep (RETRY_PERIOD_FOR_SYSTEM_READY_MSECS / 1000 - time_diff )
796+ elif state == STATE_NORMAL :
797+ logger .log_error ("Got system_not_ready in normal state, treat as fatal. Exiting..." )
798+ next_state = STATE_EXIT
799+ else :
800+ next_state = STATE_EXIT
801+ elif event == SYSTEM_BECOME_READY :
802+ if state == STATE_INIT :
803+ next_state = STATE_NORMAL
804+ logger .log_info ("Got system_become_ready in init state, transmit to normal state" )
805+ elif state == STATE_NORMAL :
806+ logger .log_info ("Got system_become_ready in normal state, ignored" )
807+ else :
808+ next_state = STATE_EXIT
809+ elif event == NORMAL_EVENT :
810+ if state == STATE_NORMAL or state == STATE_INIT :
811+ if state == STATE_INIT :
812+ next_state = STATE_NORMAL
813+ # this is the original logic that handled the transceiver change event
814+ # this can be reached in two cases:
815+ # 1. the state was already normal before the event was received
816+ # 2. the state was init and transitions to normal upon receiving the event.
817+ # this is for the vendors who don't implement "system_not_ready/system_become_ready" logic
818+ for key , value in port_dict .iteritems ():
819+ logical_port_list = platform_sfputil .get_physical_to_logical (int (key ))
820+ for logical_port in logical_port_list :
821+ if value == SFP_STATUS_INSERTED :
822+ logger .log_info ("Got SFP inserted event" )
823+ rc = post_port_sfp_info_to_db (logical_port , int_tbl , transceiver_dict )
824+ # If we didn't get the sfp info, assuming the eeprom is not ready, give a try again.
825+ if rc == SFP_EEPROM_NOT_READY :
826+ logger .log_warning ("SFP EEPROM is not ready. One more try..." )
827+ time .sleep (TIME_FOR_SFP_READY_SECS )
828+ post_port_sfp_info_to_db (logical_port , int_tbl , transceiver_dict )
829+ post_port_dom_info_to_db (logical_port , dom_tbl )
830+ post_port_dom_threshold_info_to_db (logical_port , dom_tbl )
831+ notify_media_setting (logical_port , transceiver_dict , app_port_tbl )
832+ transceiver_dict .clear ()
833+ elif value == SFP_STATUS_REMOVED :
834+ logger .log_info ("Got SFP removed event" )
835+ del_port_sfp_dom_info_from_db (logical_port , int_tbl , dom_tbl )
836+ else :
837+ # TODO, SFP return error code, need handle accordingly.
838+ logger .log_warning ("Got unknown event {}, ignored" .format (value ))
839+ continue
840+ else :
841+ next_state = STATE_EXIT
842+ elif event == SYSTEM_FAIL :
843+ # no matter which state it is currently in, this is fatal
844+ next_state = STATE_EXIT
845+ logger .log_error ("Got system_fail event on state {}, exiting" .format (state ))
699846 else :
700- # If get_transceiver_change_event() return error, will clean up the DB and then exit
701- # TODO: next step need to define more error types to handle accordingly.
702- logger .log_error ("Failed to get transceiver change event. Exiting..." )
847+ logger .log_warning ("Got unknown event {} on state {}." .format (event , state ))
848+
849+ if next_state != state :
850+ logger .log_debug ("State transmitted from {} to {}" .format (state , next_state ))
851+ state = next_state
852+
853+ if next_state == STATE_EXIT :
703854 os .kill (os .getppid (), signal .SIGTERM )
704855 break
856+ elif next_state == STATE_NORMAL :
857+ timeout = 0
705858
706859 logger .log_info ("Stop SFP monitoring loop" )
707860
0 commit comments