Skip to content

Commit f7377f2

Browse files
Enhance decommission logic on per-logical-port basis
1 parent fde28a6 commit f7377f2

File tree

2 files changed

+224
-114
lines changed

2 files changed

+224
-114
lines changed

sonic-xcvrd/xcvrd/xcvrd.py

Lines changed: 169 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -477,7 +477,6 @@ class CmisManagerTask(threading.Thread):
477477
CMIS_MODULE_TYPES = ['QSFP-DD', 'QSFP_DD', 'OSFP', 'OSFP-8X', 'QSFP+C']
478478
CMIS_MAX_HOST_LANES = 8
479479
CMIS_EXPIRATION_BUFFER_MS = 2
480-
ALL_LANES_MASK = 0xff
481480

482481
def __init__(self, namespaces, port_mapping, main_thread_stop_event, skip_cmis_mgr=False):
483482
threading.Thread.__init__(self)
@@ -698,98 +697,217 @@ def get_cmis_media_lanes_mask(self, api, appl, lport, subport):
698697

699698
def clear_decomm_pending(self, lport):
    """
    Drop the decommission pending entry of a single logical port.

    Only the entry of ``lport`` is removed; entries of other logical ports
    sharing the same physical port are left untouched.

    Args:
        lport:
            String, logical port name
    """
    port_idx = self.port_dict.get(lport, {}).get('index')
    if port_idx in self.decomm_pending_dict:
        pending_lports = self.decomm_pending_dict[port_idx]
        pending_lports.pop(lport, None)
        # Once the last logical port is gone, forget the physical port too so
        # that membership tests on decomm_pending_dict stay meaningful.
        if not pending_lports:
            del self.decomm_pending_dict[port_idx]
708714

709-
def set_decomm_pending(self, lport):
715+
def set_decomm_pending(self, lport, api):
    """
    Decide whether this logical port must start (or wait for) decommissioning
    and update the decommission pending bookkeeping accordingly.

    Decommissioning is tracked per logical port: for each logical port only
    the host lanes reported by get_host_lanes_mask_requiring_decomm() are
    decommissioned, so that the new appl code can be applied afterwards.

    decomm_pending_dict stores the host lanes pending decommission per
    logical port, i.e.
        self.decomm_pending_dict[physical_port_idx][lport] = lanes_mask_requiring_decomm
    The stored mask can be wider than the logical port's own host lanes.

    Args:
        lport:
            String, logical port name
        api:
            XcvrApi object
    Returns:
        Boolean, True if the caller should skip the rest of the processing
        of the current CMIS state, False otherwise
    """
    own_lanes_mask = self.port_dict[lport]['host_lanes_mask']
    decomm_lanes_mask = self.get_host_lanes_mask_requiring_decomm(lport, api)
    affected_lanes_mask = own_lanes_mask | decomm_lanes_mask

    # Case 1: another logical port is already decommissioning lanes that
    # overlap with the lanes this logical port touches.
    pending_by_others = self.get_decomm_pending_host_lanes_mask(lport, exclude_lports=[lport])
    if affected_lanes_mask & pending_by_others:
        self.clear_decomm_pending(lport)

        if affected_lanes_mask & self.get_decomm_failed_host_lanes_mask(lport):
            # The overlapping decommission has failed, so fail this lport as well
            self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)
            decomm_status_str = "failed"
        else:
            decomm_status_str = "waiting for completion"

        self.log_notice("{}: DECOMM: decommission initiated by other lports is still in progress on "
                        "host lanes {:#x}, {}".format(lport, affected_lanes_mask, decomm_status_str))
        return True

    # Case 2: nothing to decommission. A stale pending entry for this lport
    # (if any) is no longer needed.
    if not decomm_lanes_mask:
        self.clear_decomm_pending(lport)
        return False

    # Case 3: this logical port itself runs the decommission.
    self.decomm_pending_dict.setdefault(self.port_dict[lport]['index'], {})[lport] = decomm_lanes_mask
    self.log_notice("{}: DECOMM: setting decomm_pending for host lanes "
                    "{:#x}".format(lport, decomm_lanes_mask))

    # appl 0 together with the decommission lane mask is what the rest of the
    # state machine will apply for this lport.
    port_entry = self.port_dict[lport]
    port_entry['appl'] = 0
    port_entry['host_lanes_mask'] = decomm_lanes_mask
    port_entry['media_lanes_mask'] = decomm_lanes_mask
    self.log_notice("{}: DECOMM: setting appl={} and host_lanes_mask/media_lanes_mask="
                    "{:#x}".format(lport, self.port_dict[lport]['appl'], decomm_lanes_mask))

    # Skip rest of the deinit/pre-init when this is the logical port doing decommission
    self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_DP_DEINIT)
    return True
725776

726-
def is_decomm_lead_lport(self, lport):
777+
def is_decomm_pending(self, lport):
    """
    Tell whether this logical port has its own decommission pending entry.

    Args:
        lport:
            String, logical port name
    Returns:
        Boolean, True if decommission pending, False otherwise
    """
    physical_port_idx = self.port_dict[lport]['index']
    pending_lports = self.decomm_pending_dict.get(physical_port_idx, {})
    return lport in pending_lports
737788

738-
def is_decomm_pending(self, lport):
789+
def get_decomm_pending_host_lanes_mask(self, lport, exclude_lports=None):
    """
    Get the host lanes in decommission pending status for the entire
    physical port the given logical port belongs to.

    Args:
        lport:
            String, logical port name
        exclude_lports:
            Iterable of logical port names whose pending lanes are left out
            of the mask; None (the default) excludes nothing
    Returns:
        Integer, bitmask of host lanes that are decommission pending
    """
    # None is used instead of a literal [] default: a list default is created
    # once at definition time and shared by every call.
    excluded = () if exclude_lports is None else exclude_lports

    physical_port_idx = self.port_dict[lport]['index']

    decomm_pending_mask = 0
    # A physical port without any pending entry yields an empty mapping, so
    # the loop does not run and the mask stays 0.
    for logical_port, mask in self.decomm_pending_dict.get(physical_port_idx, {}).items():
        if logical_port not in excluded:
            decomm_pending_mask |= mask

    return decomm_pending_mask
814+
815+
def get_decomm_failed_host_lanes_mask(self, lport):
    """
    Collect the host lanes whose decommission has failed on the physical
    port the given logical port belongs to.

    A pending logical port contributes its lanes when its CMIS state read
    from the state DB equals CMIS_STATE_FAILED.

    Args:
        lport:
            String, logical port name
    Returns:
        Integer, bitmask of host lanes in decommissioning failed state
    """
    port_idx = self.port_dict[lport]['index']
    if port_idx not in self.decomm_pending_dict:
        return 0

    lanes_failed = 0
    for pending_lport, pending_lanes in self.decomm_pending_dict[port_idx].items():
        status_tbl = self.xcvr_table_helper.get_status_sw_tbl(self.get_asic_id(pending_lport))
        if get_cmis_state_from_state_db(pending_lport, status_tbl) == CMIS_STATE_FAILED:
            lanes_failed |= pending_lanes

    return lanes_failed
840+
841+
def get_host_lanes_mask_requiring_decomm(self, lport, api):
    """
    Work out which host lanes have to be decommissioned before the given
    logical port can apply its new appl code.

    Every host lane of the logical port is inspected. When a lane carries a
    non-zero active appl code that differs from the desired one, the whole
    data path that lane belongs to is recorded as conflicting. If all
    conflicting data paths lie inside the logical port's own lanes, nothing
    needs decommissioning and 0 is returned.

    Args:
        lport:
            String, logical port name
        api:
            XcvrApi object
    Returns:
        Integer, bitmask of host lanes that require decommissioning
    """
    max_host_lanes = self.CMIS_MAX_HOST_LANES

    own_lanes_mask = self.port_dict[lport]['host_lanes_mask']
    app_advt = api.get_application_advertisement()
    active_app_dict = api.get_active_apsel_hostlane()
    app_new = self.port_dict[lport]['appl']

    def data_path_lanes(appl_code, lane):
        """
        Return the host lane mask of the data path that contains ``lane``
        for the advertised application ``appl_code``, or 0 if it cannot be
        determined from the advertisement.
        """
        advertised = app_advt.get(appl_code, {})
        start_options = advertised.get('host_lane_assignment_options')
        lane_count = advertised.get('host_lane_count')
        if not start_options or not lane_count:
            return 0

        width_mask = (1 << lane_count) - 1
        lane_bit = 1 << lane
        # Each set bit in start_options is a lane on which a data path of
        # this application may begin; pick the first one covering ``lane``.
        for first_lane in range(max_host_lanes):
            if start_options & (1 << first_lane):
                candidate = width_mask << first_lane
                if candidate & lane_bit:
                    return candidate
        return 0

    # Identify the configured data paths that share host lanes with this
    # logical port and have conflicting appl codes
    conflicting_mask = 0
    for lane in range(max_host_lanes):
        if not (own_lanes_mask & (1 << lane)):
            continue
        active_appl = active_app_dict.get('ActiveAppSelLane{}'.format(lane + 1), 0)
        if active_appl != 0 and active_appl != app_new:
            conflicting_mask |= data_path_lanes(active_appl, lane)

    # Conflicts fully covered by this lport's own lanes can be overwritten
    # directly; only conflicts reaching outside of them need a decommission.
    if conflicting_mask & ~own_lanes_mask:
        decomm_lanes_mask = conflicting_mask
    else:
        decomm_lanes_mask = 0

    log_func = self.log_notice if decomm_lanes_mask else self.log_debug
    log_func("{}: DECOMM: based on ActiveAppSel(lane 8->1) {}, to apply appl {} on {:#010b}, "
             "host lanes requiring decomm is {:#010b}".format(
                 lport, list(reversed(active_app_dict.values())), app_new, own_lanes_mask,
                 decomm_lanes_mask))
    return decomm_lanes_mask
793911

794912
def is_cmis_application_update_required(self, api, app_new, host_lanes_mask):
795913
"""
@@ -1211,16 +1329,15 @@ def task_worker(self):
12111329
host_lanes_mask = self.port_dict[lport].get('host_lanes_mask', 0)
12121330
appl = self.port_dict[lport].get('appl', 0)
12131331
# appl can be 0 if this lport is in decommission state machine, which should not be considered as failed case.
1214-
if state != CMIS_STATE_INSERTED and not self.is_decomm_lead_lport(lport) and (host_lanes_mask <= 0 or appl < 1):
1332+
if state != CMIS_STATE_INSERTED and not self.is_decomm_pending(lport) and (host_lanes_mask <= 0 or appl < 1):
12151333
self.log_error("{}: Unexpected value for host_lanes_mask {} or appl {} in "
12161334
"{} state".format(lport, host_lanes_mask, appl, state))
12171335
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)
12181336
continue
12191337

12201338
self.log_notice("{}: {}G, lanemask=0x{:x}, CMIS state={}{}, Module state={}, DP state={}, appl {} host_lane_count {} "
12211339
"retries={}".format(lport, int(speed/1000), host_lanes_mask, state,
1222-
"(decommission" + ("*" if self.is_decomm_lead_lport(lport) else "") + ")"
1223-
if self.is_decomm_pending(lport) else "",
1340+
"(decommission)" if self.is_decomm_pending(lport) else "",
12241341
api.get_module_state(), api.get_datapath_state(), appl, host_lane_count, retries))
12251342
if retries > self.CMIS_MAX_RETRIES:
12261343
self.log_error("{}: FAILED".format(lport))
@@ -1264,27 +1381,7 @@ def task_worker(self):
12641381
media_lanes_mask = self.port_dict[lport]['media_lanes_mask']
12651382
self.log_notice("{}: Setting media_lanemask=0x{:x}".format(lport, media_lanes_mask))
12661383

1267-
if self.is_decommission_required(api, appl):
1268-
self.set_decomm_pending(lport)
1269-
1270-
if self.is_decomm_lead_lport(lport):
1271-
# Set all the DP lanes AppSel to unused(0) when non default app code needs to be configured
1272-
self.port_dict[lport]['appl'] = appl = 0
1273-
self.port_dict[lport]['host_lanes_mask'] = host_lanes_mask = self.ALL_LANES_MASK
1274-
self.port_dict[lport]['media_lanes_mask'] = self.ALL_LANES_MASK
1275-
self.log_notice("{}: DECOMMISSION: setting appl={} and "
1276-
"host_lanes_mask/media_lanes_mask={:#x}".format(lport, appl, self.ALL_LANES_MASK))
1277-
# Skip rest of the deinit/pre-init when this is the lead logical port for decommission
1278-
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_DP_DEINIT)
1279-
continue
1280-
elif self.is_decomm_pending(lport):
1281-
if self.is_decomm_failed(lport):
1282-
self.update_port_transceiver_status_table_sw_cmis_state(lport, CMIS_STATE_FAILED)
1283-
decomm_status_str = "failed"
1284-
else:
1285-
decomm_status_str = "waiting for completion"
1286-
self.log_notice("{}: DECOMMISSION: decommission has already started for this physical port, "
1287-
"{}".format(lport, decomm_status_str))
1384+
if self.set_decomm_pending(lport, api):
12881385
continue
12891386

12901387
if self.port_dict[lport]['host_tx_ready'] != 'true' or \
@@ -1444,7 +1541,7 @@ def task_worker(self):
14441541

14451542
# Clear decommission status and invoke CMIS reinit so that normal CMIS initialization can begin
14461543
if self.is_decomm_pending(lport):
1447-
self.log_notice("{}: DECOMMISSION: done for physical port {}".format(lport, self.port_dict[lport]['index']))
1544+
self.log_notice("{}: DECOMM: decommission done for host lanes {:#x}".format(lport, self.port_dict[lport]['host_lanes_mask']))
14481545
self.clear_decomm_pending(lport)
14491546
self.force_cmis_reinit(lport)
14501547
continue

0 commit comments

Comments
 (0)