Skip to content

Commit b13b41f

Browse files
[swss] Chassis db clean up optimization and bug fixes (#16454)
* [swss] Chassis db clean up optimization and bug fixes This commit includes the following changes: - Fix for regression failure due to error in finding CHASSIS_APP_DB in pizzabox (#PR 16451) - After attempting to delete the system neighbor entries from chassis db, before starting to clear the system interface entries, wait for some time only if some system neighbors were deleted. If there are no system neighbor entries deleted for the asic coming up, there is no need to wait. - Similar changes for system lag delete. Before deleting the system lag, wait for some time only if some system lag members were deleted. If there are no system lag members deleted, there is no need to wait. - Flush the SYSTEM_NEIGH_TABLE from the local STATE_DB. While the asic is coming up, when system neigh entries are deleted from chassis app db (as part of chassis db clean up), there are no orchs/processes running to process the delete messages from chassis redis. Because of this, stale system neigh entries are present in the local STATE_DB. The stale entries result in creation of orphan (no corresponding data path/asic db entry) kernel neigh entries during STATE_DB:SYSTEM_NEIGH_TABLE entries processing by nbrmgr (after the swss service came up). This is avoided by flushing the SYSTEM_NEIGH_TABLE from the local STATE_DB when the service comes up. Signed-off-by: vedganes <[email protected]> * [swss] Chassis db clean up bug fixes review comment fix - 1 Debug logs added for deletion of other tables (SYSTEM_INTERFACE and SYSTEM_LAG_TABLE) Signed-off-by: vedganes <[email protected]> --------- Signed-off-by: vedganes <[email protected]>
1 parent 9c1c82e commit b13b41f

File tree

1 file changed

+40
-16
lines changed

1 file changed

+40
-16
lines changed

files/scripts/swss.sh

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -124,72 +124,93 @@ function clean_up_tables()
124124
# SYSTEM_LAG_ID_TABLE and SYSTEM_LAG_ID_SET are adjusted appropriately
125125
function clean_up_chassis_db_tables()
126126
{
127-
if [[ !($($SONIC_DB_CLI CHASSIS_APP_DB PING | grep -c True) -gt 0) ]]; then
128-
return
129-
fi
130127

131-
lc=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'hostname'`
132-
asic=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'asic_name'`
133128
switch_type=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'switch_type'`
134129

135130
# Run clean up only in swss running for voq switches
136131
if is_chassis_supervisor || [[ $switch_type != 'voq' ]]; then
137132
return
138133
fi
139134

135+
if [[ !($($SONIC_DB_CLI CHASSIS_APP_DB PING | grep -c True) -gt 0) ]]; then
136+
return
137+
fi
138+
139+
lc=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'hostname'`
140+
asic=`$SONIC_DB_CLI CONFIG_DB hget 'DEVICE_METADATA|localhost' 'asic_name'`
141+
140142
# First, delete SYSTEM_NEIGH entries
141-
$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
143+
num_neigh=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
144+
local nn = 0
142145
local host = string.gsub(ARGV[1], '%-', '%%-')
143146
local dev = ARGV[2]
144147
local ps = 'SYSTEM_NEIGH*|' .. host .. '|' .. dev
145148
local keylist = redis.call('KEYS', 'SYSTEM_NEIGH*')
146149
for j,key in ipairs(keylist) do
147150
if string.match(key, ps) ~= nil then
148151
redis.call('DEL', key)
152+
nn = nn + 1
149153
end
150154
end
151-
return " 0 $lc $asic
155+
return nn" 0 $lc $asic`
156+
157+
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_NEIGH entries deleted: $num_neigh"
152158

153159
# Wait for some time before deleting system interface so that the system interface's "object in use"
154160
# is cleared in both orchagent and in syncd. Without this delay, the orchagent clears the refcount
155161
# but the syncd (meta) still has non-zero refcount. Because of this, orchagent gets an "object still in use"
156162
# error and aborts.
163+
# This delay is needed only if some system neighbors were deleted.
157164

158-
sleep 30
165+
if [[ $num_neigh > 0 ]]; then
166+
sleep 30
167+
fi
159168

160169
# Next, delete SYSTEM_INTERFACE entries
161-
$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
170+
num_sys_intf=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
171+
local nsi = 0
162172
local host = string.gsub(ARGV[1], '%-', '%%-')
163173
local dev = ARGV[2]
164174
local ps = 'SYSTEM_INTERFACE*|' .. host .. '|' .. dev
165175
local keylist = redis.call('KEYS', 'SYSTEM_INTERFACE*')
166176
for j,key in ipairs(keylist) do
167177
if string.match(key, ps) ~= nil then
168178
redis.call('DEL', key)
179+
nsi = nsi + 1
169180
end
170181
end
171-
return " 0 $lc $asic
182+
return nsi" 0 $lc $asic`
183+
184+
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_INTERFACE entries deleted: $num_sys_intf"
172185

173186
# Next, delete SYSTEM_LAG_MEMBER_TABLE entries
174-
$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
187+
num_lag_mem=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
188+
local nlm = 0
175189
local host = string.gsub(ARGV[1], '%-', '%%-')
176190
local dev = ARGV[2]
177191
local ps = 'SYSTEM_LAG_MEMBER_TABLE*|' .. host .. '|' .. dev
178192
local keylist = redis.call('KEYS', 'SYSTEM_LAG_MEMBER_TABLE*')
179193
for j,key in ipairs(keylist) do
180194
if string.match(key, ps) ~= nil then
181195
redis.call('DEL', key)
196+
nlm = nlm + 1
182197
end
183198
end
184-
return " 0 $lc $asic
199+
return nlm" 0 $lc $asic`
200+
201+
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_MEMBER_TABLE entries deleted: $num_lag_mem"
185202

186203
# Wait for some time before deleting system lag so that all the members of the
187204
# system lag will be cleared.
205+
# This delay is needed only if some system lag members were deleted
188206

189-
sleep 15
207+
if [[ $num_lag_mem > 0 ]]; then
208+
sleep 15
209+
fi
190210

191211
# Finally, delete SYSTEM_LAG_TABLE entries and deallot LAG IDs
192-
$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
212+
num_sys_lag=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
213+
local nsl = 0
193214
local host = string.gsub(ARGV[1], '%-', '%%-')
194215
local dev = ARGV[2]
195216
local ps = 'SYSTEM_LAG_TABLE*|' .. '(' .. host .. '|' .. dev ..'.*' .. ')'
@@ -201,9 +222,12 @@ function clean_up_chassis_db_tables()
201222
local lagid = redis.call('HGET', 'SYSTEM_LAG_ID_TABLE', lagname)
202223
redis.call('SREM', 'SYSTEM_LAG_ID_SET', lagid)
203224
redis.call('HDEL', 'SYSTEM_LAG_ID_TABLE', lagname)
225+
nsl = nsl + 1
204226
end
205227
end
206-
return " 0 $lc $asic
228+
return nsl" 0 $lc $asic`
229+
230+
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_TABLE entries deleted: $num_sys_lag"
207231

208232
}
209233

@@ -275,7 +299,7 @@ start() {
275299
$SONIC_DB_CLI GB_ASIC_DB FLUSHDB
276300
$SONIC_DB_CLI GB_COUNTERS_DB FLUSHDB
277301
$SONIC_DB_CLI RESTAPI_DB FLUSHDB
278-
clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*', 'ADVERTISE_NETWORK_TABLE*', 'VXLAN_TUNNEL_TABLE*', 'VNET_ROUTE*', 'MACSEC_PORT_TABLE*', 'MACSEC_INGRESS_SA_TABLE*', 'MACSEC_EGRESS_SA_TABLE*', 'MACSEC_INGRESS_SC_TABLE*', 'MACSEC_EGRESS_SC_TABLE*', 'VRF_OBJECT_TABLE*', 'VNET_MONITOR_TABLE*', 'BFD_SESSION_TABLE*'"
302+
clean_up_tables STATE_DB "'PORT_TABLE*', 'MGMT_PORT_TABLE*', 'VLAN_TABLE*', 'VLAN_MEMBER_TABLE*', 'LAG_TABLE*', 'LAG_MEMBER_TABLE*', 'INTERFACE_TABLE*', 'MIRROR_SESSION*', 'VRF_TABLE*', 'FDB_TABLE*', 'FG_ROUTE_TABLE*', 'BUFFER_POOL*', 'BUFFER_PROFILE*', 'MUX_CABLE_TABLE*', 'ADVERTISE_NETWORK_TABLE*', 'VXLAN_TUNNEL_TABLE*', 'VNET_ROUTE*', 'MACSEC_PORT_TABLE*', 'MACSEC_INGRESS_SA_TABLE*', 'MACSEC_EGRESS_SA_TABLE*', 'MACSEC_INGRESS_SC_TABLE*', 'MACSEC_EGRESS_SC_TABLE*', 'VRF_OBJECT_TABLE*', 'VNET_MONITOR_TABLE*', 'BFD_SESSION_TABLE*','SYSTEM_NEIGH_TABLE*'"
279303
$SONIC_DB_CLI APPL_STATE_DB FLUSHDB
280304
clean_up_chassis_db_tables
281305
rm -rf /tmp/cache

0 commit comments

Comments
 (0)