Skip to content

Commit 742e3cf

Browse files
authored
[action] [PR:24219] retry the chassis db cleanup operations (#1730)
<!-- Please make sure you've read and understood our contributing guidelines: https://github.com/Azure/SONiC/blob/gh-pages/CONTRIBUTING.md ** Make sure all your commits include a signature generated with `git commit -s` ** If this is a bug fix, make sure your description includes "fixes #xxxx", or "closes #xxxx" or "resolves #xxxx" Please provide the following information: --> #### Why I did it When running load_minigraph or reloading configuration on the linecards, the `interface-config.service` restarts, which causes the midplane interface to flap. If swss.sh on the linecards deletes state from chassis_db, some states may not be cleaned up correctly, while others are successfully removed. For example, cleanup for `SYSTEM_NEIGHBOR` or `SYSTEM_INTF` may fail, but SYSTEM_LAG cleanup might succeed. This can lead to inconsistent lag IDs for the remote LC. ##### Work item tracking - Microsoft ADO **35454463** #### How I did it Add logic to retry in swss.sh script. #### How to verify it Run test to do load_minigraph on all the linecards and check for the logs to for remove lag failure for Lags on remote LC. <!-- If PR needs to be backported, then the PR must be tested against the base branch and the earliest backport release branch and provide tested image version on these two branches. For example, if the PR is requested for master, 202211 and 202012, then the requester needs to provide test results on master and 202012. --> #### Which release branch to backport (provide reason below if selected) <!-- - Note we only backport fixes to a release branch, *not* features! - Please also provide a reason for the backporting below. - e.g. - [x] 202006 --> - [ ] 202205 - [ ] 202211 - [ ] 202305 - [ ] 202311 - [ ] 202405 - [ ] 202411 - [ ] 202505 #### Tested branch (Please provide the tested image version) <!-- - Please provide tested image version - e.g. 
- [x] 20201231.100 --> - [ ] <!-- image version 1 --> - [ ] <!-- image version 2 --> #### Description for the changelog <!-- Write a short (one line) summary that describes the changes in this pull request for inclusion in the changelog: --> <!-- Ensure to add label/tag for the feature raised. example - PR#2174 under sonic-utilities repo. where, Generic Config and Update feature has been labelled as GCU. --> #### Link to config_db schema for YANG module changes <!-- Provide a link to config_db schema for the table for which YANG model is defined Link should point to correct section on https://github.com/Azure/sonic-buildimage/blob/master/src/sonic-yang-models/doc/Configuration.md --> #### A picture of a cute animal (not mandatory but encouraged)
1 parent c057635 commit 742e3cf

File tree

1 file changed

+124
-60
lines changed

1 file changed

+124
-60
lines changed

files/scripts/swss.sh

Lines changed: 124 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,120 @@ function clean_up_tables()
107107
end" 0
108108
}
109109

110+
# This function attempts to delete entries using a specified delete function, retrying up to a maximum number of times if the deletion fails.
111+
# Arguments:
112+
# $1 - delete_func: Name of the function to call for deleting the entry.
113+
# $2 - host: Host identifier for the deletion operation.
114+
# $3 - asic: ASIC identifier for the deletion operation.
115+
# $4 - entry_name: Name of the entry being deleted (intended for logging; not currently read by the function).
116+
#
117+
# Behavior:
118+
# - Calls the delete function with host and asic as arguments.
119+
# - If the deletion succeeds (exit code 0), prints the result and returns.
120+
# - If the deletion fails, logs the attempt and retries until max_retries attempts in total have been made (max_retries is 3).
121+
# - Waits 10 seconds between retries.
122+
# - If all attempts fail, logs an error and returns 1.
123+
function retry_delete_entries() {
    # Run a delete function, retrying when it reports failure.
    #
    # Arguments:
    #   $1 - delete_func: name of the function to call
    #   $2 - host: passed to delete_func as its first argument
    #   $3 - asic: passed to delete_func as its second argument
    #
    # Output:
    #   On success, the stdout of delete_func is printed unchanged so the
    #   caller can capture it with $(...).
    #
    # Returns:
    #   0 as soon as delete_func exits with status 0,
    #   1 once max_retries attempts have all failed.
    local delete_func=$1
    local host=$2
    local asic=$3
    local max_retries=3
    local attempt=1
    local result
    while true; do
        # 'result' is declared separately above so that the status tested here
        # is the status of delete_func and not the status of 'local'.
        if result=$($delete_func "$host" "$asic"); then
            # Quoted: forward the output verbatim, without word splitting or
            # pathname expansion.
            echo "$result"
            return 0
        fi

        debug "retrying $delete_func failed host=$host asic=$asic(attempt $attempt/$max_retries)"
        if [[ $attempt -ge $max_retries ]]; then
            debug "Error: $delete_func failed after $max_retries attempts for host=$host asic=$asic"
            return 1
        fi
        attempt=$((attempt+1))
        # Fixed pause before the next attempt.
        sleep 10
    done
}
146+
# Function to delete SYSTEM_NEIGH entries for a given host and asic
147+
function delete_system_neigh_entries() {
    # Delete the CHASSIS_APP_DB keys that match SYSTEM_NEIGH*|<host>|<asic>.
    #
    # Arguments:
    #   $1 - host name used in the key pattern
    #   $2 - asic name used in the key pattern
    #
    # Output: whatever $SONIC_DB_CLI prints for the script result, which is
    #         the number of keys the script deleted.
    # Returns: the exit status of $SONIC_DB_CLI.
    #
    # The variables are function-local so nothing leaks into the caller's
    # scope, and the arguments are quoted so each reaches the script as
    # exactly one ARGV entry.
    local host=$1
    local asic=$2
    $SONIC_DB_CLI CHASSIS_APP_DB EVAL "
    local nn = 0
    -- Escape every non-alphanumeric character of the host name so that it
    -- is matched literally by the Lua pattern below.
    local host = string.gsub(ARGV[1], '(%W)', '%%%1')
    local dev = ARGV[2]
    local ps = 'SYSTEM_NEIGH*|' .. host .. '|' .. dev
    local keylist = redis.call('KEYS', 'SYSTEM_NEIGH*')
    for j,key in ipairs(keylist) do
        if string.match(key, ps) ~= nil then
            redis.call('DEL', key)
            nn = nn + 1
        end
    end
    return nn" 0 "$host" "$asic"
}
164+
# Function to delete SYSTEM_INTERFACE entries for a given host and asic
165+
function delete_system_interface_entries() {
    # Delete the CHASSIS_APP_DB keys that match SYSTEM_INTERFACE*|<host>|<asic>.
    #
    # Arguments:
    #   $1 - host name used in the key pattern
    #   $2 - asic name used in the key pattern
    #
    # Output: whatever $SONIC_DB_CLI prints for the script result, which is
    #         the number of keys the script deleted.
    # Returns: the exit status of $SONIC_DB_CLI.
    #
    # The variables are function-local so nothing leaks into the caller's
    # scope, and the arguments are quoted so each reaches the script as
    # exactly one ARGV entry.
    local host=$1
    local asic=$2
    $SONIC_DB_CLI CHASSIS_APP_DB EVAL "
    local nsi = 0
    -- Escape every non-alphanumeric character of the host name so that it
    -- is matched literally by the Lua pattern below.
    local host = string.gsub(ARGV[1], '(%W)', '%%%1')
    local dev = ARGV[2]
    local ps = 'SYSTEM_INTERFACE*|' .. host .. '|' .. dev
    local keylist = redis.call('KEYS', 'SYSTEM_INTERFACE*')
    for j,key in ipairs(keylist) do
        if string.match(key, ps) ~= nil then
            redis.call('DEL', key)
            nsi = nsi + 1
        end
    end
    return nsi" 0 "$host" "$asic"
}
182+
# Function to delete SYSTEM_LAG_MEMBER_TABLE entries for a given host and asic
183+
function delete_system_lag_member_entries() {
    # Delete the CHASSIS_APP_DB keys that match
    # SYSTEM_LAG_MEMBER_TABLE*|<host>|<asic>.
    #
    # Arguments:
    #   $1 - host name used in the key pattern
    #   $2 - asic name used in the key pattern
    #
    # Output: whatever $SONIC_DB_CLI prints for the script result, which is
    #         the number of keys the script deleted.
    # Returns: the exit status of $SONIC_DB_CLI.
    #
    # The variables are function-local so nothing leaks into the caller's
    # scope, and the arguments are quoted so each reaches the script as
    # exactly one ARGV entry.
    local host=$1
    local asic=$2
    $SONIC_DB_CLI CHASSIS_APP_DB EVAL "
    local nlm = 0
    -- Escape every non-alphanumeric character of the host name so that it
    -- is matched literally by the Lua pattern below.
    local host = string.gsub(ARGV[1], '(%W)', '%%%1')
    local dev = ARGV[2]
    local ps = 'SYSTEM_LAG_MEMBER_TABLE*|' .. host .. '|' .. dev
    local keylist = redis.call('KEYS', 'SYSTEM_LAG_MEMBER_TABLE*')
    for j,key in ipairs(keylist) do
        if string.match(key, ps) ~= nil then
            redis.call('DEL', key)
            nlm = nlm + 1
        end
    end
    return nlm" 0 "$host" "$asic"
}
200+
201+
function delete_system_lag_entries() {
    # Delete the CHASSIS_APP_DB keys that match SYSTEM_LAG_TABLE*|<host>|<asic>...
    # and release the LAG id recorded for each deleted LAG.
    #
    # Arguments:
    #   $1 - host name used in the key pattern
    #   $2 - asic name used in the key pattern
    #
    # For every matching key the script deletes the key, looks up the LAG id
    # in SYSTEM_LAG_ID_TABLE, removes it from SYSTEM_LAG_ID_SET, drops the
    # SYSTEM_LAG_ID_TABLE field and appends the id to SYSTEM_LAG_IDS_FREE_LIST.
    #
    # Output: whatever $SONIC_DB_CLI prints for the script result, which is
    #         the number of SYSTEM_LAG_TABLE keys the script deleted.
    # Returns: the exit status of $SONIC_DB_CLI.
    #
    # The variables are function-local so nothing leaks into the caller's
    # scope, and the arguments are quoted so each reaches the script as
    # exactly one ARGV entry.
    local host=$1
    local asic=$2
    $SONIC_DB_CLI CHASSIS_APP_DB EVAL "
    local nsl = 0
    -- Escape every non-alphanumeric character of the host name so that it
    -- is matched literally by the Lua pattern below.
    local host = string.gsub(ARGV[1], '(%W)', '%%%1')
    local dev = ARGV[2]
    -- The capture is the part of the key after the table name; it is the
    -- field name used in SYSTEM_LAG_ID_TABLE.
    local ps = 'SYSTEM_LAG_TABLE*|' .. '(' .. host .. '|' .. dev ..'.*' .. ')'
    local keylist = redis.call('KEYS', 'SYSTEM_LAG_TABLE*')
    for j,key in ipairs(keylist) do
        local lagname = string.match(key, ps)
        if lagname ~= nil then
            redis.call('DEL', key)
            local lagid = redis.call('HGET', 'SYSTEM_LAG_ID_TABLE', lagname)
            -- HGET yields false when the field is absent. Passing false to
            -- redis.call raises an error and aborts the script after the key
            -- was already deleted, so only release an id that exists.
            if lagid then
                redis.call('SREM', 'SYSTEM_LAG_ID_SET', lagid)
                redis.call('HDEL', 'SYSTEM_LAG_ID_TABLE', lagname)
                redis.call('rpush', 'SYSTEM_LAG_IDS_FREE_LIST', lagid)
            end
            nsl = nsl + 1
        end
    end
    return nsl" 0 "$host" "$asic"
}
223+
110224
# This function cleans up the chassis db table entries created ONLY by this asic
111225
# This is used to do the clean up operation when the line card / asic reboots
112226
# When the asic/lc is RE-booting, the chassis db server is supposed to be running
@@ -150,20 +264,8 @@ function clean_up_chassis_db_tables()
150264
done
151265
debug "Chassis db clean up for ${SERVICE}$DEV. asic=$asic"
152266

153-
# First, delete SYSTEM_NEIGH entries
154-
num_neigh=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
155-
local nn = 0
156-
local host = string.gsub(ARGV[1], '%-', '%%-')
157-
local dev = ARGV[2]
158-
local ps = 'SYSTEM_NEIGH*|' .. host .. '|' .. dev
159-
local keylist = redis.call('KEYS', 'SYSTEM_NEIGH*')
160-
for j,key in ipairs(keylist) do
161-
if string.match(key, ps) ~= nil then
162-
redis.call('DEL', key)
163-
nn = nn + 1
164-
end
165-
end
166-
return nn" 0 $lc $asic`
267+
# First, delete SYSTEM_NEIGH entries using a dedicated function
268+
num_neigh=$(retry_delete_entries delete_system_neigh_entries "$lc" "$asic")
167269

168270
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_NEIGH entries deleted: $num_neigh"
169271

@@ -177,37 +279,16 @@ function clean_up_chassis_db_tables()
177279
sleep 30
178280
fi
179281

180-
# Next, delete SYSTEM_INTERFACE entries
181-
num_sys_intf=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
182-
local nsi = 0
183-
local host = string.gsub(ARGV[1], '%-', '%%-')
184-
local dev = ARGV[2]
185-
local ps = 'SYSTEM_INTERFACE*|' .. host .. '|' .. dev
186-
local keylist = redis.call('KEYS', 'SYSTEM_INTERFACE*')
187-
for j,key in ipairs(keylist) do
188-
if string.match(key, ps) ~= nil then
189-
redis.call('DEL', key)
190-
nsi = nsi + 1
191-
end
192-
end
193-
return nsi" 0 $lc $asic`
282+
# Next, delete SYSTEM_INTERFACE entries
283+
num_sys_intf=$(retry_delete_entries delete_system_interface_entries "$lc" "$asic")
194284

195285
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_INTERFACE entries deleted: $num_sys_intf"
286+
if [[ $num_sys_intf > 0 ]]; then
287+
sleep 15
288+
fi
196289

197-
# Next, delete SYSTEM_LAG_MEMBER_TABLE entries
198-
num_lag_mem=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
199-
local nlm = 0
200-
local host = string.gsub(ARGV[1], '%-', '%%-')
201-
local dev = ARGV[2]
202-
local ps = 'SYSTEM_LAG_MEMBER_TABLE*|' .. host .. '|' .. dev
203-
local keylist = redis.call('KEYS', 'SYSTEM_LAG_MEMBER_TABLE*')
204-
for j,key in ipairs(keylist) do
205-
if string.match(key, ps) ~= nil then
206-
redis.call('DEL', key)
207-
nlm = nlm + 1
208-
end
209-
end
210-
return nlm" 0 $lc $asic`
290+
# Next, delete SYSTEM_LAG_MEMBER_TABLE entries using a dedicated function
291+
num_lag_mem=$(retry_delete_entries delete_system_lag_member_entries "$lc" "$asic")
211292

212293
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_MEMBER_TABLE entries deleted: $num_lag_mem"
213294

@@ -220,24 +301,7 @@ function clean_up_chassis_db_tables()
220301
fi
221302

222303
# Finally, delete SYSTEM_LAG_TABLE entries and deallot LAG IDs
223-
num_sys_lag=`$SONIC_DB_CLI CHASSIS_APP_DB EVAL "
224-
local nsl = 0
225-
local host = string.gsub(ARGV[1], '%-', '%%-')
226-
local dev = ARGV[2]
227-
local ps = 'SYSTEM_LAG_TABLE*|' .. '(' .. host .. '|' .. dev ..'.*' .. ')'
228-
local keylist = redis.call('KEYS', 'SYSTEM_LAG_TABLE*')
229-
for j,key in ipairs(keylist) do
230-
local lagname = string.match(key, ps)
231-
if lagname ~= nil then
232-
redis.call('DEL', key)
233-
local lagid = redis.call('HGET', 'SYSTEM_LAG_ID_TABLE', lagname)
234-
redis.call('SREM', 'SYSTEM_LAG_ID_SET', lagid)
235-
redis.call('HDEL', 'SYSTEM_LAG_ID_TABLE', lagname)
236-
redis.call('rpush', 'SYSTEM_LAG_IDS_FREE_LIST', lagid)
237-
nsl = nsl + 1
238-
end
239-
end
240-
return nsl" 0 $lc $asic`
304+
num_sys_lag=$(retry_delete_entries delete_system_lag_entries "$lc" "$asic")
241305

242306
debug "Chassis db clean up for ${SERVICE}$DEV. Number of SYSTEM_LAG_TABLE entries deleted: $num_sys_lag"
243307

0 commit comments

Comments
 (0)