Skip to content

Commit 3cc06dd

Browse files
committed
[GB200] Restore locking mechanism to update IMEX nodes config.
Even though the config file is now local and we assume exclusive job allocation, keeping the locking mechanism prevents IMEX from being impacted when concurrent jobs are submitted.
1 parent 3197bcc commit 3cc06dd

File tree

1 file changed

+34
-3
lines changed

1 file changed

+34
-3
lines changed

tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,40 @@ function write_file() {
9191
return 1 # Not Updated
9292
fi
9393

94-
echo "${_content}" > "${_file}"
95-
info "File ${_file} updated"
96-
return 0 # Updated
94+
# Try to acquire lock with timeout
95+
(
96+
if ! flock -x -w ${_lock_timeout_seconds} 200; then
97+
# If timeout, assume deadlock and try to recover
98+
info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery"
99+
exit 1
100+
fi
101+
echo "${_content}" > "${_file}"
102+
) 200>"${_lock_file}"
103+
104+
local _lock_result=$?
105+
106+
if [[ ${_lock_result} -eq 0 ]]; then
107+
return 0 # Updated successfully
108+
fi
109+
110+
# Deadlock recovery: remove stale lock file and retry once
111+
error "Potential deadlock detected for ${_file}, attempting recovery"
112+
rm -f "${_lock_file}"
113+
sleep 1 # Brief pause to avoid race conditions
114+
115+
(
116+
if ! flock -x -w 10 200; then
117+
exit 1
118+
fi
119+
echo "${_content}" > "${_file}"
120+
) 200>"${_lock_file}"
121+
122+
if [[ $? -eq 0 ]]; then
123+
info "Lock acquired after deadlock recovery for ${_file}"
124+
return 0 # Updated
125+
fi
126+
127+
error_exit "Failed to acquire lock for ${_file} even after deadlock recovery"
97128
}
98129

99130
function reload_imex() {

0 commit comments

Comments
 (0)