Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions tests/integration-tests/tests/ultraserver/test_gb200.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,19 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in

def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list):
for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
logging.info(f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}")
logging.info(
f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
)
rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
imex_config_content = read_remote_file(rce, "/etc/nvidia-imex/nodes_config.cfg")
imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
imex_config_content_clean = [
line for line in imex_config_content.split("\n") if not line.strip().startswith("#")
]
actual_ips = [ip.strip() for ip in imex_config_content_clean]
assert_that(actual_ips).contains_only(*expected_ips)
logging.info(f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}")
logging.info(
f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
)


def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,40 @@ function write_file() {
return 1 # Not Updated
fi

echo "${_content}" > "${_file}"
info "File ${_file} updated"
return 0 # Updated
# Try to acquire lock with timeout
(
if ! flock -x -w ${_lock_timeout_seconds} 200; then
# If timeout, assume deadlock and try to recover
info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery"
exit 1
fi
echo "${_content}" > "${_file}"
) 200>"${_lock_file}"

local _lock_result=$?

if [[ ${_lock_result} -eq 0 ]]; then
return 0 # Updated successfully
fi

# Deadlock recovery: remove stale lock file and retry once
error "Potential deadlock detected for ${_file}, attempting recovery"
rm -f "${_lock_file}"
sleep 1 # Brief pause to avoid race conditions

(
if ! flock -x -w 10 200; then
exit 1
fi
echo "${_content}" > "${_file}"
) 200>"${_lock_file}"

if [[ $? -eq 0 ]]; then
info "Lock acquired after deadlock recovery for ${_file}"
return 0 # Updated
fi

error_exit "Failed to acquire lock for ${_file} even after deadlock recovery"
}

function reload_imex() {
Expand Down
Loading