def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list):
    """Assert that the IMEX nodes config on every compute node lists only the expected IPs.

    For each compute node of ``compute_resource`` in ``queue``, the file
    ``/etc/nvidia-imex/nodes_config.cfg`` is read from the node and its entries
    (comment lines and blank lines are ignored) are checked against ``expected_ips``.

    Args:
        cluster: Cluster whose compute nodes are inspected.
        queue: Name of the queue the compute nodes belong to.
        compute_resource: Name of the compute resource the compute nodes belong to.
        expected_ips: IPs expected in the IMEX nodes config of every compute node.

    Raises:
        AssertionError: Raised by ``assert_that(...).contains_only(...)`` when the
            entries found on a node do not match ``expected_ips``.
    """
    imex_nodes_config_path = "/etc/nvidia-imex/nodes_config.cfg"
    for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
        logging.info(
            f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
        )
        rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
        imex_config_content = read_remote_file(rce, imex_nodes_config_path)
        # Keep only real entries. A blank or whitespace-only line (e.g. left by a trailing
        # newline) would otherwise be reported as an empty "IP" and fail the assertion
        # even though the configured nodes are correct.
        actual_ips = [
            entry
            for entry in (line.strip() for line in imex_config_content.splitlines())
            if entry and not entry.startswith("#")
        ]
        assert_that(actual_ips).contains_only(*expected_ips)
        logging.info(
            f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
        )
b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh @@ -91,9 +91,40 @@ function write_file() { return 1 # Not Updated fi - echo "${_content}" > "${_file}" - info "File ${_file} updated" - return 0 # Updated + # Try to acquire lock with timeout + ( + if ! flock -x -w ${_lock_timeout_seconds} 200; then + # If timeout, assume deadlock and try to recover + info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery" + exit 1 + fi + echo "${_content}" > "${_file}" + ) 200>"${_lock_file}" + + local _lock_result=$? + + if [[ ${_lock_result} -eq 0 ]]; then + return 0 # Updated successfully + fi + + # Deadlock recovery: remove stale lock file and retry once + error "Potential deadlock detected for ${_file}, attempting recovery" + rm -f "${_lock_file}" + sleep 1 # Brief pause to avoid race conditions + + ( + if ! flock -x -w 10 200; then + exit 1 + fi + echo "${_content}" > "${_file}" + ) 200>"${_lock_file}" + + if [[ $? -eq 0 ]]; then + info "Lock acquired after deadlock recovery for ${_file}" + return 0 # Updated + fi + + error_exit "Failed to acquire lock for ${_file} even after deadlock recovery" } function reload_imex() {