From 0cdccc5360c130e0080dde39b4ea519775fdc7b4 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Mon, 22 Sep 2025 09:51:59 -0400 Subject: [PATCH 1/2] [GB200] Restore locking mechanism to update IMEX nodes config. Even though the config file is now local and we assume exclusive job allocation, keeping the locking mechanism prevents IMEX from being impacted when concurrent jobs are submitted. --- .../test_gb200/91_nvidia_imex_prolog.sh | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh index b77fdba42a..7a84bdbf32 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh @@ -91,9 +91,40 @@ function write_file() { return 1 # Not Updated fi - echo "${_content}" > "${_file}" - info "File ${_file} updated" - return 0 # Updated + # Try to acquire lock with timeout + ( + if ! flock -x -w ${_lock_timeout_seconds} 200; then + # If timeout, assume deadlock and try to recover + info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery" + exit 1 + fi + echo "${_content}" > "${_file}" + ) 200>"${_lock_file}" + + local _lock_result=$? + + if [[ ${_lock_result} -eq 0 ]]; then + return 0 # Updated successfully + fi + + # Deadlock recovery: remove stale lock file and retry once + error "Potential deadlock detected for ${_file}, attempting recovery" + rm -f "${_lock_file}" + sleep 1 # Brief pause to avoid race conditions + + ( + if ! flock -x -w 10 200; then + exit 1 + fi + echo "${_content}" > "${_file}" + ) 200>"${_lock_file}" + + if [[ $? 
-eq 0 ]]; then + info "Lock acquired after deadlock recovery for ${_file}" + return 0 # Updated + fi + + error_exit "Failed to acquire lock for ${_file} even after deadlock recovery" } function reload_imex() { From b6c3cb0a1b651300b53162113802097144cf9998 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Mon, 22 Sep 2025 09:57:11 -0400 Subject: [PATCH 2/2] [GB200] Address linter findings. --- .../tests/ultraserver/test_gb200.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index a1cf86bc22..7b7e881d17 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -55,13 +55,19 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list): for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource): - logging.info(f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}") + logging.info( + f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}" + ) rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip) imex_config_content = read_remote_file(rce, "/etc/nvidia-imex/nodes_config.cfg") - imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")] + imex_config_content_clean = [ + line for line in imex_config_content.split("\n") if not line.strip().startswith("#") + ] actual_ips = [ip.strip() for ip in imex_config_content_clean] assert_that(actual_ips).contains_only(*expected_ips) - logging.info(f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}") + logging.info( + f"IMEX nodes 
config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}" + ) def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str):