Skip to content

Commit 65160b2

Browse files
committed
[GB200] Adapt the prolog used to configure IMEX so that it uses the local IMEX nodes config, rather than the shared one.
1 parent 25ff751 commit 65160b2

File tree

2 files changed

+14
-48
lines changed

2 files changed

+14
-48
lines changed

tests/integration-tests/tests/ultraserver/test_gb200.py

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,18 +53,15 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in
5353
return job_id
5454

5555

56-
def assert_imex_nodes_config_is_correct(
57-
rce: RemoteCommandExecutor, queue_name: str, compute_resource_name: str, expected_ips: list
58-
):
59-
logging.info(f"Checking IMEX nodes config contains the expected nodes: {expected_ips}")
60-
imex_nodes_config_file = (
61-
f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{queue_name}_{compute_resource_name}.cfg"
62-
)
63-
imex_config_content = read_remote_file(rce, imex_nodes_config_file)
64-
imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
65-
actual_ips = [ip.strip() for ip in imex_config_content_clean]
66-
assert_that(actual_ips).contains_only(*expected_ips)
67-
logging.info(f"IMEX nodes config {imex_nodes_config_file} contains the expected nodes: {expected_ips}")
56+
def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list):
    """
    Assert that the local IMEX nodes config on every compute node lists exactly the expected IPs.

    For each private IP returned by ``cluster.get_compute_nodes_private_ip(queue, compute_resource)``,
    the file ``/etc/nvidia-imex/nodes_config.cfg`` is read from that node and its entries are compared
    against ``expected_ips``. Comment lines (starting with ``#``) and blank lines are ignored.

    :param cluster: cluster under test, used both to list the compute nodes and to open the remote connection
    :param queue: name of the queue the compute nodes belong to
    :param compute_resource: name of the compute resource the compute nodes belong to
    :param expected_ips: IPs that each node's IMEX nodes config must contain, and nothing else
    """
    imex_nodes_config_file = "/etc/nvidia-imex/nodes_config.cfg"
    for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
        logging.info(
            f"Checking IMEX nodes config for compute node {compute_node_ip} "
            f"contains the expected nodes: {expected_ips}"
        )
        # The config is local to each node, so it must be read through a connection to that specific node.
        rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
        imex_config_content = read_remote_file(rce, imex_nodes_config_file)
        # Keep only real entries: neither comments nor blank lines are node IPs, and a stray
        # empty string in the list would make the contains_only assertion fail spuriously.
        actual_ips = [
            line.strip()
            for line in imex_config_content.splitlines()
            if line.strip() and not line.strip().startswith("#")
        ]
        assert_that(actual_ips).contains_only(*expected_ips)
        logging.info(
            f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
        )
6865

6966

7067
def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str):

tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh

Lines changed: 5 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -91,40 +91,9 @@ function write_file() {
9191
return 1 # Not Updated
9292
fi
9393

94-
# Try to acquire lock with timeout
95-
(
96-
if ! flock -x -w ${_lock_timeout_seconds} 200; then
97-
# If timeout, assume deadlock and try to recover
98-
info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery"
99-
exit 1
100-
fi
101-
echo "${_content}" > "${_file}"
102-
) 200>"${_lock_file}"
103-
104-
local _lock_result=$?
105-
106-
if [[ ${_lock_result} -eq 0 ]]; then
107-
return 0 # Updated successfully
108-
fi
109-
110-
# Deadlock recovery: remove stale lock file and retry once
111-
error "Potential deadlock detected for ${_file}, attempting recovery"
112-
rm -f "${_lock_file}"
113-
sleep 1 # Brief pause to avoid race conditions
114-
115-
(
116-
if ! flock -x -w 10 200; then
117-
exit 1
118-
fi
119-
echo "${_content}" > "${_file}"
120-
) 200>"${_lock_file}"
121-
122-
if [[ $? -eq 0 ]]; then
123-
info "Lock acquired after deadlock recovery for ${_file}"
124-
return 0 # Updated
125-
fi
126-
127-
error_exit "Failed to acquire lock for ${_file} even after deadlock recovery"
94+
echo "${_content}" > "${_file}"
95+
info "File ${_file} updated"
96+
return 0 # Updated
12897
}
12998

13099
function reload_imex() {
@@ -171,8 +140,8 @@ function create_default_imex_channel() {
171140
COMPUTE_RESOURCE_NAME=$(get_compute_resource_name "${QUEUE_NAME}-st-" $SLURMD_NODENAME)
172141
CR_NODES=$(get_node_names "${QUEUE_NAME}" "${COMPUTE_RESOURCE_NAME}")
173142
IPS_FROM_CR=$(get_ips_from_node_names "${CR_NODES}")
174-
IMEX_MAIN_CONFIG="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"
175-
IMEX_NODES_CONFIG="/opt/parallelcluster/shared/nvidia-imex/nodes_config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"
143+
IMEX_MAIN_CONFIG="/etc/nvidia-imex/config.cfg"
144+
IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg"
176145

177146
info "Queue Name: ${QUEUE_NAME}"
178147
info "CR Name: ${COMPUTE_RESOURCE_NAME}"

0 commit comments

Comments
 (0)