Skip to content

Commit 18bee17

Browse files
committed
[GB200] Make IMEX prolog use local IMEX configurations + test fixes (aws#7013)
* [GB200] Adapt the prolog used to configure IMEX so that it uses the local IMEX nodes config, rather than the shared one. * [GB200] In test_ultraserver, fix the job script that checks for IMEX status by using the local IMEX config file rather than the shared one. * [GB200] In test_ultraserver, fix assertion on imex logs. * [GB200] In test_ultraserver, fix assert_imex_nodes_config_is_correct. * [GB200] In test_ultraserver, fix job to check imex status. * [GB200] In test_ultraserver, fix assert_no_errors_in_logs
1 parent 6263d26 commit 18bee17

File tree

4 files changed

+20
-56
lines changed

4 files changed

+20
-56
lines changed

tests/integration-tests/tests/common/assertions.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -439,8 +439,7 @@ def _assert_build_image_stack_deleted(stack_name, region, timeout_seconds=600, p
439439
pytest.fail(f"Timed-out waiting for stack {stack_name} deletion (last status: {last_status})")
440440

441441

442-
def assert_regex_in_file(cluster: Cluster, compute_node_ip: str, file_name: str, pattern: str, negate: bool = True):
443-
rce = RemoteCommandExecutor(cluster, compute_node_ip)
442+
def assert_regex_in_file(rce: RemoteCommandExecutor, file_name: str, pattern: str, negate: bool = True):
444443
file_content = read_remote_file(rce, file_name)
445444
assertion = assert_that(bool(re.search(pattern, file_content, re.IGNORECASE)))
446445
# negate=True requires the pattern to be absent from the file; otherwise it must be present.
assertion.is_false() if negate else assertion.is_true()

tests/integration-tests/tests/ultraserver/test_gb200.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -53,30 +53,27 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in
5353
return job_id
5454

5555

56-
def assert_imex_nodes_config_is_correct(
57-
rce: RemoteCommandExecutor, queue_name: str, compute_resource_name: str, expected_ips: list
58-
):
59-
logging.info(f"Checking IMEX nodes config contains the expected nodes: {expected_ips}")
60-
imex_nodes_config_file = (
61-
f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{queue_name}_{compute_resource_name}.cfg"
62-
)
63-
imex_config_content = read_remote_file(rce, imex_nodes_config_file)
64-
imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
65-
actual_ips = [ip.strip() for ip in imex_config_content_clean]
66-
assert_that(actual_ips).contains_only(*expected_ips)
67-
logging.info(f"IMEX nodes config {imex_nodes_config_file} contains the expected nodes: {expected_ips}")
56+
def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list):
57+
for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
58+
logging.info(f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}")
59+
rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
60+
imex_config_content = read_remote_file(rce, "/etc/nvidia-imex/nodes_config.cfg")
61+
imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
62+
actual_ips = [ip.strip() for ip in imex_config_content_clean]
63+
assert_that(actual_ips).contains_only(*expected_ips)
64+
logging.info(f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}")
6865

6966

7067
def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str):
71-
rce = RemoteCommandExecutor(cluster)
7268
logs = ["/var/log/nvidia-imex-verbose.log", "/var/log/parallelcluster/nvidia-imex-prolog.log"]
7369
for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
70+
rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
7471
for log in logs:
7572
logging.info(f"Checking file {log} log does not contain any error")
7673
if log == "/var/log/nvidia-imex-verbose.log" and not is_existing_remote_file(rce, log):
7774
logging.info("IMEX log file not found. Not an issue as IMEX writes logs there only in case of errors.")
7875
continue
79-
assert_regex_in_file(cluster, compute_node_ip, log, r"(warn|error|fail)", negate=True)
76+
assert_regex_in_file(rce, log, r"(warn|error|fail)", negate=True)
8077

8178

8279
def assert_imex_status(
@@ -210,7 +207,7 @@ def _check_imex_healthy():
210207
f"Private IP addresses for nodes in queue {queue} and compute resource {compute_resource}: " f"{ips}"
211208
)
212209

213-
assert_imex_nodes_config_is_correct(rce, queue, compute_resource, ips)
210+
assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, ips)
214211
assert_imex_status(rce, job_id, ips, service_status="UP", node_status="READY", connection_status="CONNECTED")
215212
assert_no_errors_in_logs(cluster, queue, compute_resource)
216213

@@ -240,7 +237,7 @@ def assert_imex_not_configured(cluster: Cluster, queue: str, compute_resource: s
240237

241238
job_id = submit_job_imex_status(rce, queue, max_nodes)
242239

243-
assert_imex_nodes_config_is_correct(rce, queue, compute_resource, FAKE_IPS)
240+
assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, FAKE_IPS)
244241
assert_imex_status(
245242
rce, job_id, FAKE_IPS, service_status="DOWN", node_status="UNAVAILABLE", connection_status="INVALID"
246243
)

tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh

Lines changed: 5 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -91,40 +91,9 @@ function write_file() {
9191
return 1 # Not Updated
9292
fi
9393

94-
# Try to acquire lock with timeout
95-
(
96-
if ! flock -x -w ${_lock_timeout_seconds} 200; then
97-
# If timeout, assume deadlock and try to recover
98-
info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery"
99-
exit 1
100-
fi
101-
echo "${_content}" > "${_file}"
102-
) 200>"${_lock_file}"
103-
104-
local _lock_result=$?
105-
106-
if [[ ${_lock_result} -eq 0 ]]; then
107-
return 0 # Updated successfully
108-
fi
109-
110-
# Deadlock recovery: remove stale lock file and retry once
111-
error "Potential deadlock detected for ${_file}, attempting recovery"
112-
rm -f "${_lock_file}"
113-
sleep 1 # Brief pause to avoid race conditions
114-
115-
(
116-
if ! flock -x -w 10 200; then
117-
exit 1
118-
fi
119-
echo "${_content}" > "${_file}"
120-
) 200>"${_lock_file}"
121-
122-
if [[ $? -eq 0 ]]; then
123-
info "Lock acquired after deadlock recovery for ${_file}"
124-
return 0 # Updated
125-
fi
126-
127-
error_exit "Failed to acquire lock for ${_file} even after deadlock recovery"
94+
echo "${_content}" > "${_file}"
95+
info "File ${_file} updated"
96+
return 0 # Updated
12897
}
12998

13099
function reload_imex() {
@@ -171,8 +140,8 @@ function create_default_imex_channel() {
171140
COMPUTE_RESOURCE_NAME=$(get_compute_resource_name "${QUEUE_NAME}-st-" $SLURMD_NODENAME)
172141
CR_NODES=$(get_node_names "${QUEUE_NAME}" "${COMPUTE_RESOURCE_NAME}")
173142
IPS_FROM_CR=$(get_ips_from_node_names "${CR_NODES}")
174-
IMEX_MAIN_CONFIG="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"
175-
IMEX_NODES_CONFIG="/opt/parallelcluster/shared/nvidia-imex/nodes_config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"
143+
IMEX_MAIN_CONFIG="/etc/nvidia-imex/config.cfg"
144+
IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg"
176145

177146
info "Queue Name: ${QUEUE_NAME}"
178147
info "CR Name: ${COMPUTE_RESOURCE_NAME}"

tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,5 @@
88
sleep 45
99
QUEUE_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_queue_name")
1010
COMPUTE_RES_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_compute_resource_name")
11-
IMEX_CONFIG_FILE="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RES_NAME}.cfg"
1211

13-
srun bash -c "/usr/bin/nvidia-imex-ctl -N -j -c ${IMEX_CONFIG_FILE} > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err"
12+
srun bash -c "/usr/bin/nvidia-imex-ctl -N -j > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err"

0 commit comments

Comments
 (0)