From ab73514839327bc872ccbd278e9187fcc5d1f8c3 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 19 Sep 2025 14:17:16 -0400 Subject: [PATCH 1/8] [GB200] Adapt the prolog used to configure IMEX so that it uses the local IMEX nodes config, rather than the shared one. --- .../tests/ultraserver/test_gb200.py | 21 ++++------ .../test_gb200/91_nvidia_imex_prolog.sh | 41 +++---------------- 2 files changed, 14 insertions(+), 48 deletions(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index cfa7dc4cb3..bb08e20275 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -53,18 +53,15 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in return job_id -def assert_imex_nodes_config_is_correct( - rce: RemoteCommandExecutor, queue_name: str, compute_resource_name: str, expected_ips: list -): - logging.info(f"Checking IMEX nodes config contains the expected nodes: {expected_ips}") - imex_nodes_config_file = ( - f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{queue_name}_{compute_resource_name}.cfg" - ) - imex_config_content = read_remote_file(rce, imex_nodes_config_file) - imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")] - actual_ips = [ip.strip() for ip in imex_config_content_clean] - assert_that(actual_ips).contains_only(*expected_ips) - logging.info(f"IMEX nodes config {imex_nodes_config_file} contains the expected nodes: {expected_ips}") +def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list): + for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource): + logging.info(f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}") + rce = RemoteCommandExecutor(cluster, 
compute_node_ip=compute_node_ip) + imex_config_content = read_remote_file(rce, "/etc/nvidia-imex/nodes_config.cfg") + imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")] + actual_ips = [ip.strip() for ip in imex_config_content_clean] + assert_that(actual_ips).contains_only(*expected_ips) + logging.info(f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}") def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str): diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh index f90f2eca4c..b77fdba42a 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh @@ -91,40 +91,9 @@ function write_file() { return 1 # Not Updated fi - # Try to acquire lock with timeout - ( - if ! flock -x -w ${_lock_timeout_seconds} 200; then - # If timeout, assume deadlock and try to recover - info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery" - exit 1 - fi - echo "${_content}" > "${_file}" - ) 200>"${_lock_file}" - - local _lock_result=$? - - if [[ ${_lock_result} -eq 0 ]]; then - return 0 # Updated successfully - fi - - # Deadlock recovery: remove stale lock file and retry once - error "Potential deadlock detected for ${_file}, attempting recovery" - rm -f "${_lock_file}" - sleep 1 # Brief pause to avoid race conditions - - ( - if ! flock -x -w 10 200; then - exit 1 - fi - echo "${_content}" > "${_file}" - ) 200>"${_lock_file}" - - if [[ $? 
-eq 0 ]]; then - info "Lock acquired after deadlock recovery for ${_file}" - return 0 # Updated - fi - - error_exit "Failed to acquire lock for ${_file} even after deadlock recovery" + echo "${_content}" > "${_file}" + info "File ${_file} updated" + return 0 # Updated } function reload_imex() { @@ -171,8 +140,8 @@ function create_default_imex_channel() { COMPUTE_RESOURCE_NAME=$(get_compute_resource_name "${QUEUE_NAME}-st-" $SLURMD_NODENAME) CR_NODES=$(get_node_names "${QUEUE_NAME}" "${COMPUTE_RESOURCE_NAME}") IPS_FROM_CR=$(get_ips_from_node_names "${CR_NODES}") - IMEX_MAIN_CONFIG="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg" - IMEX_NODES_CONFIG="/opt/parallelcluster/shared/nvidia-imex/nodes_config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg" + IMEX_MAIN_CONFIG="/etc/nvidia-imex/config.cfg" + IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg" info "Queue Name: ${QUEUE_NAME}" info "CR Name: ${COMPUTE_RESOURCE_NAME}" From 79962947812d3fcb72cf4ae2f1c0bae6d6bdd1a2 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 19 Sep 2025 17:12:57 -0400 Subject: [PATCH 2/8] [GB200] In test_ultraserver, fix the job script that checks for IMEX status by using the local IMEX config file rather than the shared one. 
--- .../ultraserver/test_gb200/test_gb200/nvidia-imex-status.job | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job index 037dfe68b5..6281a507b6 100755 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job @@ -8,6 +8,6 @@ sleep 45 QUEUE_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_queue_name") COMPUTE_RES_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_compute_resource_name") -IMEX_CONFIG_FILE="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RES_NAME}.cfg" +IMEX_CONFIG_FILE="/etc/nvidia-imex/config.cfg" srun bash -c "/usr/bin/nvidia-imex-ctl -N -j -c ${IMEX_CONFIG_FILE} > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err" \ No newline at end of file From 94393f7e5b06bf0f17076868ac93d2255d6b110b Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 19 Sep 2025 17:13:12 -0400 Subject: [PATCH 3/8] [GB200] In test_ultraserver, fix assertion on imex logs. 
--- tests/integration-tests/tests/common/assertions.py | 3 +-- tests/integration-tests/tests/ultraserver/test_gb200.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py index c34cf12590..06616a520a 100644 --- a/tests/integration-tests/tests/common/assertions.py +++ b/tests/integration-tests/tests/common/assertions.py @@ -439,8 +439,7 @@ def _assert_build_image_stack_deleted(stack_name, region, timeout_seconds=600, p pytest.fail(f"Timed-out waiting for stack {stack_name} deletion (last status: {last_status})") -def assert_regex_in_file(cluster: Cluster, compute_node_ip: str, file_name: str, pattern: str, negate: bool = True): - rce = RemoteCommandExecutor(cluster, compute_node_ip) +def assert_regex_in_file(rce: RemoteCommandExecutor, file_name: str, pattern: str, negate: bool = True): file_content = read_remote_file(rce, file_name) assertion = assert_that(bool(re.search(pattern, file_content, re.IGNORECASE))) assertion.is_false() if negate else assertion.is_fals() diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index bb08e20275..d0ede21fdd 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -65,9 +65,9 @@ def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_re def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str): - rce = RemoteCommandExecutor(cluster) logs = ["/var/log/nvidia-imex-verbose.log", "/var/log/parallelcluster/nvidia-imex-prolog.log"] for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource): + rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip) for log in logs: logging.info(f"Checking file {log} log does not contain any error") if log == "/var/log/nvidia-imex-verbose.log" and not 
is_existing_remote_file(rce, log): From 4f41f341de707407d4229e97b6dcac1328f1661a Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 19 Sep 2025 18:12:22 -0400 Subject: [PATCH 4/8] [GB200] In test_ultraserver, fix assert_imex_nodes_config_is_correct. --- tests/integration-tests/tests/ultraserver/test_gb200.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index d0ede21fdd..0e899e0ca0 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -207,7 +207,7 @@ def _check_imex_healthy(): f"Private IP addresses for nodes in queue {queue} and compute resource {compute_resource}: " f"{ips}" ) - assert_imex_nodes_config_is_correct(rce, queue, compute_resource, ips) + assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, ips) assert_imex_status(rce, job_id, ips, service_status="UP", node_status="READY", connection_status="CONNECTED") assert_no_errors_in_logs(cluster, queue, compute_resource) From 85ca8563e3d5b82ea1d4d02ca987beb49d0326f3 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 19 Sep 2025 18:13:17 -0400 Subject: [PATCH 5/8] [GB200] In test_ultraserver, fix job to check imex status. 
--- .../ultraserver/test_gb200/test_gb200/nvidia-imex-status.job | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job index 6281a507b6..8b8113eabf 100755 --- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job +++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job @@ -8,6 +8,5 @@ sleep 45 QUEUE_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_queue_name") COMPUTE_RES_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_compute_resource_name") -IMEX_CONFIG_FILE="/etc/nvidia-imex/config.cfg" -srun bash -c "/usr/bin/nvidia-imex-ctl -N -j -c ${IMEX_CONFIG_FILE} > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err" \ No newline at end of file +srun bash -c "/usr/bin/nvidia-imex-ctl -N -j > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err" \ No newline at end of file From 8a03c71c5cfd4445a26cb094254d8a7bcf03b9c5 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 19 Sep 2025 18:15:16 -0400 Subject: [PATCH 6/8] [GB200] In test_ultraserver, fix assert_no_errors_in_logs --- tests/integration-tests/tests/ultraserver/test_gb200.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index 0e899e0ca0..0b9540a7b1 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -73,7 +73,7 @@ def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str if log == "/var/log/nvidia-imex-verbose.log" and not is_existing_remote_file(rce, log): logging.info("IMEX log file not found. 
Not an issue as IMEX writes logs there only in case of errors.") continue - assert_regex_in_file(cluster, compute_node_ip, log, r"(warn|error|fail)", negate=True) + assert_regex_in_file(rce, compute_node_ip, log, r"(warn|error|fail)", negate=True) def assert_imex_status( From 5eba4a61846e5ef109448527d5679dbab69d0971 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 19 Sep 2025 18:20:15 -0400 Subject: [PATCH 7/8] [GB200] In test_ultraserver, fix assert_imex_nodes_config_is_correct --- tests/integration-tests/tests/ultraserver/test_gb200.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index 0b9540a7b1..9063d09c80 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -237,7 +237,7 @@ def assert_imex_not_configured(cluster: Cluster, queue: str, compute_resource: s job_id = submit_job_imex_status(rce, queue, max_nodes) - assert_imex_nodes_config_is_correct(rce, queue, compute_resource, FAKE_IPS) + assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, FAKE_IPS) assert_imex_status( rce, job_id, FAKE_IPS, service_status="DOWN", node_status="UNAVAILABLE", connection_status="INVALID" ) From 094290f86d9628b71c432601d9ab9b2556f75f3d Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Fri, 19 Sep 2025 18:23:26 -0400 Subject: [PATCH 8/8] [GB200] In test_ultraserver, fix assert_regex_in_file --- tests/integration-tests/tests/ultraserver/test_gb200.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py index 9063d09c80..a1cf86bc22 100644 --- a/tests/integration-tests/tests/ultraserver/test_gb200.py +++ b/tests/integration-tests/tests/ultraserver/test_gb200.py @@ -73,7 +73,7 @@ def assert_no_errors_in_logs(cluster: 
Cluster, queue: str, compute_resource: str if log == "/var/log/nvidia-imex-verbose.log" and not is_existing_remote_file(rce, log): logging.info("IMEX log file not found. Not an issue as IMEX writes logs there only in case of errors.") continue - assert_regex_in_file(rce, compute_node_ip, log, r"(warn|error|fail)", negate=True) + assert_regex_in_file(rce, log, r"(warn|error|fail)", negate=True) def assert_imex_status(