From 713f7d0d7af2c17c6c36f11b780e7184a2e60e5f Mon Sep 17 00:00:00 2001
From: Giacomo Marciani
Date: Fri, 19 Sep 2025 18:29:18 -0400
Subject: [PATCH 1/3] [GB200] Make IMEX prolog use local IMEX configurations + test fixes (#7013)

* [GB200] Adapt the prolog used to configure IMEX so that it uses the local IMEX nodes config, rather than the shared one.
* [GB200] In test_ultraserver, fix the job script that checks for IMEX status by using the local IMEX config file rather than the shared one.
* [GB200] In test_ultraserver, fix assertion on IMEX logs.
* [GB200] In test_ultraserver, fix assert_imex_nodes_config_is_correct.
* [GB200] In test_ultraserver, fix job to check IMEX status.
* [GB200] In test_ultraserver, fix assert_no_errors_in_logs.
---
 .../tests/common/assertions.py                |  3 +-
 .../tests/ultraserver/test_gb200.py           | 29 ++++++-------
 .../test_gb200/91_nvidia_imex_prolog.sh       | 41 +++----------------
 .../test_gb200/nvidia-imex-status.job         |  3 +-
 4 files changed, 20 insertions(+), 56 deletions(-)

diff --git a/tests/integration-tests/tests/common/assertions.py b/tests/integration-tests/tests/common/assertions.py
index c34cf12590..06616a520a 100644
--- a/tests/integration-tests/tests/common/assertions.py
+++ b/tests/integration-tests/tests/common/assertions.py
@@ -439,8 +439,7 @@ def _assert_build_image_stack_deleted(stack_name, region, timeout_seconds=600, p
     pytest.fail(f"Timed-out waiting for stack {stack_name} deletion (last status: {last_status})")
 
 
-def assert_regex_in_file(cluster: Cluster, compute_node_ip: str, file_name: str, pattern: str, negate: bool = True):
-    rce = RemoteCommandExecutor(cluster, compute_node_ip)
+def assert_regex_in_file(rce: RemoteCommandExecutor, file_name: str, pattern: str, negate: bool = True):
     file_content = read_remote_file(rce, file_name)
     assertion = assert_that(bool(re.search(pattern, file_content, re.IGNORECASE)))
     assertion.is_false() if negate else assertion.is_true()
diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py
index 0e31740c2f..9e47782a57 100644
--- a/tests/integration-tests/tests/ultraserver/test_gb200.py
+++ b/tests/integration-tests/tests/ultraserver/test_gb200.py
@@ -53,30 +53,27 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in
     return job_id
 
 
-def assert_imex_nodes_config_is_correct(
-    rce: RemoteCommandExecutor, queue_name: str, compute_resource_name: str, expected_ips: list
-):
-    logging.info(f"Checking IMEX nodes config contains the expected nodes: {expected_ips}")
-    imex_nodes_config_file = (
-        f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{queue_name}_{compute_resource_name}.cfg"
-    )
-    imex_config_content = read_remote_file(rce, imex_nodes_config_file)
-    imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
-    actual_ips = [ip.strip() for ip in imex_config_content_clean]
-    assert_that(actual_ips).contains_only(*expected_ips)
-    logging.info(f"IMEX nodes config {imex_nodes_config_file} contains the expected nodes: {expected_ips}")
+def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list):
+    for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
+        logging.info(f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}")
+        rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
+        imex_config_content = read_remote_file(rce, "/etc/nvidia-imex/nodes_config.cfg")
+        imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
+        actual_ips = [ip.strip() for ip in imex_config_content_clean]
+        assert_that(actual_ips).contains_only(*expected_ips)
+        logging.info(f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}")
 
 
 def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str):
-    rce = RemoteCommandExecutor(cluster)
     logs = ["/var/log/nvidia-imex-verbose.log", "/var/log/parallelcluster/nvidia-imex-prolog.log"]
     for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
+        rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
         for log in logs:
             logging.info(f"Checking file {log} log does not contain any error")
             if log == "/var/log/nvidia-imex-verbose.log" and not is_existing_remote_file(rce, log):
                 logging.info("IMEX log file not found. Not an issue as IMEX writes logs there only in case of errors.")
                 continue
-            assert_regex_in_file(cluster, compute_node_ip, log, r"(warn|error|fail)", negate=True)
+            assert_regex_in_file(rce, log, r"(warn|error|fail)", negate=True)
 
 
 def assert_imex_status(
@@ -210,7 +207,7 @@ def _check_imex_healthy():
             f"Private IP addresses for nodes in queue {queue} and compute resource {compute_resource}: " f"{ips}"
         )
 
-        assert_imex_nodes_config_is_correct(rce, queue, compute_resource, ips)
+        assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, ips)
         assert_imex_status(rce, job_id, ips, service_status="UP", node_status="READY", connection_status="CONNECTED")
         assert_no_errors_in_logs(cluster, queue, compute_resource)
 
@@ -240,7 +237,7 @@ def assert_imex_not_configured(cluster: Cluster, queue: str, compute_resource: s
 
     job_id = submit_job_imex_status(rce, queue, max_nodes)
 
-    assert_imex_nodes_config_is_correct(rce, queue, compute_resource, FAKE_IPS)
+    assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, FAKE_IPS)
     assert_imex_status(
         rce, job_id, FAKE_IPS, service_status="DOWN", node_status="UNAVAILABLE", connection_status="INVALID"
     )
diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh
index f90f2eca4c..b77fdba42a 100644
--- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh
+++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh
@@ -91,40 +91,9 @@ function write_file() {
         return 1 # Not Updated
     fi
 
-    # Try to acquire lock with timeout
-    (
-        if ! flock -x -w ${_lock_timeout_seconds} 200; then
-            # If timeout, assume deadlock and try to recover
-            info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery"
-            exit 1
-        fi
-        echo "${_content}" > "${_file}"
-    ) 200>"${_lock_file}"
-
-    local _lock_result=$?
-
-    if [[ ${_lock_result} -eq 0 ]]; then
-        return 0 # Updated successfully
-    fi
-
-    # Deadlock recovery: remove stale lock file and retry once
-    error "Potential deadlock detected for ${_file}, attempting recovery"
-    rm -f "${_lock_file}"
-    sleep 1 # Brief pause to avoid race conditions
-
-    (
-        if ! flock -x -w 10 200; then
-            exit 1
-        fi
-        echo "${_content}" > "${_file}"
-    ) 200>"${_lock_file}"
-
-    if [[ $? -eq 0 ]]; then
-        info "Lock acquired after deadlock recovery for ${_file}"
-        return 0 # Updated
-    fi
-
-    error_exit "Failed to acquire lock for ${_file} even after deadlock recovery"
+    echo "${_content}" > "${_file}"
+    info "File ${_file} updated"
+    return 0 # Updated
 }
 
 function reload_imex() {
@@ -171,8 +140,8 @@ function create_default_imex_channel() {
     COMPUTE_RESOURCE_NAME=$(get_compute_resource_name "${QUEUE_NAME}-st-" $SLURMD_NODENAME)
     CR_NODES=$(get_node_names "${QUEUE_NAME}" "${COMPUTE_RESOURCE_NAME}")
     IPS_FROM_CR=$(get_ips_from_node_names "${CR_NODES}")
-    IMEX_MAIN_CONFIG="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"
-    IMEX_NODES_CONFIG="/opt/parallelcluster/shared/nvidia-imex/nodes_config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"
+    IMEX_MAIN_CONFIG="/etc/nvidia-imex/config.cfg"
+    IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg"
 
     info "Queue Name: ${QUEUE_NAME}"
     info "CR Name: ${COMPUTE_RESOURCE_NAME}"
diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job
index 037dfe68b5..8b8113eabf 100755
--- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job
+++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/nvidia-imex-status.job
@@ -8,6 +8,5 @@ sleep 45
 
 QUEUE_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_queue_name")
 COMPUTE_RES_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_compute_resource_name")
-IMEX_CONFIG_FILE="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RES_NAME}.cfg"
 
-srun bash -c "/usr/bin/nvidia-imex-ctl -N -j -c ${IMEX_CONFIG_FILE} > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err"
\ No newline at end of file
+srun bash -c "/usr/bin/nvidia-imex-ctl -N -j > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err"
\ No newline at end of file

From 8de76f8e064a869ca556a6c9182f2c4288c160d6 Mon Sep 17 00:00:00 2001
From: Giacomo Marciani
Date: Mon, 22 Sep 2025 11:40:02 -0400
Subject: [PATCH 2/3] [GB200] Address linter findings in test_gb200.

---
 .../tests/ultraserver/test_gb200.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/integration-tests/tests/ultraserver/test_gb200.py b/tests/integration-tests/tests/ultraserver/test_gb200.py
index 9e47782a57..a73dfba756 100644
--- a/tests/integration-tests/tests/ultraserver/test_gb200.py
+++ b/tests/integration-tests/tests/ultraserver/test_gb200.py
@@ -55,13 +55,19 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in
 
 def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list):
     for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
-        logging.info(f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}")
+        logging.info(
+            f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
+        )
         rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
         imex_config_content = read_remote_file(rce, "/etc/nvidia-imex/nodes_config.cfg")
-        imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
+        imex_config_content_clean = [
+            line for line in imex_config_content.split("\n") if not line.strip().startswith("#")
+        ]
         actual_ips = [ip.strip() for ip in imex_config_content_clean]
         assert_that(actual_ips).contains_only(*expected_ips)
-        logging.info(f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}")
+        logging.info(
+            f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
+        )
 
 
 def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str):

From efe3f875a8800435ab51c4ad59f8a08168bb7f14 Mon Sep 17 00:00:00 2001
From: Giacomo Marciani
Date: Mon, 22 Sep 2025 11:34:42 -0400
Subject: [PATCH 3/3] [GB200] Simplify prolog after moving IMEX nodes configuration from the shared to the local location. Also added a more detailed description of the prolog script at the top.

---
 .../test_gb200/91_nvidia_imex_prolog.sh | 79 ++++++-------------
 1 file changed, 26 insertions(+), 53 deletions(-)

diff --git a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh
index b77fdba42a..5355367881 100644
--- a/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh
+++ b/tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh
@@ -1,7 +1,17 @@
 #!/usr/bin/env bash
 
-# This prolog script configures the NVIDIA IMEX nodes config file and reloads the nvidia-imex service.
-# This prolog is meant to be run by compute nodes with exclusive jobs.
+# This prolog script configures NVIDIA IMEX on the compute nodes involved in the job execution.
+#
+# In particular, it:
+# - Checks whether the job is executed exclusively.
+#   If not, it exits immediately, because IMEX configuration requires exclusive jobs.
+# - Writes the private IP addresses of the job's compute nodes into /etc/nvidia-imex/nodes_config.cfg.
+# - Creates the IMEX default channel.
+#   For more information about IMEX channels, see https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html
+# - Restarts the IMEX system service.
+#
+# REQUIREMENTS:
+# - This prolog is meant to be run only with exclusive jobs.
 
 LOG_FILE_PATH="/var/log/parallelcluster/nvidia-imex-prolog.log"
 SCONTROL_CMD="/opt/slurm/bin/scontrol"
@@ -10,6 +20,7 @@ IMEX_STOP_TIMEOUT=15
 #TODO In production, specify p6e-gb200, only. We added g5g only for testing purposes.
 ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g)"
 IMEX_SERVICE="nvidia-imex"
+IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg"
 
 function info() {
     echo "$(date "+%Y-%m-%dT%H:%M:%S.%3N") [INFO] [PID:$$] [JOB:${SLURM_JOB_ID}] $1"
@@ -48,24 +59,13 @@ function return_if_unsupported_instance_type() {
     fi
 }
 
-function get_node_names() {
-    local _queue_name=$1
-    local _compute_resource_name=$2
-
-    ${SCONTROL_CMD} show nodes --json | \
-    jq -r \
-        --arg queue_name "${_queue_name}" \
-        --arg compute_resource_name "${_compute_resource_name}" \
-        '[
-            .nodes[] |
-            select(
-                (.partitions[] | contains($queue_name)) and
-                (.features[] | contains($compute_resource_name)) and
-                (.features[] | contains("static"))
-            ) |
-            .name
-        ] |
-        join(",")'
+function return_if_job_is_not_exclusive() {
+    if [[ "${SLURM_JOB_OVERSUBSCRIBE}" = "NO" ]]; then
+        info "Job is exclusive, proceeding with IMEX configuration"
+    else
+        info "Skipping IMEX configuration because the job is not exclusive"
+        prolog_end
+    fi
 }
 
 function get_ips_from_node_names() {
@@ -80,22 +80,6 @@ function get_compute_resource_name() {
     echo "${_slurmd_node_name}" | sed -E "s/${_queue_name_prefix}(.+)-[0-9]+$/\1/"
 }
 
-function write_file() {
-    local _file=$1
-    local _content=$2
-    local _lock_file="${_file}.lock"
-    local _lock_timeout_seconds=60
-
-    if [[ -f "${_file}" ]] && [[ "$(cat "${_file}")" = "${_content}" ]]; then
-        info "File ${_file} already has the expected content, skipping the write operation"
-        return 1 # Not Updated
-    fi
-
-    echo "${_content}" > "${_file}"
-    info "File ${_file} updated"
-    return 0 # Updated
-}
-
 function reload_imex() {
     info "Stopping IMEX"
     timeout ${IMEX_STOP_TIMEOUT} systemctl stop ${IMEX_SERVICE}
@@ -111,9 +95,6 @@ function reload_imex() {
 }
 
 function create_default_imex_channel() {
-    # This configuration follows
-    # [Nvidia doc](https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html)
-    # This configuration is only suitable for single user environment, and not compatible with multi-user environment.
     info "Creating IMEX default Channel"
     MAJOR_NUMBER=$(cat /proc/devices | grep nvidia-caps-imex-channels | cut -d' ' -f1)
     if [ ! -d "/dev/nvidia-caps-imex-channels" ]; then
@@ -132,27 +113,19 @@ function create_default_imex_channel() {
 {
     info "PROLOG Start JobId=${SLURM_JOB_ID}: $0"
 
+    return_if_job_is_not_exclusive
     return_if_unsupported_instance_type
     create_default_imex_channel
 
-    QUEUE_NAME=$SLURM_JOB_PARTITION
-    COMPUTE_RESOURCE_NAME=$(get_compute_resource_name "${QUEUE_NAME}-st-" $SLURMD_NODENAME)
-    CR_NODES=$(get_node_names "${QUEUE_NAME}" "${COMPUTE_RESOURCE_NAME}")
-    IPS_FROM_CR=$(get_ips_from_node_names "${CR_NODES}")
-    IMEX_MAIN_CONFIG="/etc/nvidia-imex/config.cfg"
-    IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg"
-
-    info "Queue Name: ${QUEUE_NAME}"
-    info "CR Name: ${COMPUTE_RESOURCE_NAME}"
-    info "CR Nodes: ${CR_NODES}"
-    info "Node IPs from CR: ${IPS_FROM_CR}"
-    info "IMEX Main Config: ${IMEX_MAIN_CONFIG}"
+    IPS_FROM_CR=$(get_ips_from_node_names "${SLURM_NODELIST}")
+
+    info "Node Names: ${SLURM_NODELIST}"
+    info "Node IPs: ${IPS_FROM_CR}"
     info "IMEX Nodes Config: ${IMEX_NODES_CONFIG}"
 
     info "Updating IMEX nodes config ${IMEX_NODES_CONFIG}"
-    write_file "${IMEX_NODES_CONFIG}" "${IPS_FROM_CR}"
-
+    echo "${IPS_FROM_CR}" > "${IMEX_NODES_CONFIG}"
    reload_imex
 
     prolog_end
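
A quick manual sanity check of the new local-configuration flow, to be run on a compute node after the prolog has executed for an exclusive job. This is only a sketch: the paths and flags are the ones already used by the prolog and the nvidia-imex-status.job script above, and nvidia-imex-ctl is assumed to fall back to its local default configuration now that the explicit -c option has been dropped.

    cat /etc/nvidia-imex/nodes_config.cfg   # should list exactly the private IPs of the job's nodes
    systemctl status nvidia-imex            # the service should be active again after the prolog restarts it
    /usr/bin/nvidia-imex-ctl -N -j          # same status query that nvidia-imex-status.job now runs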