Skip to content

Commit 872a4d6

Browse files
himani2411 and Himani Anil Deshpande authored
Revert "[Develop][GB200] Add concurrent control mechanisms to nvidia-imex prolog script (aws#6964)" (aws#6991)
This reverts commit 00a2656.

Co-authored-by: Himani Anil Deshpande <[email protected]>
1 parent 91b41e5 commit 872a4d6

File tree

1 file changed

+6
-76
lines changed

1 file changed

+6
-76
lines changed

tests/integration-tests/tests/ultraserver/test_gb200/test_gb200/91_nvidia_imex_prolog.sh

Lines changed: 6 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env bash
22

33
# This prolog script configures the NVIDIA IMEX nodes config file and reloads the nvidia-imex service.
4-
# This prolog is meant to be run by compute nodes.
4+
# This prolog is meant to be run by compute nodes with exclusive jobs.
55

66
LOG_FILE_PATH="/var/log/parallelcluster/nvidia-imex-prolog.log"
77
SCONTROL_CMD="/opt/slurm/bin/scontrol"
@@ -11,8 +11,6 @@ IMEX_STOP_TIMEOUT=15
1111
ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g)"
1212
IMEX_SERVICE="nvidia-imex"
1313

14-
15-
1614
function info() {
1715
echo "$(date "+%Y-%m-%dT%H:%M:%S.%3N") [INFO] [PID:$$] [JOB:${SLURM_JOB_ID}] $1"
1816
}
@@ -82,47 +80,6 @@ function get_compute_resource_name() {
8280
echo "${_slurmd_node_name}" | sed -E "s/${_queue_name_prefix}(.+)-[0-9]+$/\1/"
8381
}
8482

85-
function check_imex_needs_reload() {
86-
local _expected_ips=$1
87-
local _imex_config_file=$2
88-
89-
# First check if IMEX service is running
90-
if ! systemctl is-active ${IMEX_SERVICE} &>/dev/null; then
91-
info "IMEX service is not running, reload needed"
92-
return 0 # Need reload
93-
fi
94-
95-
# Get current IMEX status
96-
local imex_status_output
97-
if ! imex_status_output=$(timeout 15 /usr/bin/nvidia-imex-ctl -N -j -c "${_imex_config_file}" 2>/dev/null); then
98-
info "Failed to get IMEX status, assuming reload needed"
99-
return 0 # Need reload
100-
fi
101-
102-
# Parse JSON to extract current IPs from IMEX status
103-
local current_imex_ips
104-
if ! current_imex_ips=$(echo "${imex_status_output}" | jq -r '.nodes | to_entries[].value.host' 2>/dev/null | sort | tr '\n' ' '); then
105-
info "Failed to parse IMEX status JSON, assuming reload needed"
106-
return 0 # Need reload
107-
fi
108-
109-
# Convert expected IPs to sorted space-separated string
110-
local expected_ips_sorted
111-
expected_ips_sorted=$(echo "${_expected_ips}" | tr ',' '\n' | sort | tr '\n' ' ')
112-
113-
info "Current IMEX IPs: ${current_imex_ips}"
114-
info "Expected IPs: ${expected_ips_sorted}"
115-
116-
# Compare IP lists
117-
if [[ "${current_imex_ips}" = "${expected_ips_sorted}" ]]; then
118-
info "IMEX service running with correct IPs, skipping reload"
119-
return 1 # Skip reload
120-
else
121-
info "IMEX IPs mismatch, reload needed"
122-
return 0 # Need reload
123-
fi
124-
}
125-
12683
function write_file() {
12784
local _file=$1
12885
local _content=$2
@@ -181,32 +138,7 @@ function reload_imex() {
181138
# sed -i "s/SERVER_PORT.*/SERVER_PORT=${NEW_SERVER_PORT}/" "${IMEX_MAIN_CONFIG}"
182139

183140
info "Restarting IMEX"
184-
if ! timeout ${IMEX_START_TIMEOUT} systemctl start ${IMEX_SERVICE}; then
185-
error "IMEX service reload failed"
186-
return 1
187-
fi
188-
189-
return 0
190-
}
191-
192-
function handle_imex_reload() {
193-
local _ips_from_cr=$1
194-
local _imex_main_config=$2
195-
local _reload_reason=$3
196-
local _skip_message=$4
197-
local _reload_message=$5
198-
199-
info "${_reload_reason}"
200-
if check_imex_needs_reload "${_ips_from_cr}" "${_imex_main_config}"; then
201-
info "${_reload_message}"
202-
if reload_imex; then
203-
info "IMEX has been reloaded"
204-
else
205-
error "Failed to reload IMEX service"
206-
fi
207-
else
208-
info "${_skip_message}"
209-
fi
141+
timeout ${IMEX_START_TIMEOUT} systemctl start ${IMEX_SERVICE}
210142
}
211143

212144
function create_default_imex_channel() {
@@ -249,12 +181,10 @@ function create_default_imex_channel() {
249181
info "IMEX Main Config: ${IMEX_MAIN_CONFIG}"
250182
info "IMEX Nodes Config: ${IMEX_NODES_CONFIG}"
251183

252-
info "Checking IMEX nodes config ${IMEX_NODES_CONFIG}"
253-
if write_file "${IMEX_NODES_CONFIG}" "${IPS_FROM_CR}"; then
254-
handle_imex_reload "${IPS_FROM_CR}" "${IMEX_MAIN_CONFIG}" "IMEX nodes config updated, checking if reload is needed" "IMEX already configured correctly, skipping reload" "IMEX reload needed, restarting service"
255-
else
256-
handle_imex_reload "${IPS_FROM_CR}" "${IMEX_MAIN_CONFIG}" "IMEX nodes config unchanged, checking if reload is still needed" "IMEX config unchanged and service correctly configured, skipping reload" "IMEX reload needed despite unchanged config, restarting service"
257-
fi
184+
info "Updating IMEX nodes config ${IMEX_NODES_CONFIG}"
185+
write_file "${IMEX_NODES_CONFIG}" "${IPS_FROM_CR}"
186+
187+
reload_imex
258188

259189
prolog_end
260190

0 commit comments

Comments
 (0)