11#! /usr/bin/env bash
22
33# This prolog script configures the NVIDIA IMEX nodes config file and reloads the nvidia-imex service.
4- # This prolog is meant to be run by compute nodes.
4+ # This prolog is meant to be run by compute nodes that run exclusive jobs.
55
66LOG_FILE_PATH=" /var/log/parallelcluster/nvidia-imex-prolog.log"
77SCONTROL_CMD=" /opt/slurm/bin/scontrol"
@@ -11,8 +11,6 @@ IMEX_STOP_TIMEOUT=15
1111ALLOWED_INSTANCE_TYPES=" ^(p6e-gb200|g5g)"
1212IMEX_SERVICE=" nvidia-imex"
1313
14-
15-
# info: writes one log line to stdout via echo, composed of a `date` timestamp,
# the literal tag [INFO], the shell PID ($$), the value of ${SLURM_JOB_ID}, and
# the first positional argument ($1) as the message text.
# NOTE(review): the digits fused onto the start of each line below look like
# old/new line numbers carried over from a diff view, and the spaces just inside
# the quotes look like extraction artifacts — confirm against the real script.
1614function info() {
1715 echo " $( date " +%Y-%m-%dT%H:%M:%S.%3N" ) [INFO] [PID:$$ ] [JOB:${SLURM_JOB_ID} ] $1 "
1816}
@@ -82,47 +80,6 @@ function get_compute_resource_name() {
8280 echo " ${_slurmd_node_name} " | sed -E " s/${_queue_name_prefix} (.+)-[0-9]+$/\1/"
8381}
8482
# check_imex_needs_reload: decides whether the IMEX service has to be restarted.
# NOTE(review): every line of this function carries a '-' diff marker, i.e. it is
# shown here as code being removed by the change — confirm before relying on it.
# Arguments:
#   $1 - expected IPs (split on commas below, so presumably a comma-separated list)
#   $2 - IMEX config file, passed to nvidia-imex-ctl through -c
# Returns:
#   0 - reload needed: service not active, status query failed, status JSON could
#       not be parsed, or the sorted host list differs from the sorted expected IPs
#   1 - skip reload: the two sorted lists compare equal
# Logs each decision through info.
85- function check_imex_needs_reload() {
86- local _expected_ips=$1
87- local _imex_config_file=$2
88-
89- # First check if IMEX service is running
# NOTE(review): as rendered, `& > /dev/null` would background systemctl and then
# run an empty redirection; `&>/dev/null` was probably intended and the space is
# likely an extraction artifact — confirm against the real script.
90- if ! systemctl is-active ${IMEX_SERVICE} & > /dev/null; then
91- info " IMEX service is not running, reload needed"
92- return 0 # Need reload
93- fi
94-
95- # Get current IMEX status
# The query is bounded by `timeout 15`; its stderr is discarded.
96- local imex_status_output
97- if ! imex_status_output=$( timeout 15 /usr/bin/nvidia-imex-ctl -N -j -c " ${_imex_config_file} " 2> /dev/null) ; then
98- info " Failed to get IMEX status, assuming reload needed"
99- return 0 # Need reload
100- fi
101-
102- # Parse JSON to extract current IPs from IMEX status
# NOTE(review): the substitution's exit status is that of the last pipeline stage
# (`tr`) unless pipefail is enabled elsewhere in the script, so a jq failure may
# not reach this error branch — confirm.
103- local current_imex_ips
104- if ! current_imex_ips=$( echo " ${imex_status_output} " | jq -r ' .nodes | to_entries[].value.host' 2> /dev/null | sort | tr ' \n' ' ' ) ; then
105- info " Failed to parse IMEX status JSON, assuming reload needed"
106- return 0 # Need reload
107- fi
108-
109- # Convert expected IPs to sorted space-separated string
110- local expected_ips_sorted
111- expected_ips_sorted=$( echo " ${_expected_ips} " | tr ' ,' ' \n' | sort | tr ' \n' ' ' )
112-
113- info " Current IMEX IPs: ${current_imex_ips} "
114- info " Expected IPs: ${expected_ips_sorted} "
115-
116- # Compare IP lists
# Plain string equality on the two sorted, space-joined lists.
117- if [[ " ${current_imex_ips} " = " ${expected_ips_sorted} " ]]; then
118- info " IMEX service running with correct IPs, skipping reload"
119- return 1 # Skip reload
120- else
121- info " IMEX IPs mismatch, reload needed"
122- return 0 # Need reload
123- fi
124- }
125-
12683function write_file() {
12784 local _file=$1
12885 local _content=$2
@@ -181,32 +138,7 @@ function reload_imex() {
181138 # sed -i "s/SERVER_PORT.*/SERVER_PORT=${NEW_SERVER_PORT}/" "${IMEX_MAIN_CONFIG}"
182139
183140 info " Restarting IMEX"
184- if ! timeout ${IMEX_START_TIMEOUT} systemctl start ${IMEX_SERVICE} ; then
185- error " IMEX service reload failed"
186- return 1
187- fi
188-
189- return 0
190- }
191-
192- function handle_imex_reload() {
193- local _ips_from_cr=$1
194- local _imex_main_config=$2
195- local _reload_reason=$3
196- local _skip_message=$4
197- local _reload_message=$5
198-
199- info " ${_reload_reason} "
200- if check_imex_needs_reload " ${_ips_from_cr} " " ${_imex_main_config} " ; then
201- info " ${_reload_message} "
202- if reload_imex; then
203- info " IMEX has been reloaded"
204- else
205- error " Failed to reload IMEX service"
206- fi
207- else
208- info " ${_skip_message} "
209- fi
141+ timeout ${IMEX_START_TIMEOUT} systemctl start ${IMEX_SERVICE}
210142}
211143
212144function create_default_imex_channel() {
@@ -249,12 +181,10 @@ function create_default_imex_channel() {
249181 info " IMEX Main Config: ${IMEX_MAIN_CONFIG} "
250182 info " IMEX Nodes Config: ${IMEX_NODES_CONFIG} "
251183
252- info " Checking IMEX nodes config ${IMEX_NODES_CONFIG} "
253- if write_file " ${IMEX_NODES_CONFIG} " " ${IPS_FROM_CR} " ; then
254- handle_imex_reload " ${IPS_FROM_CR} " " ${IMEX_MAIN_CONFIG} " " IMEX nodes config updated, checking if reload is needed" " IMEX already configured correctly, skipping reload" " IMEX reload needed, restarting service"
255- else
256- handle_imex_reload " ${IPS_FROM_CR} " " ${IMEX_MAIN_CONFIG} " " IMEX nodes config unchanged, checking if reload is still needed" " IMEX config unchanged and service correctly configured, skipping reload" " IMEX reload needed despite unchanged config, restarting service"
257- fi
184+ info " Updating IMEX nodes config ${IMEX_NODES_CONFIG} "
185+ write_file " ${IMEX_NODES_CONFIG} " " ${IPS_FROM_CR} "
186+
187+ reload_imex
258188
259189 prolog_end
260190
0 commit comments