Skip to content

Commit 3f96cb9

Browse files
committed
remove duplicate embedding of prolog/epilog scripts
1 parent d876a1f commit 3f96cb9

File tree

1 file changed

+1
-63
lines changed

1 file changed

+1
-63
lines changed

examples/machine-learning/a4x-maxgpu-4g-metal/a4xmax-bm-slurm-blueprint.yaml

Lines changed: 1 addition & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -848,66 +848,4 @@ deployment_groups:
848 848
cloud_parameters:
849 849
prolog_flags: Alloc,DeferBatch,NoHold
850 850
switch_type: switch/nvidia_imex
851-
prolog_scripts:
852-
- filename: 1_imex_prolog.sh
853-
content: |
854-
#!/usr/bin/env bash
855-
if ! systemctl list-unit-files --all | grep -Fq "nvidia-imex.service"; then
856-
exit 0
857-
fi
858-
859-
activate_imex() {
860-
set -ex
861-
862-
# Clean the config file in case the service gets started by accident
863-
> /etc/nvidia-imex/nodes_config.cfg
864-
865-
NVIDIA_IMEX_START_TIMEOUT=80
866-
IMEX_CONN_WAIT_TIMEOUT=70
867-
NVIDIA_IMEX_STOP_TIMEOUT=15
868-
IMEX_SERVER_PORT=1101
869-
IMEX_CMD_PORT=1102
870-
871-
# clean up prev connection
872-
set +e
873-
timeout $NVIDIA_IMEX_STOP_TIMEOUT systemctl stop nvidia-imex
874-
pkill -9 nvidia-imex
875-
set -e
876-
877-
# update peer list
878-
scontrol -a show node "${SLURM_NODELIST}" -o | sed 's/^.* NodeAddr=\([^ ]*\).*/\1/' > /etc/nvidia-imex/nodes_config.cfg
879-
880-
sed -i "s/SERVER_PORT.*/SERVER_PORT=${IMEX_SERVER_PORT}/" /etc/nvidia-imex/config.cfg
881-
882-
# enable imex-ctl on all nodes so you can query imex status with: nvidia-imex-ctl -a -q
883-
sed -i "s/IMEX_CMD_PORT.*/IMEX_CMD_PORT=${IMEX_CMD_PORT}/" /etc/nvidia-imex/config.cfg
884-
sed -i "s/IMEX_CMD_ENABLED.*/IMEX_CMD_ENABLED=1/" /etc/nvidia-imex/config.cfg
885-
886-
# set timeouts for start
887-
sed -i "s/IMEX_CONN_WAIT_TIMEOUT.*/IMEX_CONN_WAIT_TIMEOUT=${IMEX_CONN_WAIT_TIMEOUT}/" /etc/nvidia-imex/config.cfg
888-
889-
timeout $NVIDIA_IMEX_START_TIMEOUT systemctl start nvidia-imex
890-
}
891-
activate_imex > "/var/log/slurm/imex_prolog_${SLURM_JOB_ID}.log" 2>&1
892-
epilog_scripts:
893-
- filename: 2_imex_epilog.sh
894-
content: |
895-
#!/usr/bin/env bash
896-
set -ex
897-
898-
if ! systemctl list-unit-files --all | grep -Fq "nvidia-imex.service"; then
899-
exit 0
900-
fi
901-
902-
# Clean the config file in case the service gets started by accident
903-
> /etc/nvidia-imex/nodes_config.cfg
904-
905-
NVIDIA_IMEX_STOP_TIMEOUT=30
906-
907-
# clean up connection
908-
set +e
909-
910-
timeout $NVIDIA_IMEX_STOP_TIMEOUT systemctl stop nvidia-imex
911-
912-
pkill -9 nvidia-imex
913-
set -e
851+
enable_external_prolog_epilog: true

0 commit comments

Comments
 (0)