@@ -848,66 +848,4 @@ deployment_groups:
848848 cloud_parameters :
849849 prolog_flags : Alloc,DeferBatch,NoHold
850850 switch_type : switch/nvidia_imex
851- prolog_scripts :
852- - filename : 1_imex_prolog.sh
853- content : |
854- #!/usr/bin/env bash
855- if ! systemctl list-unit-files --all | grep -Fq "nvidia-imex.service"; then
856- exit 0
857- fi
858-
859- activate_imex() {
860- set -ex
861-
862- # Clean the config file in case the service gets started by accident
863- > /etc/nvidia-imex/nodes_config.cfg
864-
865- NVIDIA_IMEX_START_TIMEOUT=80
866- IMEX_CONN_WAIT_TIMEOUT=70
867- NVIDIA_IMEX_STOP_TIMEOUT=15
868- IMEX_SERVER_PORT=1101
869- IMEX_CMD_PORT=1102
870-
871- # clean up prev connection
872- set +e
873- timeout $NVIDIA_IMEX_STOP_TIMEOUT systemctl stop nvidia-imex
874- pkill -9 nvidia-imex
875- set -e
876-
877- # update peer list
878- scontrol -a show node "${SLURM_NODELIST}" -o | sed 's/^.* NodeAddr=\([^ ]*\).*/\1/' > /etc/nvidia-imex/nodes_config.cfg
879-
880- sed -i "s/SERVER_PORT.*/SERVER_PORT=${IMEX_SERVER_PORT}/" /etc/nvidia-imex/config.cfg
881-
882- # enable imex-ctl on all nodes so you can query imex status with: nvidia-imex-ctl -a -q
883- sed -i "s/IMEX_CMD_PORT.*/IMEX_CMD_PORT=${IMEX_CMD_PORT}/" /etc/nvidia-imex/config.cfg
884- sed -i "s/IMEX_CMD_ENABLED.*/IMEX_CMD_ENABLED=1/" /etc/nvidia-imex/config.cfg
885-
886- # set timeouts for start
887- sed -i "s/IMEX_CONN_WAIT_TIMEOUT.*/IMEX_CONN_WAIT_TIMEOUT=${IMEX_CONN_WAIT_TIMEOUT}/" /etc/nvidia-imex/config.cfg
888-
889- timeout $NVIDIA_IMEX_START_TIMEOUT systemctl start nvidia-imex
890- }
891- activate_imex > "/var/log/slurm/imex_prolog_${SLURM_JOB_ID}.log" 2>&1
892- epilog_scripts :
893- - filename : 2_imex_epilog.sh
894- content : |
895- #!/usr/bin/env bash
896- set -ex
897-
898- if ! systemctl list-unit-files --all | grep -Fq "nvidia-imex.service"; then
899- exit 0
900- fi
901-
902- # Clean the config file in case the service gets started by accident
903- > /etc/nvidia-imex/nodes_config.cfg
904-
905- NVIDIA_IMEX_STOP_TIMEOUT=30
906-
907- # clean up connection
908- set +e
909-
910- timeout $NVIDIA_IMEX_STOP_TIMEOUT systemctl stop nvidia-imex
911-
912- pkill -9 nvidia-imex
913- set -e
851+ enable_external_prolog_epilog : true
0 commit comments