Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -848,66 +848,4 @@
# Slurm-level options handed to the controller module.
# NOTE(review): presumably rendered into slurm.conf as PrologFlags and
# SwitchType - confirm against the controller module's documentation.
cloud_parameters:
  # Comma-separated list of prolog flags, passed through as a single string.
  prolog_flags: Alloc,DeferBatch,NoHold
  # Switch plugin selection; the IMEX prolog/epilog scripts in this file manage
  # the matching nvidia-imex service.
  switch_type: switch/nvidia_imex
# Prolog hook: rebuilds the NVIDIA IMEX peer list from the job's nodes, patches
# /etc/nvidia-imex/config.cfg and (re)starts the nvidia-imex service.
# Output of the whole run is captured per job under /var/log/slurm/.
prolog_scripts:
  - filename: 1_imex_prolog.sh
    content: |
      #!/usr/bin/env bash
      # Configure and start nvidia-imex for the nodes listed in SLURM_NODELIST.

      # Nothing to do on nodes where the IMEX unit is not installed.
      if ! systemctl list-unit-files --all | grep -Fq "nvidia-imex.service"; then
        exit 0
      fi

      activate_imex() {
        set -ex
        # pipefail is enabled only here, after the unit check above: that
        # check uses `grep -q`, which can close the pipe early, and must not
        # be treated as a failure. With pipefail, a failing `scontrol` in the
        # peer-list pipeline below aborts the prolog instead of silently
        # producing an empty peer list.
        set -o pipefail

        # Clean the config file in case the service gets started by accident
        > /etc/nvidia-imex/nodes_config.cfg

        # Limits passed to timeout(1) around systemctl start/stop.
        NVIDIA_IMEX_START_TIMEOUT=80
        NVIDIA_IMEX_STOP_TIMEOUT=15
        # Values written into /etc/nvidia-imex/config.cfg.
        IMEX_CONN_WAIT_TIMEOUT=70
        IMEX_SERVER_PORT=1101
        IMEX_CMD_PORT=1102

        # Clean up the previous connection. Best effort: the service may
        # already be stopped and pkill may match nothing.
        set +e
        timeout "$NVIDIA_IMEX_STOP_TIMEOUT" systemctl stop nvidia-imex
        pkill -9 nvidia-imex
        set -e

        # Update the peer list: one NodeAddr per node of this job.
        scontrol -a show node "${SLURM_NODELIST}" -o \
          | sed 's/^.* NodeAddr=\([^ ]*\).*/\1/' \
          > /etc/nvidia-imex/nodes_config.cfg

        sed -i "s/SERVER_PORT.*/SERVER_PORT=${IMEX_SERVER_PORT}/" /etc/nvidia-imex/config.cfg

        # Enable imex-ctl on all nodes so you can query imex status with:
        #   nvidia-imex-ctl -a -q
        sed -i "s/IMEX_CMD_PORT.*/IMEX_CMD_PORT=${IMEX_CMD_PORT}/" /etc/nvidia-imex/config.cfg
        sed -i "s/IMEX_CMD_ENABLED.*/IMEX_CMD_ENABLED=1/" /etc/nvidia-imex/config.cfg

        # Set timeouts for start
        sed -i "s/IMEX_CONN_WAIT_TIMEOUT.*/IMEX_CONN_WAIT_TIMEOUT=${IMEX_CONN_WAIT_TIMEOUT}/" \
          /etc/nvidia-imex/config.cfg

        timeout "$NVIDIA_IMEX_START_TIMEOUT" systemctl start nvidia-imex
      }

      # Capture stdout, stderr and the xtrace output in a per-job log file.
      activate_imex > "/var/log/slurm/imex_prolog_${SLURM_JOB_ID}.log" 2>&1
# Epilog hook: empties the NVIDIA IMEX peer list and stops the nvidia-imex
# service once the job is done.
epilog_scripts:
  - filename: 2_imex_epilog.sh
    content: |
      #!/usr/bin/env bash
      set -ex

      # Limit passed to timeout(1) around the graceful stop.
      NVIDIA_IMEX_STOP_TIMEOUT=30

      # Leave immediately on nodes where the IMEX unit is not installed.
      systemctl list-unit-files --all | grep -Fq "nvidia-imex.service" || exit 0

      # Empty the peer list in case the service gets started by accident.
      : > /etc/nvidia-imex/nodes_config.cfg

      # Tear down the connection. Both steps are best effort: a failed stop or
      # a pkill that matches nothing must not fail the epilog.
      timeout $NVIDIA_IMEX_STOP_TIMEOUT systemctl stop nvidia-imex || true
      pkill -9 nvidia-imex || true
# NOTE(review): presumably required for externally supplied prolog/epilog
# scripts to be executed on the nodes - confirm against the controller
# module's documentation.
enable_external_prolog_epilog: true

Check failure on line 851 in examples/machine-learning/a4x-maxgpu-4g-metal/a4xmax-bm-slurm-blueprint.yaml

View workflow job for this annotation

GitHub Actions / pre-commit-highest-dependencies

851:42 [new-line-at-end-of-file] no new line character at the end of file

Check failure on line 851 in examples/machine-learning/a4x-maxgpu-4g-metal/a4xmax-bm-slurm-blueprint.yaml

View workflow job for this annotation

GitHub Actions / pre-commit

851:42 [new-line-at-end-of-file] no new line character at the end of file
Loading