Skip to content

Commit f07f446

Browse files
authored
Move env vars after node vars in Slurm template (#148)
* Move env vars block after head node configuration to allow usage of head node variables in user-specific env vars

  Signed-off-by: Hemil Desai <[email protected]>

* Fix tests

  Signed-off-by: Hemil Desai <[email protected]>

---------

Signed-off-by: Hemil Desai <[email protected]>
1 parent 7b13fb3 commit f07f446

File tree

8 files changed

+11
-10
lines changed

8 files changed

+11
-10
lines changed

src/nemo_run/core/execution/templates/slurm.sh.j2

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,9 +15,6 @@ set -evx
1515
export PYTHONUNBUFFERED=1
1616
export SLURM_UNBUFFEREDIO=1
1717
export TORCHX_MAX_RETRIES={{max_retries}}
18-
{%- for env_var in env_vars %}
19-
{{env_var}}
20-
{%- endfor %}
2118

2219
set +e
2320

@@ -33,6 +30,10 @@ head_node=${nodes_array[0]}
3330
{%- endfor %}
3431
{% endif %}
3532

33+
{%- for env_var in env_vars %}
34+
{{env_var}}
35+
{%- endfor %}
36+
3637
{%- if setup_lines %}
3738
{{setup_lines}}
3839
{%- endif %}

test/core/execution/artifacts/dummy_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ set -evx
1818
export PYTHONUNBUFFERED=1
1919
export SLURM_UNBUFFEREDIO=1
2020
export TORCHX_MAX_RETRIES=3
21-
export ENV_VAR=value
2221

2322
set +e
2423

@@ -29,6 +28,7 @@ nodes_array=($nodes)
2928
head_node=${nodes_array[0]}
3029
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
3130

31+
export ENV_VAR=value
3232

3333

3434
# Command 1

test/core/execution/artifacts/ft_het_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,6 @@ set -evx
2626
export PYTHONUNBUFFERED=1
2727
export SLURM_UNBUFFEREDIO=1
2828
export TORCHX_MAX_RETRIES=3
29-
export ENV_VAR=value
3029

3130
set +e
3231

@@ -41,6 +40,7 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
4140
het_group_host_0=$(scontrol show hostnames=$SLURM_JOB_NODELIST_HET_GROUP_0 | head -n1)
4241
het_group_host_1=$(scontrol show hostnames=$SLURM_JOB_NODELIST_HET_GROUP_1 | head -n1)
4342

43+
export ENV_VAR=value
4444
# This script uses experimental fault tolerance launcher
4545
# Fault tolerance related items
4646
export FAULT_TOL_CFG_PATH="/root/experiment/sample_job/sample_job_ft_cfg.yml"

test/core/execution/artifacts/ft_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ set -evx
1818
export PYTHONUNBUFFERED=1
1919
export SLURM_UNBUFFEREDIO=1
2020
export TORCHX_MAX_RETRIES=3
21-
export ENV_VAR=value
2221

2322
set +e
2423

@@ -29,6 +28,7 @@ nodes_array=($nodes)
2928
head_node=${nodes_array[0]}
3029
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
3130

31+
export ENV_VAR=value
3232
# This script uses experimental fault tolerance launcher
3333
# Fault tolerance related items
3434
export FAULT_TOL_CFG_PATH="/root/sample_job/sample_job_ft_cfg.yml"

test/core/execution/artifacts/group_resource_req_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@ set -evx
2020
export PYTHONUNBUFFERED=1
2121
export SLURM_UNBUFFEREDIO=1
2222
export TORCHX_MAX_RETRIES=3
23-
export ENV_VAR=value
2423

2524
set +e
2625

@@ -31,6 +30,7 @@ nodes_array=($nodes)
3130
head_node=${nodes_array[0]}
3231
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
3332

33+
export ENV_VAR=value
3434

3535

3636
# Command 1

test/core/execution/artifacts/group_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@ set -evx
2020
export PYTHONUNBUFFERED=1
2121
export SLURM_UNBUFFEREDIO=1
2222
export TORCHX_MAX_RETRIES=3
23-
export ENV_VAR=value
2423

2524
set +e
2625

@@ -31,6 +30,7 @@ nodes_array=($nodes)
3130
head_node=${nodes_array[0]}
3231
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
3332

33+
export ENV_VAR=value
3434

3535

3636
# Command 1

test/core/execution/artifacts/group_slurm_no_monitor.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@ set -evx
2020
export PYTHONUNBUFFERED=1
2121
export SLURM_UNBUFFEREDIO=1
2222
export TORCHX_MAX_RETRIES=3
23-
export ENV_VAR=value
2423

2524
set +e
2625

@@ -31,6 +30,7 @@ nodes_array=($nodes)
3130
head_node=${nodes_array[0]}
3231
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
3332

33+
export ENV_VAR=value
3434

3535

3636
# Command 1

test/core/execution/artifacts/het_slurm.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,7 +30,6 @@ set -evx
3030
export PYTHONUNBUFFERED=1
3131
export SLURM_UNBUFFEREDIO=1
3232
export TORCHX_MAX_RETRIES=3
33-
export ENV_VAR=value
3433

3534
set +e
3635

@@ -45,6 +44,7 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
4544
het_group_host_0=$(scontrol show hostnames=$SLURM_JOB_NODELIST_HET_GROUP_0 | head -n1)
4645
het_group_host_1=$(scontrol show hostnames=$SLURM_JOB_NODELIST_HET_GROUP_1 | head -n1)
4746

47+
export ENV_VAR=value
4848

4949

5050
# Command 1

0 commit comments

Comments
 (0)