Skip to content

Commit 707429f

Browse files
authored
Merge pull request #446 from ExaWorks/fix_slurm_nodefile
Fix slurm nodefiles
2 parents a7a1256 + a5d4d59 commit 707429f

File tree

1 file changed

+35
-11
lines changed

1 file changed

+35
-11
lines changed

src/psij/executors/batch/slurm/slurm.mustache

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/bin/bash
22

3+
34
{{#job.name}}
45
#SBATCH --job-name="{{.}}"
56
{{/job.name}}
@@ -67,21 +68,44 @@ only results in empty files that are not cleaned up}}
6768
#SBATCH -e /dev/null
6869
#SBATCH -o /dev/null
6970

70-
PSIJ_NODEFILE="{{psij.script_dir}}/$SLURM_JOB_ID.nodefile"
71-
scontrol show hostnames >"$PSIJ_NODEFILE"
72-
export PSIJ_NODEFILE
73-
71+
{{#job.spec.inherit_environment}}
72+
#SBATCH --export=ALL
73+
{{/job.spec.inherit_environment}}
74+
{{^job.spec.inherit_environment}}
75+
#SBATCH --export=NONE
76+
{{/job.spec.inherit_environment}}
7477

7578
{{#env}}
7679
export {{name}}={{value}}
7780
{{/env}}
7881

79-
{{#job.spec.inherit_environment}}
80-
#SBATCH --export=ALL{{#env}},{{name}}{{/env}}
81-
{{/job.spec.inherit_environment}}
82-
{{^job.spec.inherit_environment}}
83-
#SBATCH --export={{#env}}{{name}},{{/env}}
84-
{{/job.spec.inherit_environment}}
82+
{{#job.spec.resources}}
83+
{{#process_count}}
84+
_PSIJ_PC={{.}}
85+
{{/process_count}}
86+
{{#processes_per_node}}
87+
_PSIJ_PPN={{.}}
88+
{{/processes_per_node}}
89+
{{/job.spec.resources}}
90+
91+
_PSIJ_NC=`scontrol show hostnames | wc -l`
92+
93+
{{!Unlike PBS, Slurm only lists the nodes once in the nodelist, so, to bring it to uniform PBS
94+
form, we need to duplicate each node line by PPN, which we need to calculate}}
95+
# Derive processes-per-node (_PSIJ_PPN) when the job spec did not set it:
# divide the total process count (_PSIJ_PC) by the number of allocated nodes
# (_PSIJ_NC). The division is skipped when the process count is missing or when
# no nodes were counted (avoids a division by zero); _PSIJ_PPN then stays empty.
if [ "$_PSIJ_PPN" == "" ]; then
  if [ "$_PSIJ_NC" != "" ] && [ "$_PSIJ_PC" != "" ] && [ "$_PSIJ_NC" -gt 0 ]; then
    # The assignment target must be the bare name; a leading "$" would turn
    # this line into a command invocation and leave _PSIJ_PPN unset.
    _PSIJ_PPN=$((_PSIJ_PC / _PSIJ_NC))
  fi
fi
100+
101+
PSIJ_NODEFILE="{{psij.script_dir}}/$SLURM_JOB_ID.nodefile"
102+
if [ "$_PSIJ_PPN" == "" ]; then
103+
scontrol show hostnames >"$PSIJ_NODEFILE"
104+
else
105+
scontrol show hostnames | while read NODE; do for _ in $(seq 1 1 $_PSIJ_PPN); do echo "$NODE"; done; done > "$PSIJ_NODEFILE"
106+
fi
107+
export PSIJ_NODEFILE
108+
85109

86110

87111
{{!redirect output here instead of through #SBATCH directive since SLURM_JOB_ID is not available
@@ -92,4 +116,4 @@ exec &>> "{{psij.script_dir}}/$SLURM_JOB_ID.out"
92116
{{#psij.launch_command}}{{.}} {{/psij.launch_command}}
93117

94118
{{!we redirect to a file tied to the native ID so that we can reach the file with attach().}}
95-
echo "$?" > "{{psij.script_dir}}/$SLURM_JOB_ID.ec"
119+
echo "$?" > "{{psij.script_dir}}/$SLURM_JOB_ID.ec"

0 commit comments

Comments
 (0)