File tree Expand file tree Collapse file tree 2 files changed +9
-4
lines changed Expand file tree Collapse file tree 2 files changed +9
-4
lines changed Original file line number Diff line number Diff line change @@ -40,12 +40,12 @@ bool mca_common_pmi_init (void) {
4040 {
4141 int spawned , size , rank , appnum ;
4242 int rc ;
43+ char buf [PMI2_MAX_VALLEN ];
4344
4445 size = -1 ;
4546 rank = -1 ;
4647 appnum = -1 ;
4748
48-
4949 /* if we can't startup PMI, we can't be used */
5050 if (PMI2_Initialized ()) {
5151 return true;
@@ -56,11 +56,16 @@ bool mca_common_pmi_init (void) {
5656 mca_common_pmi_init_count -- ;
5757 return false;
5858 }
59- if (size < 0 || rank < 0 ) {
60- opal_show_help ("help-common-pmi.txt" , "pmi2-init-returned-bad-values" , true);
59+ /* depending on slurm versions, we may get bad rank/size or bad jobid */
60+ if (size < 0 || rank < 0 || PMI2_SUCCESS != PMI2_Job_GetId (buf , PMI2_MAX_VALLEN )) {
61+ /* When no srun (singleton) fail quietly */
62+ if (NULL != getenv ("SLURM_STEP_NUM_TASKS" )) {
63+ opal_show_help ("help-common-pmi.txt" , "pmi2-init-returned-bad-values" , true);
64+ }
6165 mca_common_pmi_init_count -- ;
6266 return false;
6367 }
68+
6469 mca_common_pmi_init_size = size ;
6570 mca_common_pmi_init_rank = rank ;
6671 mca_common_pmi_init_count -- ;
Original file line number Diff line number Diff line change @@ -13,7 +13,7 @@ We cannot use PMI2 at this time, and your job will
1313likely abort.
1414#
1515[pmi2-init-returned-bad-values]
16- PMI2 initialized but returned bad values for size and rank.
16+ PMI2 initialized but returned bad values for size/rank/jobid.
1717This is symptomatic of either a failure to use the
1818"--mpi=pmi2" flag in SLURM, or a borked PMI2 installation.
1919If running under SLURM, try adding "--mpi=pmi2" to your
You can’t perform that action at this time.
0 commit comments