Skip to content

Commit 44f01b4

Browse files
authored
Merge pull request #1858 from artpol84/fix_pmix_slurm
opal/pmix: add blocking fence to SLURM components
2 parents c8b1c6c + 72585a9 commit 44f01b4

File tree

2 files changed

+64
-2
lines changed

2 files changed

+64
-2
lines changed

opal/mca/pmix/s1/pmix_s1.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ static int s1_abort(int flag, const char msg[],
3838
static int s1_commit(void);
3939
static int s1_fencenb(opal_list_t *procs, int collect_data,
4040
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
41+
static int s1_fence(opal_list_t *procs, int collect_data);
4142
static int s1_put(opal_pmix_scope_t scope,
4243
opal_value_t *kv);
4344
static int s1_get(const opal_process_name_t *id,
@@ -61,6 +62,7 @@ const opal_pmix_base_module_t opal_pmix_s1_module = {
6162
.abort = s1_abort,
6263
.commit = s1_commit,
6364
.fence_nb = s1_fencenb,
65+
.fence = s1_fence,
6466
.put = s1_put,
6567
.get = s1_get,
6668
.publish = s1_publish,
@@ -527,7 +529,7 @@ static int s1_commit(void)
527529
static void fencenb(int sd, short args, void *cbdata)
528530
{
529531
pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
530-
int rc;
532+
int rc = OPAL_SUCCESS;
531533
int32_t i;
532534
opal_value_t *kp, kvn;
533535
opal_hwloc_locality_t locality;
@@ -617,6 +619,35 @@ static int s1_fencenb(opal_list_t *procs, int collect_data,
617619
return OPAL_SUCCESS;
618620
}
619621

622+
/*
 * Poll until the expression `pending` evaluates to zero.
 *
 * The expression is re-evaluated on every pass, so it has to name
 * something that can change while we wait (s1_fence() passes the
 * volatile `flag` member of its fence_result).  Between checks the
 * caller sleeps via usleep(10) instead of spinning flat out.
 */
#define S1_WAIT_FOR_COMPLETION(pending)     \
    do {                                    \
        while (0 != (pending)) {            \
            usleep(10);                     \
        }                                   \
    } while (0)
628+
629+
/*
 * Completion record handed to s1_fencenb() as callback data by the
 * blocking s1_fence() wrapper.
 *
 * Member order matters: s1_fence() initializes this struct positionally
 * as { 1, OPAL_SUCCESS }.
 */
struct fence_result {
    /* non-zero while the fence is outstanding; fence_release() sets it
     * to 0.  volatile because the waiter re-reads it in a polling loop. */
    volatile int flag;
    /* status value passed to fence_release(); returned by s1_fence() */
    int status;
};
633+
634+
static void fence_release(int status, void *cbdata)
635+
{
636+
struct fence_result *res = (struct fence_result*)cbdata;
637+
res->status = status;
638+
opal_atomic_wmb();
639+
res->flag = 0;
640+
}
641+
642+
/*
 * Blocking fence, implemented on top of the non-blocking s1_fencenb().
 *
 * procs, collect_data: forwarded unchanged to s1_fencenb()
 *
 * Returns the error code of s1_fencenb() if the request could not be
 * started, otherwise the status delivered to fence_release().
 */
static int s1_fence(opal_list_t *procs, int collect_data)
{
    /* flag starts at 1 ("pending"); fence_release() drops it to 0 */
    struct fence_result result = { 1, OPAL_SUCCESS };
    int rc;

    rc = s1_fencenb(procs, collect_data, fence_release, (void*)&result);
    if (OPAL_SUCCESS != rc) {
        /* The request was not accepted.  Waiting here would poll a flag
         * that nothing may ever clear, so report the failure instead.
         * NOTE(review): assumes a failing s1_fencenb() never invokes the
         * callback afterwards - confirm against its implementation. */
        return rc;
    }

    /* `result` lives on this stack frame, so we must not return before
     * the callback has run */
    S1_WAIT_FOR_COMPLETION(result.flag);

    /* NOTE(review): fence_release() issues a write barrier before
     * clearing the flag; if the callback can run on another thread a
     * matching read barrier before this load may be needed - confirm. */
    return result.status;
}
649+
650+
620651
static int s1_get(const opal_process_name_t *id,
621652
const char *key, opal_list_t *info,
622653
opal_value_t **kv)

opal/mca/pmix/s2/pmix_s2.c

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ static int s2_abort(int flag, const char msg[],
4545
static int s2_commit(void);
4646
static int s2_fencenb(opal_list_t *procs, int collect_data,
4747
opal_pmix_op_cbfunc_t cbfunc, void *cbdata);
48+
static int s2_fence(opal_list_t *procs, int collect_data);
4849
static int s2_put(opal_pmix_scope_t scope,
4950
opal_value_t *kv);
5051
static int s2_get(const opal_process_name_t *id,
@@ -68,6 +69,7 @@ const opal_pmix_base_module_t opal_pmix_s2_module = {
6869
.abort = s2_abort,
6970
.commit = s2_commit,
7071
.fence_nb = s2_fencenb,
72+
.fence = s2_fence,
7173
.put = s2_put,
7274
.get = s2_get,
7375
.publish = s2_publish,
@@ -545,7 +547,7 @@ static int s2_commit(void)
545547
static void fencenb(int sd, short args, void *cbdata)
546548
{
547549
pmi_opcaddy_t *op = (pmi_opcaddy_t*)cbdata;
548-
int rc;
550+
int rc = OPAL_SUCCESS;
549551
int32_t i;
550552
opal_value_t *kp, kvn;
551553
opal_hwloc_locality_t locality;
@@ -635,6 +637,35 @@ static int s2_fencenb(opal_list_t *procs, int collect_data,
635637
return OPAL_SUCCESS;
636638
}
637639

640+
/*
 * Poll until the expression `pending` evaluates to zero.
 *
 * The expression is re-evaluated on every pass, so it has to name
 * something that can change while we wait (s2_fence() passes the
 * volatile `flag` member of its fence_result).  Between checks the
 * caller sleeps via usleep(10) instead of spinning flat out.
 */
#define S2_WAIT_FOR_COMPLETION(pending)     \
    do {                                    \
        while (0 != (pending)) {            \
            usleep(10);                     \
        }                                   \
    } while (0)
646+
647+
/*
 * Completion record handed to s2_fencenb() as callback data by the
 * blocking s2_fence() wrapper.
 *
 * Member order matters: s2_fence() initializes this struct positionally
 * as { 1, OPAL_SUCCESS }.
 */
struct fence_result {
    /* non-zero while the fence is outstanding; fence_release() sets it
     * to 0.  volatile because the waiter re-reads it in a polling loop. */
    volatile int flag;
    /* status value passed to fence_release(); returned by s2_fence() */
    int status;
};
651+
652+
static void fence_release(int status, void *cbdata)
653+
{
654+
struct fence_result *res = (struct fence_result*)cbdata;
655+
res->status = status;
656+
opal_atomic_wmb();
657+
res->flag = 0;
658+
}
659+
660+
/*
 * Blocking fence, implemented on top of the non-blocking s2_fencenb().
 *
 * procs, collect_data: forwarded unchanged to s2_fencenb()
 *
 * Returns the error code of s2_fencenb() if the request could not be
 * started, otherwise the status delivered to fence_release().
 */
static int s2_fence(opal_list_t *procs, int collect_data)
{
    /* flag starts at 1 ("pending"); fence_release() drops it to 0 */
    struct fence_result result = { 1, OPAL_SUCCESS };
    int rc;

    rc = s2_fencenb(procs, collect_data, fence_release, (void*)&result);
    if (OPAL_SUCCESS != rc) {
        /* The request was not accepted.  Waiting here would poll a flag
         * that nothing may ever clear, so report the failure instead.
         * NOTE(review): assumes a failing s2_fencenb() never invokes the
         * callback afterwards - confirm against its implementation. */
        return rc;
    }

    /* `result` lives on this stack frame, so we must not return before
     * the callback has run */
    S2_WAIT_FOR_COMPLETION(result.flag);

    /* NOTE(review): fence_release() issues a write barrier before
     * clearing the flag; if the callback can run on another thread a
     * matching read barrier before this load may be needed - confirm. */
    return result.status;
}
667+
668+
638669
static int s2_get(const opal_process_name_t *id,
639670
const char *key, opal_list_t *info,
640671
opal_value_t **kv)

0 commit comments

Comments
 (0)