Skip to content

Commit a33af68

Browse files
committed
add_procs: add threading protection for dynamic add_procs
This commit add protection to the group, ob1, and bml endpoint lookup code. For ob1 and the bml a lock has been added. For performance reasons the lock is only held if a bml or ob1 endpoint does not exist. ompi_group_dense_lookup no uses opal_atomic_cmpset to ensure the proc is only retained by the thread that actually updates the group. Signed-off-by: Nathan Hjelm <[email protected]> (cherry picked from open-mpi/ompi@08e267b)
1 parent c4245d6 commit a33af68

File tree

5 files changed

+36
-13
lines changed

5 files changed

+36
-13
lines changed

ompi/group/group.h

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -333,25 +333,34 @@ int ompi_group_minloc (int list[], int length);
333333
*/
334334
static inline struct ompi_proc_t *ompi_group_dense_lookup (ompi_group_t *group, const int peer_id, const bool allocate)
335335
{
336+
ompi_proc_t *proc;
337+
336338
#if OPAL_ENABLE_DEBUG
337339
if (peer_id >= group->grp_proc_count) {
338340
opal_output(0, "ompi_group_dense_lookup: invalid peer index (%d)", peer_id);
339341
return (struct ompi_proc_t *) NULL;
340342
}
341343
#endif
342344

343-
if (OPAL_UNLIKELY(ompi_proc_is_sentinel (group->grp_proc_pointers[peer_id]))) {
345+
proc = group->grp_proc_pointers[peer_id];
346+
347+
if (OPAL_UNLIKELY(ompi_proc_is_sentinel (proc))) {
344348
if (!allocate) {
345349
return NULL;
346350
}
347351

348352
/* replace sentinel value with an actual ompi_proc_t */
349-
group->grp_proc_pointers[peer_id] =
350-
(ompi_proc_t *) ompi_proc_for_name (ompi_proc_sentinel_to_name ((intptr_t) group->grp_proc_pointers[peer_id]));
351-
OBJ_RETAIN(group->grp_proc_pointers[peer_id]);
353+
ompi_proc_t *real_proc =
354+
(ompi_proc_t *) ompi_proc_for_name (ompi_proc_sentinel_to_name ((intptr_t) proc));
355+
356+
if (opal_atomic_cmpset_ptr (group->grp_proc_pointers + peer_id, proc, real_proc)) {
357+
OBJ_RETAIN(real_proc);
358+
}
359+
360+
proc = real_proc;
352361
}
353362

354-
return group->grp_proc_pointers[peer_id];
363+
return proc;
355364
}
356365

357366
/*

ompi/mca/bml/base/base.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,15 @@ OMPI_DECLSPEC int mca_bml_base_ft_event(int state);
6363
OMPI_DECLSPEC extern mca_bml_base_component_t mca_bml_component;
6464
OMPI_DECLSPEC extern mca_bml_base_module_t mca_bml;
6565
OMPI_DECLSPEC extern mca_base_framework_t ompi_bml_base_framework;
66+
OMPI_DECLSPEC extern opal_mutex_t mca_bml_lock;
6667

6768
static inline struct mca_bml_base_endpoint_t *mca_bml_base_get_endpoint (struct ompi_proc_t *proc) {
6869
if (OPAL_UNLIKELY(NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML])) {
69-
mca_bml.bml_add_proc (proc);
70+
OPAL_THREAD_LOCK(&mca_bml_lock);
71+
if (NULL == proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
72+
mca_bml.bml_add_proc (proc);
73+
}
74+
OPAL_THREAD_UNLOCK(&mca_bml_lock);
7075
}
7176

7277
return (struct mca_bml_base_endpoint_t *) proc->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML];

ompi/mca/bml/base/bml_base_frame.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ static bool mca_bml_base_srand;
5050
opal_rng_buff_t mca_bml_base_rand_buff;
5151
#endif
5252

53+
opal_mutex_t mca_bml_lock = OPAL_MUTEX_STATIC_INIT;
54+
5355
static int mca_bml_base_register(mca_base_register_flag_t flags)
5456
{
5557
#if OPAL_ENABLE_DEBUG_RELIABILITY

ompi/mca/pml/ob1/pml_ob1_comm.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ static void mca_pml_ob1_comm_construct(mca_pml_ob1_comm_t* comm)
5555
{
5656
OBJ_CONSTRUCT(&comm->wild_receives, opal_list_t);
5757
OBJ_CONSTRUCT(&comm->matching_lock, opal_mutex_t);
58+
OBJ_CONSTRUCT(&comm->proc_lock, opal_mutex_t);
5859
comm->recv_sequence = 0;
5960
comm->procs = NULL;
6061
comm->last_probed = 0;
@@ -76,6 +77,7 @@ static void mca_pml_ob1_comm_destruct(mca_pml_ob1_comm_t* comm)
7677

7778
OBJ_DESTRUCT(&comm->wild_receives);
7879
OBJ_DESTRUCT(&comm->matching_lock);
80+
OBJ_DESTRUCT(&comm->proc_lock);
7981
}
8082

8183

ompi/mca/pml/ob1/pml_ob1_comm.h

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
34
* University Research and Technology
@@ -9,6 +10,8 @@
910
* University of Stuttgart. All rights reserved.
1011
* Copyright (c) 2004-2005 The Regents of the University of California.
1112
* All rights reserved.
13+
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
14+
* reserved.
1215
* $COPYRIGHT$
1316
*
1417
* Additional copyrights may follow
@@ -51,13 +54,10 @@ OBJ_CLASS_DECLARATION(mca_pml_ob1_comm_proc_t);
5154
*/
5255
struct mca_pml_comm_t {
5356
opal_object_t super;
54-
#if OPAL_ENABLE_MULTI_THREADS
5557
volatile uint32_t recv_sequence; /**< recv request sequence number - receiver side */
56-
#else
57-
uint32_t recv_sequence; /**< recv request sequence number - receiver side */
58-
#endif
5958
opal_mutex_t matching_lock; /**< matching lock */
6059
opal_list_t wild_receives; /**< queue of unmatched wild (source process not specified) receives */
60+
opal_mutex_t proc_lock;
6161
mca_pml_ob1_comm_proc_t **procs;
6262
size_t num_procs;
6363
size_t last_probed;
@@ -71,9 +71,14 @@ static inline mca_pml_ob1_comm_proc_t *mca_pml_ob1_peer_lookup (struct ompi_comm
7171
mca_pml_ob1_comm_t *pml_comm = (mca_pml_ob1_comm_t *)comm->c_pml_comm;
7272

7373
if (OPAL_UNLIKELY(NULL == pml_comm->procs[rank])) {
74-
pml_comm->procs[rank] = OBJ_NEW(mca_pml_ob1_comm_proc_t);
75-
pml_comm->procs[rank]->ompi_proc = ompi_comm_peer_lookup (comm, rank);
76-
OBJ_RETAIN(pml_comm->procs[rank]->ompi_proc);
74+
OPAL_THREAD_LOCK(&pml_comm->proc_lock);
75+
if (NULL == pml_comm->procs[rank]) {
76+
pml_comm->procs[rank] = OBJ_NEW(mca_pml_ob1_comm_proc_t);
77+
pml_comm->procs[rank]->ompi_proc = ompi_comm_peer_lookup (comm, rank);
78+
OBJ_RETAIN(pml_comm->procs[rank]->ompi_proc);
79+
opal_atomic_wmb ();
80+
}
81+
OPAL_THREAD_UNLOCK(&pml_comm->proc_lock);
7782
}
7883

7984
return pml_comm->procs[rank];

0 commit comments

Comments
 (0)