Skip to content

Commit 1237c2a

Browse files
authored
Merge pull request #2442 from ggouaillardet/topic/v2.0.x/libnbc_mt
v2.0.x: coll/libnbc: fix race condition with multi threaded apps
2 parents 4caafba + 8e00474 commit 1237c2a

File tree

3 files changed: 16 additions and 2 deletions.

ompi/mca/coll/libnbc/coll_libnbc.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
1414
* Copyright (c) 2013-2015 Los Alamos National Security, LLC. All rights
1515
* reserved.
16-
* Copyright (c) 2014-2015 Research Organization for Information Science
16+
* Copyright (c) 2014-2016 Research Organization for Information Science
1717
* and Technology (RIST). All rights reserved.
1818
* Copyright (c) 2016 IBM Corporation. All rights reserved.
1919
* $COPYRIGHT$
@@ -75,7 +75,8 @@ struct ompi_coll_libnbc_component_t {
7575
opal_free_list_t requests;
7676
opal_list_t active_requests;
7777
int32_t active_comms;
78-
opal_atomic_lock_t progress_lock;
78+
opal_atomic_lock_t progress_lock; /* protect from recursive calls */
79+
opal_mutex_t lock; /* protect access to the active_requests list */
7980
};
8081
typedef struct ompi_coll_libnbc_component_t ompi_coll_libnbc_component_t;
8182

ompi/mca/coll/libnbc/coll_libnbc_component.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ libnbc_open(void)
9191

9292
OBJ_CONSTRUCT(&mca_coll_libnbc_component.requests, opal_free_list_t);
9393
OBJ_CONSTRUCT(&mca_coll_libnbc_component.active_requests, opal_list_t);
94+
OBJ_CONSTRUCT(&mca_coll_libnbc_component.lock, opal_mutex_t);
9495
ret = opal_free_list_init (&mca_coll_libnbc_component.requests,
9596
sizeof(ompi_coll_libnbc_request_t), 8,
9697
OBJ_CLASS(ompi_coll_libnbc_request_t),
@@ -115,6 +116,7 @@ libnbc_close(void)
115116

116117
OBJ_DESTRUCT(&mca_coll_libnbc_component.requests);
117118
OBJ_DESTRUCT(&mca_coll_libnbc_component.active_requests);
119+
OBJ_DESTRUCT(&mca_coll_libnbc_component.lock);
118120

119121
return OMPI_SUCCESS;
120122
}
@@ -261,15 +263,22 @@ ompi_coll_libnbc_progress(void)
261263
ompi_coll_libnbc_request_t* request, *next;
262264
int res;
263265

266+
/* return if invoked recursively */
264267
if (opal_atomic_trylock(&mca_coll_libnbc_component.progress_lock)) return 0;
265268

269+
/* process active requests, and use mca_coll_libnbc_component.lock to access the
270+
* mca_coll_libnbc_component.active_requests list */
271+
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
266272
OPAL_LIST_FOREACH_SAFE(request, next, &mca_coll_libnbc_component.active_requests,
267273
ompi_coll_libnbc_request_t) {
274+
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
268275
res = NBC_Progress(request);
269276
if( NBC_CONTINUE != res ) {
270277
/* done, remove and complete */
278+
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
271279
opal_list_remove_item(&mca_coll_libnbc_component.active_requests,
272280
&request->super.super.super);
281+
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
273282

274283
if( OMPI_SUCCESS == res || NBC_OK == res || NBC_SUCCESS == res ) {
275284
request->super.req_status.MPI_ERROR = OMPI_SUCCESS;
@@ -281,7 +290,9 @@ ompi_coll_libnbc_progress(void)
281290
ompi_request_complete(&request->super, true);
282291
OPAL_THREAD_UNLOCK(&ompi_request_lock);
283292
}
293+
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
284294
}
295+
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
285296

286297
opal_atomic_unlock(&mca_coll_libnbc_component.progress_lock);
287298

ompi/mca/coll/libnbc/nbc.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,7 +702,9 @@ int NBC_Start(NBC_Handle *handle, NBC_Schedule *schedule) {
702702
if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
703703
return res;
704704
}
705+
OPAL_THREAD_LOCK(&mca_coll_libnbc_component.lock);
705706
opal_list_append(&mca_coll_libnbc_component.active_requests, &(handle->super.super.super));
707+
OPAL_THREAD_UNLOCK(&mca_coll_libnbc_component.lock);
706708

707709
return OMPI_SUCCESS;
708710
}

0 commit comments

Comments
 (0)