Skip to content

Commit 955269b

Browse files
authored
Merge pull request #1816 from hjelmn/request_perfm_regression
opal/sync: fix race condition
2 parents 5795682 + a6b3b1f commit 955269b

File tree

2 files changed

+55
-6
lines changed

2 files changed

+55
-6
lines changed

ompi/request/req_wait.c

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Copyright (c) 2012 Oak Ridge National Labs. All rights reserved.
1616
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
1717
* reserved.
18+
* Copyright (c) 2016 Mellanox Technologies. All rights reserved.
1819
* $COPYRIGHT$
1920
*
2021
* Additional copyrights may follow
@@ -116,7 +117,8 @@ int ompi_request_default_wait_any(size_t count,
116117
if (MPI_STATUS_IGNORE != status) {
117118
*status = ompi_status_empty;
118119
}
119-
WAIT_SYNC_RELEASE(&sync);
120+
/* No signal-in-flight can be in this case */
121+
WAIT_SYNC_RELEASE_NOWAIT(&sync);
120122
return rc;
121123
}
122124

@@ -140,6 +142,15 @@ int ompi_request_default_wait_any(size_t count,
140142
*index = i;
141143
}
142144
}
145+
146+
if( *index == completed ){
147+
/* Only one request has triggered. There was no
148+
* in-flight completions.
149+
* Drop the signalled flag so we won't block
150+
* in WAIT_SYNC_RELEASE
151+
*/
152+
WAIT_SYNC_SIGNALLED(&sync);
153+
}
143154

144155
request = requests[*index];
145156
assert( REQUEST_COMPLETE(request) );
@@ -361,7 +372,8 @@ int ompi_request_default_wait_some(size_t count,
361372
ompi_request_t **rptr = NULL;
362373
ompi_request_t *request = NULL;
363374
ompi_wait_sync_t sync;
364-
375+
size_t sync_sets = 0, sync_unsets = 0;
376+
365377
WAIT_SYNC_INIT(&sync, 1);
366378

367379
*outcount = 0;
@@ -386,10 +398,12 @@ int ompi_request_default_wait_some(size_t count,
386398
num_requests_done++;
387399
}
388400
}
401+
sync_sets = count - num_requests_null_inactive - num_requests_done;
389402

390403
if(num_requests_null_inactive == count) {
391404
*outcount = MPI_UNDEFINED;
392-
WAIT_SYNC_RELEASE(&sync);
405+
/* nobody will signall us */
406+
WAIT_SYNC_RELEASE_NOWAIT(&sync);
393407
return rc;
394408
}
395409

@@ -420,6 +434,14 @@ int ompi_request_default_wait_some(size_t count,
420434
num_requests_done++;
421435
}
422436
}
437+
sync_unsets = count - num_requests_null_inactive - num_requests_done;
438+
439+
if( sync_sets == sync_unsets ){
440+
/* nobody knows about us,
441+
* set signa-in-progress flag to false
442+
*/
443+
WAIT_SYNC_SIGNALLED(&sync);
444+
}
423445

424446
WAIT_SYNC_RELEASE(&sync);
425447

opal/threads/wait_sync.h

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* reserved.
66
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
77
* reserved.
8+
* Copyright (c) 2016 Mellanox Technologies. All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -24,26 +25,50 @@ typedef struct ompi_wait_sync_t {
2425
pthread_mutex_t lock;
2526
struct ompi_wait_sync_t *next;
2627
struct ompi_wait_sync_t *prev;
28+
volatile bool signaling;
2729
} ompi_wait_sync_t;
2830

2931
#define REQUEST_PENDING (void*)0L
3032
#define REQUEST_COMPLETED (void*)1L
3133

3234
#define SYNC_WAIT(sync) (opal_using_threads() ? sync_wait_mt (sync) : sync_wait_st (sync))
3335

36+
/* The loop in release handles a race condition between the signaling
37+
* thread and the destruction of the condition variable. The signaling
38+
* member will be set to false after the final signaling thread has
39+
* finished opertating on the sync object. This is done to avoid
40+
* extra atomics in the singalling function and keep it as fast
41+
* as possible. Note that the race window is small so spinning here
42+
* is more optimal than sleeping since this macro is called in
43+
* the critical path. */
3444
#define WAIT_SYNC_RELEASE(sync) \
3545
if (opal_using_threads()) { \
36-
pthread_cond_destroy(&(sync)->condition); \
37-
pthread_mutex_destroy(&(sync)->lock); \
46+
while ((sync)->signaling) { \
47+
continue; \
48+
} \
49+
pthread_cond_destroy(&(sync)->condition); \
50+
pthread_mutex_destroy(&(sync)->lock); \
3851
}
3952

53+
#define WAIT_SYNC_RELEASE_NOWAIT(sync) \
54+
if (opal_using_threads()) { \
55+
pthread_cond_destroy(&(sync)->condition); \
56+
pthread_mutex_destroy(&(sync)->lock); \
57+
}
58+
59+
4060
#define WAIT_SYNC_SIGNAL(sync) \
4161
if (opal_using_threads()) { \
4262
pthread_mutex_lock(&(sync->lock)); \
4363
pthread_cond_signal(&sync->condition); \
4464
pthread_mutex_unlock(&(sync->lock)); \
65+
sync->signaling = false; \
4566
}
4667

68+
#define WAIT_SYNC_SIGNALLED(sync){ \
69+
(sync)->signaling = false; \
70+
}
71+
4772
OPAL_DECLSPEC int sync_wait_mt(ompi_wait_sync_t *sync);
4873
static inline int sync_wait_st (ompi_wait_sync_t *sync)
4974
{
@@ -61,6 +86,7 @@ static inline int sync_wait_st (ompi_wait_sync_t *sync)
6186
(sync)->next = NULL; \
6287
(sync)->prev = NULL; \
6388
(sync)->status = 0; \
89+
(sync)->signaling = true; \
6490
if (opal_using_threads()) { \
6591
pthread_cond_init (&(sync)->condition, NULL); \
6692
pthread_mutex_init (&(sync)->lock, NULL); \
@@ -81,8 +107,9 @@ static inline void wait_sync_update(ompi_wait_sync_t *sync, int updates, int sta
81107
}
82108
} else {
83109
/* this is an error path so just use the atomic */
84-
opal_atomic_swap_32 (&sync->count, 0);
85110
sync->status = OPAL_ERROR;
111+
opal_atomic_wmb ();
112+
opal_atomic_swap_32 (&sync->count, 0);
86113
}
87114
WAIT_SYNC_SIGNAL(sync);
88115
}

0 commit comments

Comments
 (0)