Skip to content

Commit 6e60cf7

Browse files
committed
ompi request handling race condition fix (MT-case)
Described in #1813
1 parent dc8b758 commit 6e60cf7

File tree

1 file changed

+44
-6
lines changed

1 file changed

+44
-6
lines changed

opal/threads/wait_sync.h

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* reserved.
66
* Copyright (c) 2016 Los Alamos National Security, LLC. All rights
77
* reserved.
8+
* Copyright (c) 2016 Mellanox Technologies. All rights reserved.
89
* $COPYRIGHT$
910
*
1011
* Additional copyrights may follow
@@ -13,6 +14,8 @@
1314
*/
1415
#include "opal/sys/atomic.h"
1516
#include "opal/threads/condition.h"
17+
#include "opal/constants.h"
18+
#include "opal/prefetch.h"
1619
#include <pthread.h>
1720

1821
BEGIN_C_DECLS
@@ -33,17 +36,29 @@ typedef struct ompi_wait_sync_t {
3336

3437
#define WAIT_SYNC_RELEASE(sync) \
3538
if (opal_using_threads()) { \
39+
pthread_mutex_lock(&(sync)->lock); \
3640
pthread_cond_destroy(&(sync)->condition); \
41+
pthread_mutex_unlock(&(sync)->lock); \
3742
pthread_mutex_destroy(&(sync)->lock); \
3843
}
3944

40-
#define WAIT_SYNC_SIGNAL(sync) \
45+
#define WAIT_SYNC_LOCK(sync) \
4146
if (opal_using_threads()) { \
42-
pthread_mutex_lock(&(sync->lock)); \
43-
pthread_cond_signal(&sync->condition); \
44-
pthread_mutex_unlock(&(sync->lock)); \
47+
pthread_mutex_lock(&((sync)->lock)); \
4548
}
4649

50+
#define WAIT_SYNC_SIGNAL_UNLOCK(sync) \
51+
if (opal_using_threads()) { \
52+
pthread_cond_signal(&((sync)->condition)); \
53+
pthread_mutex_unlock(&((sync)->lock)); \
54+
}
55+
56+
#define WAIT_SYNC_UNLOCK(sync) \
57+
if (opal_using_threads()) { \
58+
pthread_mutex_unlock(&((sync)->lock)); \
59+
}
60+
61+
4762
OPAL_DECLSPEC int sync_wait_mt(ompi_wait_sync_t *sync);
4863
static inline int sync_wait_st (ompi_wait_sync_t *sync)
4964
{
@@ -75,16 +90,39 @@ static inline int sync_wait_st (ompi_wait_sync_t *sync)
7590
*/
7691
static inline void wait_sync_update(ompi_wait_sync_t *sync, int updates, int status)
7792
{
93+
/* Fast path: if we can decrement the sync->count without
94+
* dropping it to 0 - just return
95+
* Consider that there might be concurrent decrements
96+
*/
97+
if( OPAL_LIKELY(OPAL_SUCCESS == status) ) {
98+
/* we know that our contribution is not yet there
99+
* so we can safely check if the count will still be above 0
100+
* after the change */
101+
while( (sync->count - updates > 0) ){
102+
int tmp = sync->count;
103+
if( OPAL_ATOMIC_CMPSET_32(&sync->count, tmp, tmp - updates) ){
104+
/* fastpath succeeds */
105+
return;
106+
}
107+
}
108+
}
109+
110+
/* Slow path */
111+
WAIT_SYNC_LOCK(sync);
112+
78113
if( OPAL_LIKELY(OPAL_SUCCESS == status) ) {
79114
if( 0 != (OPAL_THREAD_ADD32(&sync->count, -updates)) ) {
80-
return;
115+
goto unlock;
81116
}
82117
} else {
83118
/* this is an error path so just use the atomic */
84119
opal_atomic_swap_32 (&sync->count, 0);
85120
sync->status = OPAL_ERROR;
86121
}
87-
WAIT_SYNC_SIGNAL(sync);
122+
WAIT_SYNC_SIGNAL_UNLOCK(sync);
123+
return;
124+
unlock:
125+
WAIT_SYNC_UNLOCK(sync);
88126
}
89127

90128
END_C_DECLS

0 commit comments

Comments
 (0)