Skip to content

Commit 2c008de

Browse files
committed
Merge pull request open-mpi#554 from hjelmn/v2.x_openib_thread_fixes
v2.x openib thread fixes
2 parents 86b8105 + 0209235 commit 2c008de

File tree

3 files changed

+35
-21
lines changed

3 files changed

+35
-21
lines changed

opal/mca/btl/openib/btl_openib_component.c

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2534,12 +2534,6 @@ btl_openib_component_init(int *num_btl_modules,
25342534
malloc_hook_set = true;
25352535
}
25362536
#endif
2537-
/* Currently refuse to run if MPI_THREAD_MULTIPLE is enabled */
2538-
if (enable_mpi_threads && !mca_btl_base_thread_multiple_override) {
2539-
opal_output_verbose(5, opal_btl_base_framework.framework_output,
2540-
"btl:openib: MPI_THREAD_MULTIPLE not suppported; skipping this component");
2541-
goto no_btls;
2542-
}
25432537

25442538
/* Per https://svn.open-mpi.org/trac/ompi/ticket/1305, check to
25452539
see if $sysfsdir/class/infiniband exists. If it does not,

opal/mca/btl/openib/btl_openib_eager_rdma.h

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
12
/*
23
* Copyright (c) 2006-2007 Voltaire All rights reserved.
4+
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
5+
* reserved.
36
* $COPYRIGHT$
47
*
58
* Additional copyrights may follow
@@ -81,17 +84,33 @@ typedef struct mca_btl_openib_eager_rdma_remote_t mca_btl_openib_eager_rdma_remo
8184
mca_btl_openib_component.eager_rdma_num) \
8285
(I) = 0; \
8386
} while (0)
84-
#define MCA_BTL_OPENIB_RDMA_MOVE_INDEX(HEAD, OLD_HEAD) \
85-
do { \
86-
int32_t new_head; \
87-
do { \
88-
OLD_HEAD = HEAD; \
89-
new_head = OLD_HEAD + 1; \
90-
if(new_head == mca_btl_openib_component.eager_rdma_num) \
91-
new_head = 0; \
92-
} while(!OPAL_ATOMIC_CMPSET_32(&HEAD, OLD_HEAD, new_head)); \
87+
88+
89+
#if OPAL_ENABLE_DEBUG
90+
91+
/**
92+
* @brief read and increment the remote head index and generate a sequence
93+
* number
94+
*/
95+
96+
#define MCA_BTL_OPENIB_RDMA_MOVE_INDEX(HEAD, OLD_HEAD, SEQ) \
97+
do { \
98+
(SEQ) = OPAL_THREAD_ADD32(&(HEAD), 1) - 1; \
99+
(OLD_HEAD) = (SEQ) % mca_btl_openib_component.eager_rdma_num; \
93100
} while(0)
94101

102+
#else
103+
104+
/**
105+
* @brief read and increment the remote head index
106+
*/
107+
108+
#define MCA_BTL_OPENIB_RDMA_MOVE_INDEX(HEAD, OLD_HEAD) \
109+
do { \
110+
(OLD_HEAD) = (OPAL_THREAD_ADD32(&(HEAD), 1) - 1) % mca_btl_openib_component.eager_rdma_num; \
111+
} while(0)
112+
113+
#endif
95114

96115
END_C_DECLS
97116
#endif

opal/mca/btl/openib/btl_openib_endpoint.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -569,17 +569,18 @@ static inline int post_send(mca_btl_openib_endpoint_t *ep,
569569
MCA_BTL_OPENIB_RDMA_FRAG_SET_SIZE(ftr, sg->length);
570570
MCA_BTL_OPENIB_RDMA_MAKE_LOCAL(ftr);
571571
#if OPAL_ENABLE_DEBUG
572-
do {
573-
ftr->seq = ep->eager_rdma_remote.seq;
574-
} while (!OPAL_ATOMIC_CMPSET_32((int32_t*) &ep->eager_rdma_remote.seq,
575-
(int32_t) ftr->seq,
576-
(int32_t) (ftr->seq+1)));
572+
/* NTH: generate the sequence from the remote head index to ensure that the
573+
* wrong sequence isn't set. The way this code used to look the sequence number
574+
* and head were updated independently and it led to false positives for incorrect
575+
* sequence numbers. */
576+
MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head, ftr->seq);
577+
#else
578+
MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head);
577579
#endif
578580
if(ep->nbo)
579581
BTL_OPENIB_FOOTER_HTON(*ftr);
580582

581583
sr_desc->wr.rdma.rkey = ep->eager_rdma_remote.rkey;
582-
MCA_BTL_OPENIB_RDMA_MOVE_INDEX(ep->eager_rdma_remote.head, head);
583584
#if BTL_OPENIB_FAILOVER_ENABLED
584585
/* frag->ftr is unused on the sending fragment, so use it
585586
* to indicate it is an eager fragment. A non-zero value

0 commit comments

Comments
 (0)