Skip to content

Commit e2e6592

Browse files
bosilca authored and hppritcha committed
Fix the random errors related to the recursive sends and receives
identified by Fujitsu. (cherry picked from commit open-mpi/ompi@01d8e23)
1 parent 955c73b commit e2e6592

File tree

2 files changed

+29
-9
lines changed

2 files changed

+29
-9
lines changed

ompi/mca/pml/ob1/pml_ob1_irecv.c

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2014 The University of Tennessee and The University
6+
* Copyright (c) 2004-2015 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -29,6 +29,13 @@
2929
#include "ompi/peruse/peruse-internal.h"
3030
#include "ompi/message/message.h"
3131

32+
/**
33+
* Single usage request. As we allow recursive calls to recv
34+
* (from the request completion callback), we cannot rely on
35+
* using a global request. Thus, once a recv acquires ownership
36+
* of this global request, it should set it to NULL to prevent
37+
* the reuse until the first user completes.
38+
*/
3239
mca_pml_ob1_recv_request_t *mca_pml_ob1_recvreq = NULL;
3340

3441
int mca_pml_ob1_irecv_init(void *addr,
@@ -96,15 +103,13 @@ int mca_pml_ob1_recv(void *addr,
96103

97104
#if !OMPI_ENABLE_THREAD_MULTIPLE
98105
recvreq = mca_pml_ob1_recvreq;
106+
mca_pml_ob1_recvreq = NULL;
99107
if( OPAL_UNLIKELY(NULL == recvreq) )
100108
#endif /* !OMPI_ENABLE_THREAD_MULTIPLE */
101109
{
102110
MCA_PML_OB1_RECV_REQUEST_ALLOC(recvreq);
103111
if (NULL == recvreq)
104112
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
105-
#if !OMPI_ENABLE_THREAD_MULTIPLE
106-
mca_pml_ob1_recvreq = recvreq;
107-
#endif /* !OMPI_ENABLE_THREAD_MULTIPLE */
108113
}
109114

110115
MCA_PML_OB1_RECV_REQUEST_INIT(recvreq, addr, count, datatype,
@@ -126,7 +131,12 @@ int mca_pml_ob1_recv(void *addr,
126131
#if OMPI_ENABLE_THREAD_MULTIPLE
127132
MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq);
128133
#else
129-
mca_pml_ob1_recv_request_fini (recvreq);
134+
if( NULL != mca_pml_ob1_recvreq ) {
135+
MCA_PML_OB1_RECV_REQUEST_RETURN(recvreq);
136+
} else {
137+
mca_pml_ob1_recv_request_fini (recvreq);
138+
mca_pml_ob1_recvreq = recvreq;
139+
}
130140
#endif
131141

132142
return rc;

ompi/mca/pml/ob1/pml_ob1_isend.c

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@
2929
#include "pml_ob1_recvreq.h"
3030
#include "ompi/peruse/peruse-internal.h"
3131

32+
/**
33+
* Single usage request. As we allow recursive calls (for
34+
* example, from the request completion callback), we cannot rely
35+
* on using a global request. Thus, once a send acquires ownership
36+
* of this global request, it should set it to NULL to prevent
37+
* the reuse until the first user completes.
38+
*/
3239
mca_pml_ob1_send_request_t *mca_pml_ob1_sendreq = NULL;
3340

3441
int mca_pml_ob1_isend_init(const void *buf,
@@ -219,15 +226,13 @@ int mca_pml_ob1_send(const void *buf,
219226

220227
#if !OMPI_ENABLE_THREAD_MULTIPLE
221228
sendreq = mca_pml_ob1_sendreq;
229+
mca_pml_ob1_sendreq = NULL;
222230
if( OPAL_UNLIKELY(NULL == sendreq) )
223231
#endif /* !OMPI_ENABLE_THREAD_MULTIPLE */
224232
{
225233
MCA_PML_OB1_SEND_REQUEST_ALLOC(comm, dst, sendreq);
226234
if (NULL == sendreq)
227235
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
228-
#if !OMPI_ENABLE_THREAD_MULTIPLE
229-
mca_pml_ob1_sendreq = sendreq;
230-
#endif /* !OMPI_ENABLE_THREAD_MULTIPLE */
231236
}
232237
sendreq->req_send.req_base.req_proc = dst_proc;
233238
sendreq->rdma_frag = NULL;
@@ -253,7 +258,12 @@ int mca_pml_ob1_send(const void *buf,
253258
#if OMPI_ENABLE_THREAD_MULTIPLE
254259
MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq);
255260
#else
256-
mca_pml_ob1_send_request_fini (sendreq);
261+
if( NULL != mca_pml_ob1_sendreq ) {
262+
MCA_PML_OB1_SEND_REQUEST_RETURN(sendreq);
263+
} else {
264+
mca_pml_ob1_send_request_fini (sendreq);
265+
mca_pml_ob1_sendreq = sendreq;
266+
}
257267
#endif
258268

259269
return rc;

0 commit comments

Comments
 (0)