Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit bdf9f95

Browse files
authored
Merge pull request #1273 from ggouaillardet/topic/v2.x/pending-in_coll
v2.x: Do not return MPI_ERR_PENDING from collectives
2 parents f7efc39 + 04cc528 commit bdf9f95

File tree

2 files changed: 48 additions and 80 deletions.

ompi/mca/coll/base/coll_base_barrier.c

Lines changed: 31 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2015 The University of Tennessee and The University
6+
* Copyright (c) 2004-2016 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -41,52 +41,40 @@
4141
* signal a two peer synchronization.
4242
*/
4343
static inline int
44-
ompi_coll_base_sendrecv_zero(int dest, int stag,
44+
ompi_coll_base_sendrecv_zero( int dest, int stag,
4545
int source, int rtag,
46-
MPI_Comm comm)
46+
MPI_Comm comm )
4747

4848
{
49-
int err, line = 0;
50-
ompi_request_t* reqs[2];
51-
ompi_status_public_t statuses[2];
49+
int rc, line = 0;
50+
ompi_request_t *req = MPI_REQUEST_NULL;
51+
ompi_status_public_t status;
5252

5353
/* post new irecv */
54-
err = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag,
55-
comm, &reqs[0]));
56-
if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
54+
rc = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag,
55+
comm, &req ));
56+
if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }
5757

5858
/* send data to children */
59-
err = MCA_PML_CALL(isend( NULL, 0, MPI_BYTE, dest, stag,
60-
MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1]));
61-
if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
62-
63-
err = ompi_request_wait_all( 2, reqs, statuses );
64-
if( MPI_ERR_IN_STATUS == err ) {
65-
/* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error
66-
* code that we can propagate up the stack. Instead, look for the real
67-
* error code from the MPI_ERROR in the status.
68-
*/
69-
int err_index = 0;
70-
if( MPI_SUCCESS == statuses[0].MPI_ERROR ) {
71-
err_index = 1;
72-
}
73-
err = statuses[err_index].MPI_ERROR;
74-
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
75-
" stage of ompi_coll_base_sendrecv_zero\n",
76-
__FILE__, line, err, (0 == err_index ? "receive" : "send")));
77-
return err;
78-
}
79-
if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
59+
rc = MCA_PML_CALL(send( NULL, 0, MPI_BYTE, dest, stag,
60+
MCA_PML_BASE_SEND_STANDARD, comm ));
61+
if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }
62+
63+
rc = ompi_request_wait( &req, &status );
64+
if( MPI_SUCCESS != rc ) { line = __LINE__; goto error_handler; }
8065

8166
return (MPI_SUCCESS);
8267

8368
error_handler:
84-
/* Error discovered during the posting of the irecv or isend,
85-
* and no status is available.
86-
*/
69+
if( MPI_REQUEST_NULL != req ) { /* cancel and complete the receive request */
70+
(void)ompi_request_cancel(req);
71+
(void)ompi_request_wait(&req, &status);
72+
}
73+
8774
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
88-
__FILE__, line, err));
89-
return err;
75+
__FILE__, line, rc));
76+
(void)line; // silence compiler warning
77+
return rc;
9078
}
9179

9280
/*
@@ -198,8 +186,8 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c
198186
/* send message to lower ranked node */
199187
remote = rank - adjsize;
200188
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
201-
remote, MCA_COLL_BASE_TAG_BARRIER,
202-
comm);
189+
remote, MCA_COLL_BASE_TAG_BARRIER,
190+
comm);
203191
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
204192

205193
} else if (rank < (size - adjsize)) {
@@ -223,8 +211,8 @@ int ompi_coll_base_barrier_intra_recursivedoubling(struct ompi_communicator_t *c
223211

224212
/* post receive from the remote node */
225213
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
226-
remote, MCA_COLL_BASE_TAG_BARRIER,
227-
comm);
214+
remote, MCA_COLL_BASE_TAG_BARRIER,
215+
comm);
228216
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
229217
}
230218
}
@@ -272,8 +260,8 @@ int ompi_coll_base_barrier_intra_bruck(struct ompi_communicator_t *comm,
272260

273261
/* send message to lower ranked node */
274262
err = ompi_coll_base_sendrecv_zero(to, MCA_COLL_BASE_TAG_BARRIER,
275-
from, MCA_COLL_BASE_TAG_BARRIER,
276-
comm);
263+
from, MCA_COLL_BASE_TAG_BARRIER,
264+
comm);
277265
if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;}
278266
}
279267

@@ -306,8 +294,8 @@ int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
306294
remote = (remote + 1) & 0x1;
307295

308296
err = ompi_coll_base_sendrecv_zero(remote, MCA_COLL_BASE_TAG_BARRIER,
309-
remote, MCA_COLL_BASE_TAG_BARRIER,
310-
comm);
297+
remote, MCA_COLL_BASE_TAG_BARRIER,
298+
comm);
311299
return (err);
312300
}
313301

ompi/mca/coll/base/coll_base_util.c

Lines changed: 17 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -2,14 +2,14 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2015 The University of Tennessee and The University
5+
* Copyright (c) 2004-2016 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12-
* Copyright (c) 2014-2015 Research Organization for Information Science
12+
* Copyright (c) 2014-2016 Research Organization for Information Science
1313
* and Technology (RIST). All rights reserved.
1414
* $COPYRIGHT$
1515
*
@@ -39,53 +39,33 @@ int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
3939
ompi_status_public_t* status )
4040

4141
{ /* post receive first, then send, then waitall... should be fast (I hope) */
42-
int err, line = 0, nreqs = 0;
43-
size_t typesize;
44-
ompi_request_t* reqs[2], **req = reqs;
45-
ompi_status_public_t statuses[2];
42+
int err, line = 0;
43+
size_t rtypesize, stypesize;
44+
ompi_request_t *req;
45+
ompi_status_public_t rstatus;
4646

4747
/* post new irecv */
48-
ompi_datatype_type_size(rdatatype, &typesize);
49-
if (0 != rcount && 0 != typesize) {
48+
ompi_datatype_type_size(rdatatype, &rtypesize);
49+
if (0 != rcount && 0 != rtypesize) {
5050
err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag,
51-
comm, req++));
52-
++nreqs;
51+
comm, &req));
5352
if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
5453
}
5554

5655
/* send data to children */
57-
ompi_datatype_type_size(sdatatype, &typesize);
58-
if (0 != scount && 0 != typesize) {
59-
err = MCA_PML_CALL(isend( sendbuf, scount, sdatatype, dest, stag,
60-
MCA_PML_BASE_SEND_STANDARD, comm, req++));
61-
++nreqs;
56+
ompi_datatype_type_size(sdatatype, &stypesize);
57+
if (0 != scount && 0 != stypesize) {
58+
err = MCA_PML_CALL(send( sendbuf, scount, sdatatype, dest, stag,
59+
MCA_PML_BASE_SEND_STANDARD, comm));
6260
if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
6361
}
6462

65-
if (0 != nreqs) {
66-
err = ompi_request_wait_all( nreqs, reqs, statuses );
67-
if( MPI_ERR_IN_STATUS == err ) {
68-
/* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error
69-
* code that we can propagate up the stack. Instead, look for the real
70-
* error code from the MPI_ERROR in the status.
71-
*/
72-
int err_index = 0;
73-
if( MPI_SUCCESS == statuses[0].MPI_ERROR ) {
74-
err_index = 1;
75-
}
76-
if (MPI_STATUS_IGNORE != status) {
77-
*status = statuses[err_index];
78-
}
79-
err = statuses[err_index].MPI_ERROR;
80-
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
81-
" stage of ompi_coll_base_sendrecv_zero\n",
82-
__FILE__, line, err, (0 == err_index ? "receive" : "send")));
83-
return err;
84-
}
63+
if (0 != rcount && 0 != rtypesize) {
64+
err = ompi_request_wait( &req, &rstatus);
8565
if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
8666

8767
if (MPI_STATUS_IGNORE != status) {
88-
*status = statuses[0];
68+
*status = rstatus;
8969
}
9070
} else {
9171
if( MPI_STATUS_IGNORE != status )
@@ -95,7 +75,7 @@ int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
9575
return (MPI_SUCCESS);
9676

9777
error_handler:
98-
/* Error discovered during the posting of the irecv or isend,
78+
/* Error discovered during the posting of the irecv or send,
9979
* and no status is available.
10080
*/
10181
OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",

0 commit comments

Comments
 (0)