Skip to content

Commit 7bd8940

Browse files
bosilcaderbeyn
authored andcommitted
Update the basic module to dynamically allocate the right
number of requests. Remove unnecessary fields.We don't need these fields. (cherry picked from commit 01b32ca)
1 parent 2a56061 commit 7bd8940

19 files changed

+356
-317
lines changed

ompi/mca/coll/base/coll_base_frame.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,9 +119,11 @@ OBJ_CLASS_INSTANCE(mca_coll_base_comm_t, opal_object_t,
119119

120120
ompi_request_t** coll_base_comm_get_reqs(mca_coll_base_comm_t* data, int nreqs)
121121
{
122-
if( data->mcct_num_reqs <= nreqs ) {
122+
if( 0 == nreqs ) return NULL;
123+
124+
if( data->mcct_num_reqs <= nreqs )
123125
data->mcct_reqs = (ompi_request_t**)realloc(data->mcct_reqs, sizeof(ompi_request_t*) * nreqs);
124-
}
126+
125127
if( NULL != data->mcct_reqs ) {
126128
for( int i = data->mcct_num_reqs; i < nreqs; i++ )
127129
data->mcct_reqs[i] = MPI_REQUEST_NULL;

ompi/mca/coll/base/coll_base_functions.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -353,9 +353,11 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t);
353353
static inline void ompi_coll_base_free_reqs(ompi_request_t **reqs, int count)
354354
{
355355
int i;
356-
for (i = 0; i < count; ++i)
357-
if( MPI_REQUEST_NULL != reqs[i] )
356+
for (i = 0; i < count; ++i) {
357+
if( MPI_REQUEST_NULL != reqs[i] ) {
358358
ompi_request_free(&reqs[i]);
359+
}
360+
}
359361
}
360362

361363
/**

ompi/mca/coll/basic/coll_basic.h

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -283,17 +283,6 @@ BEGIN_C_DECLS
283283
int mca_coll_basic_ft_event(int status);
284284

285285

286-
/* Utility functions */
287-
288-
static inline void mca_coll_basic_free_reqs(ompi_request_t ** reqs,
289-
int count)
290-
{
291-
int i;
292-
for (i = 0; i < count; ++i)
293-
ompi_request_free(&reqs[i]);
294-
}
295-
296-
297286
struct mca_coll_basic_module_t {
298287
mca_coll_base_module_t super;
299288

@@ -303,6 +292,23 @@ struct mca_coll_basic_module_t {
303292
typedef struct mca_coll_basic_module_t mca_coll_basic_module_t;
304293
OBJ_CLASS_DECLARATION(mca_coll_basic_module_t);
305294

295+
/* Utility functions */
296+
297+
static inline void mca_coll_basic_free_reqs(ompi_request_t ** reqs, int count)
298+
{
299+
int i;
300+
for (i = 0; i < count; ++i)
301+
if( MPI_REQUEST_NULL != reqs[i] ) {
302+
ompi_request_free(&reqs[i]);
303+
}
304+
}
305+
306+
/**
307+
* Return the array of requests on the data. If the array was not initialized
308+
* or if it's size was too small, allocate it to fit the requested size.
309+
*/
310+
ompi_request_t** mca_coll_basic_get_reqs(mca_coll_basic_module_t* data, int nreqs);
311+
306312
END_C_DECLS
307313

308314
#endif /* MCA_COLL_BASIC_EXPORT_H */

ompi/mca/coll/basic/coll_basic_allgather.c

Lines changed: 26 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,12 @@ mca_coll_basic_allgather_inter(const void *sbuf, int scount,
4747
struct ompi_communicator_t *comm,
4848
mca_coll_base_module_t *module)
4949
{
50-
int rank, root = 0, size, rsize, err, i;
50+
int rank, root = 0, size, rsize, err, i, line;
5151
char *tmpbuf = NULL, *ptmp;
5252
ptrdiff_t rlb, slb, rextent, sextent, incr;
5353
ompi_request_t *req;
5454
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
55-
ompi_request_t **reqs = basic_module->mccb_reqs;
55+
ompi_request_t **reqs = NULL;
5656

5757
rank = ompi_comm_rank(comm);
5858
size = ompi_comm_size(comm);
@@ -71,91 +71,71 @@ mca_coll_basic_allgather_inter(const void *sbuf, int scount,
7171
err = MCA_PML_CALL(send(sbuf, scount, sdtype, root,
7272
MCA_COLL_BASE_TAG_ALLGATHER,
7373
MCA_PML_BASE_SEND_STANDARD, comm));
74-
if (OMPI_SUCCESS != err) {
75-
return err;
76-
}
74+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
7775
} else {
7876
/* receive a msg. from all other procs. */
7977
err = ompi_datatype_get_extent(rdtype, &rlb, &rextent);
80-
if (OMPI_SUCCESS != err) {
81-
return err;
82-
}
78+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
8379
err = ompi_datatype_get_extent(sdtype, &slb, &sextent);
84-
if (OMPI_SUCCESS != err) {
85-
return err;
86-
}
80+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
81+
82+
/* Get a requests arrays of the right size */
83+
reqs = mca_coll_basic_get_reqs(basic_module, rsize + 1);
84+
if( NULL == reqs ) { line = __LINE__; goto exit; }
8785

8886
/* Do a send-recv between the two root procs. to avoid deadlock */
8987
err = MCA_PML_CALL(isend(sbuf, scount, sdtype, 0,
9088
MCA_COLL_BASE_TAG_ALLGATHER,
9189
MCA_PML_BASE_SEND_STANDARD,
9290
comm, &reqs[rsize]));
93-
if (OMPI_SUCCESS != err) {
94-
return err;
95-
}
91+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
9692

9793
err = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, 0,
9894
MCA_COLL_BASE_TAG_ALLGATHER, comm,
9995
&reqs[0]));
100-
if (OMPI_SUCCESS != err) {
101-
return err;
102-
}
96+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
10397

10498
incr = rextent * rcount;
10599
ptmp = (char *) rbuf + incr;
106100
for (i = 1; i < rsize; ++i, ptmp += incr) {
107101
err = MCA_PML_CALL(irecv(ptmp, rcount, rdtype, i,
108102
MCA_COLL_BASE_TAG_ALLGATHER,
109103
comm, &reqs[i]));
110-
if (MPI_SUCCESS != err) {
111-
return err;
112-
}
104+
if (MPI_SUCCESS != err) { line = __LINE__; goto exit; }
113105
}
114106

115107
err = ompi_request_wait_all(rsize + 1, reqs, MPI_STATUSES_IGNORE);
116-
if (OMPI_SUCCESS != err) {
117-
return err;
118-
}
108+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
119109

120110
/* Step 2: exchange the resuts between the root processes */
121111
tmpbuf = (char *) malloc(scount * size * sextent);
122-
if (NULL == tmpbuf) {
123-
return err;
124-
}
112+
if (NULL == tmpbuf) { line = __LINE__; goto exit; }
125113

126114
err = MCA_PML_CALL(isend(rbuf, rsize * rcount, rdtype, 0,
127115
MCA_COLL_BASE_TAG_ALLGATHER,
128116
MCA_PML_BASE_SEND_STANDARD, comm, &req));
129-
if (OMPI_SUCCESS != err) {
130-
goto exit;
131-
}
117+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
132118

133119
err = MCA_PML_CALL(recv(tmpbuf, size * scount, sdtype, 0,
134120
MCA_COLL_BASE_TAG_ALLGATHER, comm,
135121
MPI_STATUS_IGNORE));
136-
if (OMPI_SUCCESS != err) {
137-
goto exit;
138-
}
122+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
139123

140124
err = ompi_request_wait( &req, MPI_STATUS_IGNORE);
141-
if (OMPI_SUCCESS != err) {
142-
goto exit;
143-
}
125+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
144126
}
145127

146128

147129
/* Step 3: bcast the data to the remote group. This
148-
* happens in both groups simultaniously, thus we can
130+
* happens in both groups simultaneously, thus we can
149131
* not use coll_bcast (this would deadlock).
150132
*/
151133
if (rank != root) {
152134
/* post the recv */
153135
err = MCA_PML_CALL(recv(rbuf, rsize * rcount, rdtype, 0,
154136
MCA_COLL_BASE_TAG_ALLGATHER, comm,
155137
MPI_STATUS_IGNORE));
156-
if (OMPI_SUCCESS != err) {
157-
goto exit;
158-
}
138+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
159139

160140
} else {
161141
/* Send the data to every other process in the remote group
@@ -165,19 +145,19 @@ mca_coll_basic_allgather_inter(const void *sbuf, int scount,
165145
MCA_COLL_BASE_TAG_ALLGATHER,
166146
MCA_PML_BASE_SEND_STANDARD,
167147
comm, &reqs[i - 1]));
168-
if (OMPI_SUCCESS != err) {
169-
goto exit;
170-
}
171-
148+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
172149
}
173150

174151
err = ompi_request_wait_all(rsize - 1, reqs, MPI_STATUSES_IGNORE);
175-
if (OMPI_SUCCESS != err) {
176-
goto exit;
177-
}
152+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
178153
}
179154

180155
exit:
156+
if( MPI_SUCCESS != err ) {
157+
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
158+
__FILE__, line, err, rank) );
159+
if( NULL != reqs ) mca_coll_basic_free_reqs(reqs, rsize+1);
160+
}
181161
if (NULL != tmpbuf) {
182162
free(tmpbuf);
183163
}

ompi/mca/coll/basic/coll_basic_allreduce.c

Lines changed: 23 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2014 The University of Tennessee and The University
5+
* Copyright (c) 2004-2015 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -80,13 +80,13 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count,
8080
struct ompi_communicator_t *comm,
8181
mca_coll_base_module_t *module)
8282
{
83-
int err, i, rank, root = 0, rsize;
83+
int err, i, rank, root = 0, rsize, line;
8484
ptrdiff_t lb, extent;
8585
ptrdiff_t true_lb, true_extent;
8686
char *tmpbuf = NULL, *pml_buffer = NULL;
8787
ompi_request_t *req[2];
8888
mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
89-
ompi_request_t **reqs = basic_module->mccb_reqs;
89+
ompi_request_t **reqs = NULL;
9090

9191
rank = ompi_comm_rank(comm);
9292
rsize = ompi_comm_remote_size(comm);
@@ -111,41 +111,33 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count,
111111
}
112112

113113
tmpbuf = (char *) malloc(true_extent + (count - 1) * extent);
114-
if (NULL == tmpbuf) {
115-
return OMPI_ERR_OUT_OF_RESOURCE;
116-
}
114+
if (NULL == tmpbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; }
117115
pml_buffer = tmpbuf - true_lb;
118116

117+
reqs = mca_coll_basic_get_reqs(basic_module, rsize - 1);
118+
if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto exit; }
119+
119120
/* Do a send-recv between the two root procs. to avoid deadlock */
120121
err = MCA_PML_CALL(irecv(rbuf, count, dtype, 0,
121122
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
122123
&(req[0])));
123-
if (OMPI_SUCCESS != err) {
124-
goto exit;
125-
}
124+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
126125

127126
err = MCA_PML_CALL(isend(sbuf, count, dtype, 0,
128127
MCA_COLL_BASE_TAG_ALLREDUCE,
129128
MCA_PML_BASE_SEND_STANDARD,
130129
comm, &(req[1])));
131-
if (OMPI_SUCCESS != err) {
132-
goto exit;
133-
}
130+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
134131

135132
err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
136-
if (OMPI_SUCCESS != err) {
137-
goto exit;
138-
}
139-
133+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
140134

141135
/* Loop receiving and calling reduction function (C or Fortran). */
142136
for (i = 1; i < rsize; i++) {
143137
err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i,
144138
MCA_COLL_BASE_TAG_ALLREDUCE, comm,
145139
MPI_STATUS_IGNORE));
146-
if (MPI_SUCCESS != err) {
147-
goto exit;
148-
}
140+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
149141

150142
/* Perform the reduction */
151143
ompi_op_reduce(op, pml_buffer, rbuf, count, dtype);
@@ -155,9 +147,7 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count,
155147
err = MCA_PML_CALL(send(sbuf, count, dtype, root,
156148
MCA_COLL_BASE_TAG_ALLREDUCE,
157149
MCA_PML_BASE_SEND_STANDARD, comm));
158-
if (OMPI_SUCCESS != err) {
159-
goto exit;
160-
}
150+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
161151
}
162152

163153

@@ -171,21 +161,16 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count,
171161
err = MCA_PML_CALL(irecv(pml_buffer, count, dtype, 0,
172162
MCA_COLL_BASE_TAG_ALLREDUCE,
173163
comm, &(req[1])));
174-
if (OMPI_SUCCESS != err) {
175-
goto exit;
176-
}
164+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
177165

178166
err = MCA_PML_CALL(isend(rbuf, count, dtype, 0,
179167
MCA_COLL_BASE_TAG_ALLREDUCE,
180168
MCA_PML_BASE_SEND_STANDARD, comm,
181169
&(req[0])));
182-
if (OMPI_SUCCESS != err) {
183-
goto exit;
184-
}
170+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
171+
185172
err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
186-
if (OMPI_SUCCESS != err) {
187-
goto exit;
188-
}
173+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
189174

190175
/* distribute the data to other processes in remote group.
191176
* Note that we start from 1 (not from zero), since zero
@@ -198,17 +183,13 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count,
198183
MCA_COLL_BASE_TAG_ALLREDUCE,
199184
MCA_PML_BASE_SEND_STANDARD, comm,
200185
&reqs[i - 1]));
201-
if (OMPI_SUCCESS != err) {
202-
goto exit;
203-
}
186+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
204187
}
205188

206189
err =
207190
ompi_request_wait_all(rsize - 1, reqs,
208191
MPI_STATUSES_IGNORE);
209-
if (OMPI_SUCCESS != err) {
210-
goto exit;
211-
}
192+
if (OMPI_SUCCESS != err) { line = __LINE__; goto exit; }
212193
}
213194
} else {
214195
err = MCA_PML_CALL(recv(rbuf, count, dtype, root,
@@ -217,10 +198,14 @@ mca_coll_basic_allreduce_inter(const void *sbuf, void *rbuf, int count,
217198
}
218199

219200
exit:
201+
if( MPI_SUCCESS != err ) {
202+
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d", __FILE__,
203+
line, err, rank));
204+
mca_coll_basic_free_reqs(reqs, rsize - 1);
205+
}
220206
if (NULL != tmpbuf) {
221207
free(tmpbuf);
222208
}
223209

224-
225210
return err;
226211
}

0 commit comments

Comments
 (0)