Skip to content

Commit 2a56061

Browse files
bosilcaderbeyn
authored andcommitted
Never allocate a temporary array for the requests. Instead rely on the
module_data to hold one with the largest necessary size. This array is only allocated when needed, and it is released upon communicator destruction. (cherry picked from commit a324602)
1 parent 98ac7a8 commit 2a56061

File tree

8 files changed

+82
-99
lines changed

8 files changed

+82
-99
lines changed

ompi/mca/coll/base/coll_base_alltoall.c

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount,
4242
mca_coll_base_module_t *module)
4343
{
4444
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
45-
int i, j, size, rank, err=MPI_SUCCESS;
45+
int i, j, size, rank, err = MPI_SUCCESS, line;
4646
MPI_Request *preq;
4747
char *tmp_buffer;
4848
size_t max_size;
@@ -80,48 +80,54 @@ mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount,
8080
/* Copy the data into the temporary buffer */
8181
err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
8282
(char *) rbuf + j * max_size);
83-
if (MPI_SUCCESS != err) { goto error_hndl; }
83+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
8484

8585
/* Exchange data with the peer */
8686
err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype,
8787
j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
88-
if (MPI_SUCCESS != err) { goto error_hndl; }
88+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
8989

9090
err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype,
9191
j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
9292
comm, preq++));
93-
if (MPI_SUCCESS != err) { goto error_hndl; }
93+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
9494
} else if (j == rank) {
9595
/* Copy the data into the temporary buffer */
9696
err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
9797
(char *) rbuf + i * max_size);
98-
if (MPI_SUCCESS != err) { goto error_hndl; }
98+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
9999

100100
/* Exchange data with the peer */
101101
err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype,
102102
i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
103-
if (MPI_SUCCESS != err) { goto error_hndl; }
103+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
104104

105105
err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype,
106106
i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
107107
comm, preq++));
108-
if (MPI_SUCCESS != err) { goto error_hndl; }
108+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
109109
} else {
110110
continue;
111111
}
112112

113113
/* Wait for the requests to complete */
114114
err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
115-
if (MPI_SUCCESS != err) { goto error_hndl; }
115+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
116116
}
117117
}
118118

119119
error_hndl:
120120
/* Free the temporary buffer */
121121
free (tmp_buffer);
122122

123-
/* All done */
123+
if( MPI_SUCCESS != err ) {
124+
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
125+
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
126+
rank));
127+
ompi_coll_base_free_reqs(base_module->base_data->mcct_reqs, 2);
128+
}
124129

130+
/* All done */
125131
return err;
126132
}
127133

@@ -385,8 +391,7 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
385391
total_reqs = (((max_outstanding_reqs > (size - 1)) ||
386392
(max_outstanding_reqs <= 0)) ?
387393
(size - 1) : (max_outstanding_reqs));
388-
reqs = (ompi_request_t**) malloc( 2 * total_reqs *
389-
sizeof(ompi_request_t*));
394+
reqs = coll_base_comm_get_reqs(module->base_data, 2 * total_reqs);
390395
if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
391396

392397
prcv = (char *) rbuf;
@@ -458,17 +463,14 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
458463
}
459464
}
460465

461-
/* Free the reqs */
462-
free(reqs);
463-
464466
/* All done */
465467
return MPI_SUCCESS;
466468

467469
error_hndl:
468470
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
469471
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
470472
rank));
471-
if (NULL != reqs) free(reqs);
473+
ompi_coll_base_free_reqs(reqs, 2 * total_reqs);
472474
return error;
473475
}
474476

@@ -554,7 +556,7 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
554556
struct ompi_communicator_t *comm,
555557
mca_coll_base_module_t *module)
556558
{
557-
int i, rank, size, err, nreqs;
559+
int i, rank, size, err, nreqs, line;
558560
char *psnd, *prcv;
559561
MPI_Aint lb, sndinc, rcvinc;
560562
ompi_request_t **req, **sreq, **rreq;
@@ -616,10 +618,7 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
616618
err = MCA_PML_CALL(irecv_init
617619
(prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i,
618620
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
619-
if (MPI_SUCCESS != err) {
620-
ompi_coll_base_free_reqs(req, nreqs);
621-
return err;
622-
}
621+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
623622
}
624623

625624
/* Now post all sends in reverse order
@@ -633,10 +632,7 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
633632
(psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i,
634633
MCA_COLL_BASE_TAG_ALLTOALL,
635634
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
636-
if (MPI_SUCCESS != err) {
637-
ompi_coll_base_free_reqs(req, nreqs);
638-
return err;
639-
}
635+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
640636
}
641637

642638
/* Start your engines. This will never return an error. */
@@ -652,8 +648,13 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
652648

653649
err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);
654650

655-
/* Free the reqs */
656-
ompi_coll_base_free_reqs(req, nreqs);
651+
err_hndl:
652+
if( MPI_SUCCESS != err ) {
653+
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
654+
__FILE__, line, err, rank) );
655+
/* Free the reqs */
656+
ompi_coll_base_free_reqs(req, nreqs);
657+
}
657658

658659
/* All done */
659660
return err;

ompi/mca/coll/base/coll_base_alltoallv.c

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,11 @@ mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts
123123
error_hndl:
124124
/* Free the temporary buffer */
125125
free (tmp_buffer);
126+
if( MPI_SUCCESS != err ) {
127+
ompi_coll_base_free_reqs(base_module->base_data->mcct_reqs, 2 );
128+
}
126129

127130
/* All done */
128-
129131
return err;
130132
}
131133

@@ -253,8 +255,7 @@ ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts
253255
preq++));
254256
++nreqs;
255257
if (MPI_SUCCESS != err) {
256-
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
257-
return err;
258+
goto err_hndl;
258259
}
259260
}
260261

@@ -271,8 +272,7 @@ ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts
271272
preq++));
272273
++nreqs;
273274
if (MPI_SUCCESS != err) {
274-
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
275-
return err;
275+
goto err_hndl;
276276
}
277277
}
278278

@@ -287,9 +287,10 @@ ompi_coll_base_alltoallv_intra_basic_linear(const void *sbuf, const int *scounts
287287
* error after we free everything. */
288288
err = ompi_request_wait_all(nreqs, data->mcct_reqs,
289289
MPI_STATUSES_IGNORE);
290-
291-
/* Free the requests. */
292-
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
290+
err_hndl:
291+
if( MPI_SUCCESS != err ) { /* Free the requests. */
292+
ompi_coll_base_free_reqs(data->mcct_reqs, nreqs);
293+
}
293294

294295
return err;
295296
}

ompi/mca/coll/base/coll_base_barrier.c

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,8 @@ int ompi_coll_base_barrier_intra_two_procs(struct ompi_communicator_t *comm,
324324
int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
325325
mca_coll_base_module_t *module)
326326
{
327-
int i, err, rank, size;
327+
int i, err, rank, size, line;
328+
ompi_request_t** requests = NULL;
328329

329330
rank = ompi_comm_rank(comm);
330331
size = ompi_comm_size(comm);
@@ -334,50 +335,43 @@ int ompi_coll_base_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
334335
err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0,
335336
MCA_COLL_BASE_TAG_BARRIER,
336337
MCA_PML_BASE_SEND_STANDARD, comm));
337-
if (MPI_SUCCESS != err) {
338-
return err;
339-
}
338+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
340339

341340
err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0,
342341
MCA_COLL_BASE_TAG_BARRIER,
343342
comm, MPI_STATUS_IGNORE));
344-
if (MPI_SUCCESS != err) {
345-
return err;
346-
}
343+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
347344
}
348345

349346
/* The root collects and broadcasts the messages. */
350347

351348
else {
352-
ompi_request_t** requests;
353-
354-
requests = (ompi_request_t**)malloc( size * sizeof(ompi_request_t*) );
349+
requests = coll_base_comm_get_reqs(module->base_data, size);
355350
for (i = 1; i < size; ++i) {
356351
err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
357352
MCA_COLL_BASE_TAG_BARRIER, comm,
358353
&(requests[i])));
359-
if (MPI_SUCCESS != err) {
360-
return err;
361-
}
354+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
362355
}
363356
ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
357+
requests = NULL; /* we're done the requests array is clean */
364358

365359
for (i = 1; i < size; ++i) {
366360
err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, i,
367361
MCA_COLL_BASE_TAG_BARRIER,
368362
MCA_PML_BASE_SEND_STANDARD, comm));
369-
if (MPI_SUCCESS != err) {
370-
return err;
371-
}
363+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
372364
}
373-
374-
free( requests );
375365
}
376366

377367
/* All done */
378-
379368
return MPI_SUCCESS;
380-
369+
err_hndl:
370+
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
371+
__FILE__, line, err, rank) );
372+
if( NULL != requests )
373+
ompi_coll_base_free_reqs(requests, size-1);
374+
return err;
381375
}
382376
/* copied function (with appropriate renaming) ends here */
383377

ompi/mca/coll/base/coll_base_bcast.c

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,7 @@ ompi_coll_base_bcast_intra_generic( void* buffer,
6666

6767
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
6868
if( tree->tree_nextsize != 0 ) {
69-
send_reqs = (ompi_request_t**)malloc( (ptrdiff_t)tree->tree_nextsize *
70-
sizeof(ompi_request_t*) );
69+
send_reqs = coll_base_comm_get_reqs(module->base_data, tree->tree_nextsize);
7170
}
7271
#endif
7372

@@ -236,19 +235,16 @@ ompi_coll_base_bcast_intra_generic( void* buffer,
236235
if (err != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
237236
}
238237

239-
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
240-
if( NULL != send_reqs ) free(send_reqs);
241-
#endif
242-
243238
return (MPI_SUCCESS);
244239

245240
error_hndl:
246241
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
247242
__FILE__, line, err, rank) );
248-
#if !defined(COLL_BASE_BCAST_USE_BLOCKING)
249-
if( NULL != send_reqs ) free(send_reqs);
250-
#endif
251-
return (err);
243+
if( (MPI_SUCCESS != err) && (NULL != send_reqs) ) {
244+
ompi_coll_base_free_reqs( send_reqs, tree->tree_nextsize);
245+
}
246+
247+
return err;
252248
}
253249

254250
int
@@ -665,10 +661,7 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count,
665661
MCA_COLL_BASE_TAG_BCAST,
666662
MCA_PML_BASE_SEND_STANDARD,
667663
comm, preq++));
668-
if (MPI_SUCCESS != err) {
669-
ompi_coll_base_free_reqs(data->mcct_reqs, i);
670-
return err;
671-
}
664+
if (MPI_SUCCESS != err) { goto err_hndl; }
672665
}
673666
--i;
674667

@@ -684,9 +677,10 @@ ompi_coll_base_bcast_intra_basic_linear(void *buff, int count,
684677
* the error after we free everything. */
685678

686679
err = ompi_request_wait_all(i, reqs, MPI_STATUSES_IGNORE);
687-
688-
/* Free the reqs */
689-
ompi_coll_base_free_reqs(reqs, i);
680+
err_hndl:
681+
if( MPI_SUCCESS != err ) { /* Free the reqs */
682+
ompi_coll_base_free_reqs(reqs, i);
683+
}
690684

691685
/* All done */
692686
return err;

ompi/mca/coll/base/coll_base_frame.c

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,7 @@ static void
8383
coll_base_comm_destruct(mca_coll_base_comm_t *data)
8484
{
8585
if( NULL != data->mcct_reqs ) {
86-
for( int i = 0; i < data->mcct_num_reqs; ++i ) {
87-
if( MPI_REQUEST_NULL != data->mcct_reqs[i] )
88-
ompi_request_free(&data->mcct_reqs[i]);
89-
}
86+
ompi_coll_base_free_reqs( data->mcct_reqs, data->mcct_num_reqs );
9087
free(data->mcct_reqs);
9188
data->mcct_reqs = NULL;
9289
data->mcct_num_reqs = 0;
@@ -122,18 +119,13 @@ OBJ_CLASS_INSTANCE(mca_coll_base_comm_t, opal_object_t,
122119

123120
ompi_request_t** coll_base_comm_get_reqs(mca_coll_base_comm_t* data, int nreqs)
124121
{
125-
int startfrom = data->mcct_num_reqs;
126-
127-
if( NULL == data->mcct_reqs ) {
128-
assert(0 == data->mcct_num_reqs);
129-
data->mcct_reqs = (ompi_request_t**)malloc(sizeof(ompi_request_t*) * nreqs);
130-
} else if( data->mcct_num_reqs <= nreqs ) {
122+
if( data->mcct_num_reqs <= nreqs ) {
131123
data->mcct_reqs = (ompi_request_t**)realloc(data->mcct_reqs, sizeof(ompi_request_t*) * nreqs);
132124
}
133125
if( NULL != data->mcct_reqs ) {
134-
data->mcct_num_reqs = nreqs;
135-
for( int i = startfrom; i < data->mcct_num_reqs; i++ )
126+
for( int i = data->mcct_num_reqs; i < nreqs; i++ )
136127
data->mcct_reqs[i] = MPI_REQUEST_NULL;
128+
data->mcct_num_reqs = nreqs;
137129
} else
138130
data->mcct_num_reqs = 0; /* nothing to return */
139131
return data->mcct_reqs;

ompi/mca/coll/base/coll_base_functions.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,11 +343,19 @@ struct mca_coll_base_comm_t {
343343
typedef struct mca_coll_base_comm_t mca_coll_base_comm_t;
344344
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(mca_coll_base_comm_t);
345345

346+
/**
347+
* Free all requests in an array. As these requests are usually used during
348+
* collective communications, and as on a succesful collective they are
349+
* expected to be released during the corresponding wait, the array should
350+
* generally be empty. However, this function might be used on error conditions
351+
* where it will allow a correct cleanup.
352+
*/
346353
static inline void ompi_coll_base_free_reqs(ompi_request_t **reqs, int count)
347354
{
348355
int i;
349356
for (i = 0; i < count; ++i)
350-
ompi_request_free(&reqs[i]);
357+
if( MPI_REQUEST_NULL != reqs[i] )
358+
ompi_request_free(&reqs[i]);
351359
}
352360

353361
/**

0 commit comments

Comments
 (0)