Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit e7f909e

Browse files
committed
Merge pull request #1015 from derbeyn/coll_pr
Fix a segfault occurring in IMB Alltoall if tuned is unselected
2 parents a9cecde + 693d2d1 commit e7f909e

28 files changed

+555
-577
lines changed

ompi/mca/coll/base/base.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
* These functions are normally invoked by the back-ends of:
2424
*
2525
* - The back-ends of MPI_Init() and MPI_Finalize()
26-
* - Communuicactor constructors (e.g., MPI_Comm_split()) and
26+
* - Communicator constructors (e.g., MPI_Comm_split()) and
2727
* destructors (e.g., MPI_Comm_free())
2828
* - The laminfo command
2929
*/

ompi/mca/coll/base/coll_base_alltoall.c

Lines changed: 57 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2015 The University of Tennessee and The University
6+
* Copyright (c) 2004-2016 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -42,8 +42,8 @@ mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount,
4242
mca_coll_base_module_t *module)
4343
{
4444
mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module;
45-
int i, j, size, rank, err=MPI_SUCCESS;
46-
MPI_Request *preq;
45+
int i, j, size, rank, err = MPI_SUCCESS, line;
46+
ompi_request_t **preq, **reqs;
4747
char *tmp_buffer;
4848
size_t max_size;
4949
ptrdiff_t ext, true_lb, true_ext;
@@ -63,65 +63,72 @@ mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount,
6363
ompi_datatype_get_true_extent ( rdtype, &true_lb, &true_ext);
6464
max_size = true_ext + ext * (rcount-1);
6565

66+
/* Initiate all send/recv to/from others. */
67+
reqs = coll_base_comm_get_reqs(base_module->base_data, 2);
68+
if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; }
69+
6670
/* Allocate a temporary buffer */
6771
tmp_buffer = calloc (max_size, 1);
68-
if (NULL == tmp_buffer) {
69-
return OMPI_ERR_OUT_OF_RESOURCE;
70-
}
72+
if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; }
7173
max_size = ext * rcount;
7274

7375
/* in-place alltoall slow algorithm (but works) */
7476
for (i = 0 ; i < size ; ++i) {
7577
for (j = i+1 ; j < size ; ++j) {
76-
/* Initiate all send/recv to/from others. */
77-
preq = coll_base_comm_get_reqs(base_module->base_data, size * 2);
78+
preq = reqs;
7879

7980
if (i == rank) {
8081
/* Copy the data into the temporary buffer */
8182
err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
8283
(char *) rbuf + j * max_size);
83-
if (MPI_SUCCESS != err) { goto error_hndl; }
84+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
8485

8586
/* Exchange data with the peer */
8687
err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype,
8788
j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
88-
if (MPI_SUCCESS != err) { goto error_hndl; }
89+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
8990

9091
err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype,
9192
j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
9293
comm, preq++));
93-
if (MPI_SUCCESS != err) { goto error_hndl; }
94+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
9495
} else if (j == rank) {
9596
/* Copy the data into the temporary buffer */
9697
err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer,
9798
(char *) rbuf + i * max_size);
98-
if (MPI_SUCCESS != err) { goto error_hndl; }
99+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
99100

100101
/* Exchange data with the peer */
101102
err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype,
102103
i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++));
103-
if (MPI_SUCCESS != err) { goto error_hndl; }
104+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
104105

105106
err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype,
106107
i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
107108
comm, preq++));
108-
if (MPI_SUCCESS != err) { goto error_hndl; }
109+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
109110
} else {
110111
continue;
111112
}
112113

113114
/* Wait for the requests to complete */
114-
err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE);
115-
if (MPI_SUCCESS != err) { goto error_hndl; }
115+
err = ompi_request_wait_all (2, reqs, MPI_STATUSES_IGNORE);
116+
if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; }
116117
}
117118
}
118119

119120
error_hndl:
120121
/* Free the temporary buffer */
121122
free (tmp_buffer);
122123

123-
/* All done */
124+
if( MPI_SUCCESS != err ) {
125+
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
126+
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err,
127+
rank));
128+
ompi_coll_base_free_reqs(reqs, 2);
129+
}
124130

131+
/* All done */
125132
return err;
126133
}
127134

@@ -385,29 +392,28 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
385392
total_reqs = (((max_outstanding_reqs > (size - 1)) ||
386393
(max_outstanding_reqs <= 0)) ?
387394
(size - 1) : (max_outstanding_reqs));
388-
reqs = (ompi_request_t**) malloc( 2 * total_reqs *
389-
sizeof(ompi_request_t*));
395+
reqs = coll_base_comm_get_reqs(module->base_data, 2 * total_reqs);
390396
if (NULL == reqs) { error = -1; line = __LINE__; goto error_hndl; }
391397

392398
prcv = (char *) rbuf;
393399
psnd = (char *) sbuf;
394400

395401
/* Post first batch of ireceive and isend requests */
396402
for (nreqs = 0, nrreqs = 0, ri = (rank + 1) % size; nreqs < total_reqs;
397-
ri = (ri + 1) % size, ++nreqs, ++nrreqs) {
398-
error =
399-
MCA_PML_CALL(irecv
400-
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
401-
MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
403+
ri = (ri + 1) % size, ++nrreqs) {
404+
nreqs++;
405+
error = MCA_PML_CALL(irecv
406+
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
407+
MCA_COLL_BASE_TAG_ALLTOALL, comm, &reqs[nreqs]));
402408
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
403409
}
404-
for ( nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
405-
si = (si + size - 1) % size, ++nreqs, ++nsreqs) {
406-
error =
407-
MCA_PML_CALL(isend
408-
(psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
409-
MCA_COLL_BASE_TAG_ALLTOALL,
410-
MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs]));
410+
for (nsreqs = 0, si = (rank + size - 1) % size; nreqs < 2 * total_reqs;
411+
si = (si + size - 1) % size, ++nsreqs) {
412+
nreqs++;
413+
error = MCA_PML_CALL(isend
414+
(psnd + (ptrdiff_t)si * sext, scount, sdtype, si,
415+
MCA_COLL_BASE_TAG_ALLTOALL,
416+
MCA_PML_BASE_SEND_STANDARD, comm, &reqs[nreqs]));
411417
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
412418
}
413419

@@ -435,11 +441,10 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
435441
ncreqs++;
436442
if (completed < total_reqs) {
437443
if (nrreqs < (size - 1)) {
438-
error =
439-
MCA_PML_CALL(irecv
440-
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
441-
MCA_COLL_BASE_TAG_ALLTOALL, comm,
442-
&reqs[completed]));
444+
error = MCA_PML_CALL(irecv
445+
(prcv + (ptrdiff_t)ri * rext, rcount, rdtype, ri,
446+
MCA_COLL_BASE_TAG_ALLTOALL, comm,
447+
&reqs[completed]));
443448
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
444449
++nrreqs;
445450
ri = (ri + 1) % size;
@@ -451,24 +456,22 @@ int ompi_coll_base_alltoall_intra_linear_sync(const void *sbuf, int scount,
451456
MCA_COLL_BASE_TAG_ALLTOALL,
452457
MCA_PML_BASE_SEND_STANDARD, comm,
453458
&reqs[completed]));
459+
if (MPI_SUCCESS != error) { line = __LINE__; goto error_hndl; }
454460
++nsreqs;
455461
si = (si + size - 1) % size;
456462
}
457463
}
458464
}
459465
}
460466

461-
/* Free the reqs */
462-
free(reqs);
463-
464467
/* All done */
465468
return MPI_SUCCESS;
466469

467470
error_hndl:
468471
OPAL_OUTPUT((ompi_coll_base_framework.framework_output,
469472
"%s:%4d\tError occurred %d, rank %2d", __FILE__, line, error,
470473
rank));
471-
if (NULL != reqs) free(reqs);
474+
ompi_coll_base_free_reqs(reqs, nreqs);
472475
return error;
473476
}
474477

@@ -554,7 +557,7 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
554557
struct ompi_communicator_t *comm,
555558
mca_coll_base_module_t *module)
556559
{
557-
int i, rank, size, err, nreqs;
560+
int i, rank, size, err, nreqs, line;
558561
char *psnd, *prcv;
559562
MPI_Aint lb, sndinc, rcvinc;
560563
ompi_request_t **req, **sreq, **rreq;
@@ -605,21 +608,20 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
605608
/* Initiate all send/recv to/from others. */
606609

607610
req = rreq = coll_base_comm_get_reqs(data, (size - 1) * 2);
611+
if (NULL == req) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; }
608612

609613
prcv = (char *) rbuf;
610614
psnd = (char *) sbuf;
611615

612616
/* Post all receives first -- a simple optimization */
613617

614618
for (nreqs = 0, i = (rank + 1) % size; i != rank;
615-
i = (i + 1) % size, ++rreq, ++nreqs) {
619+
i = (i + 1) % size, ++rreq) {
620+
nreqs++;
616621
err = MCA_PML_CALL(irecv_init
617622
(prcv + (ptrdiff_t)i * rcvinc, rcount, rdtype, i,
618623
MCA_COLL_BASE_TAG_ALLTOALL, comm, rreq));
619-
if (MPI_SUCCESS != err) {
620-
ompi_coll_base_free_reqs(req, nreqs);
621-
return err;
622-
}
624+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
623625
}
624626

625627
/* Now post all sends in reverse order
@@ -628,15 +630,13 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
628630
*/
629631
sreq = rreq;
630632
for (i = (rank + size - 1) % size; i != rank;
631-
i = (i + size - 1) % size, ++sreq, ++nreqs) {
633+
i = (i + size - 1) % size, ++sreq) {
634+
nreqs++;
632635
err = MCA_PML_CALL(isend_init
633636
(psnd + (ptrdiff_t)i * sndinc, scount, sdtype, i,
634637
MCA_COLL_BASE_TAG_ALLTOALL,
635638
MCA_PML_BASE_SEND_STANDARD, comm, sreq));
636-
if (MPI_SUCCESS != err) {
637-
ompi_coll_base_free_reqs(req, nreqs);
638-
return err;
639-
}
639+
if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
640640
}
641641

642642
/* Start your engines. This will never return an error. */
@@ -652,7 +652,12 @@ int ompi_coll_base_alltoall_intra_basic_linear(const void *sbuf, int scount,
652652

653653
err = ompi_request_wait_all(nreqs, req, MPI_STATUSES_IGNORE);
654654

655-
/* Free the reqs */
655+
err_hndl:
656+
if( MPI_SUCCESS != err ) {
657+
OPAL_OUTPUT( (ompi_coll_base_framework.framework_output,"%s:%4d\tError occurred %d, rank %2d",
658+
__FILE__, line, err, rank) );
659+
}
660+
/* Free the reqs in all cases as they are persistent requests */
656661
ompi_coll_base_free_reqs(req, nreqs);
657662

658663
/* All done */

0 commit comments

Comments
 (0)