Skip to content

Commit ad363d3

Browse files
authored
Merge pull request open-mpi#13440 from mentOS31/coll_ucc_fix_initialization
COLL/UCC: Fix SIGSEGV abort when a persistent request fails
2 parents d933b33 + 4b1b9a9 commit ad363d3

16 files changed

+34
-30
lines changed

.mailmap

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,3 +137,5 @@ George Katevenis <[email protected]>
137137
Brian Barrett <[email protected]>
138138

139139
Andrii Bilokur <[email protected]> B-a-S <[email protected]>
140+
141+
Kento Hasegawa <[email protected]> hasegawa.kento <[email protected]>

ompi/mca/coll/ucc/coll_ucc_allgather.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ int mca_coll_ucc_iallgather(const void *sbuf, size_t scount, struct ompi_datatyp
9999
mca_coll_ucc_req_t *coll_req = NULL;
100100

101101
UCC_VERBOSE(3, "running ucc iallgather");
102-
COLL_UCC_GET_REQ(coll_req);
102+
COLL_UCC_GET_REQ(coll_req, comm);
103103
COLL_UCC_CHECK(mca_coll_ucc_allgather_init_common(sbuf, scount, sdtype,
104104
rbuf, rcount, rdtype,
105105
false, ucc_module, &req, coll_req));
@@ -124,7 +124,7 @@ int mca_coll_ucc_allgather_init(const void *sbuf, size_t scount, struct ompi_dat
124124
ucc_coll_req_h req;
125125
mca_coll_ucc_req_t *coll_req = NULL;
126126

127-
COLL_UCC_GET_REQ_PERSISTENT(coll_req);
127+
COLL_UCC_GET_REQ_PERSISTENT(coll_req, comm);
128128
UCC_VERBOSE(3, "allgather_init init %p", coll_req);
129129
COLL_UCC_CHECK(mca_coll_ucc_allgather_init_common(sbuf, scount, sdtype,
130130
rbuf, rcount, rdtype,

ompi/mca/coll/ucc/coll_ucc_allgatherv.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ int mca_coll_ucc_iallgatherv(const void *sbuf, size_t scount,
104104
mca_coll_ucc_req_t *coll_req = NULL;
105105

106106
UCC_VERBOSE(3, "running ucc iallgatherv");
107-
COLL_UCC_GET_REQ(coll_req);
107+
COLL_UCC_GET_REQ(coll_req, comm);
108108
COLL_UCC_CHECK(mca_coll_ucc_allgatherv_init_common(sbuf, scount, sdtype,
109109
rbuf, rcounts, rdisps, rdtype,
110110
false, ucc_module, &req, coll_req));
@@ -131,7 +131,7 @@ int mca_coll_ucc_allgatherv_init(const void *sbuf, size_t scount, struct ompi_da
131131
ucc_coll_req_h req;
132132
mca_coll_ucc_req_t *coll_req = NULL;
133133

134-
COLL_UCC_GET_REQ_PERSISTENT(coll_req);
134+
COLL_UCC_GET_REQ_PERSISTENT(coll_req, comm);
135135
UCC_VERBOSE(3, "allgatherv_init init %p", coll_req);
136136
COLL_UCC_CHECK(mca_coll_ucc_allgatherv_init_common(sbuf, scount, sdtype,
137137
rbuf, rcounts, rdisps, rdtype,

ompi/mca/coll/ucc/coll_ucc_allreduce.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ int mca_coll_ucc_iallreduce(const void *sbuf, void *rbuf, size_t count,
9292
mca_coll_ucc_req_t *coll_req = NULL;
9393

9494
UCC_VERBOSE(3, "running ucc iallreduce");
95-
COLL_UCC_GET_REQ(coll_req);
95+
COLL_UCC_GET_REQ(coll_req, comm);
9696
COLL_UCC_CHECK(mca_coll_ucc_allreduce_init_common(sbuf, rbuf, count, dtype, op,
9797
false, ucc_module, &req, coll_req));
9898
COLL_UCC_POST_AND_CHECK(req);
@@ -116,7 +116,7 @@ int mca_coll_ucc_allreduce_init(const void *sbuf, void *rbuf, size_t count,
116116
ucc_coll_req_h req;
117117
mca_coll_ucc_req_t *coll_req = NULL;
118118

119-
COLL_UCC_GET_REQ_PERSISTENT(coll_req);
119+
COLL_UCC_GET_REQ_PERSISTENT(coll_req, comm);
120120
UCC_VERBOSE(3, "allreduce_init init %p", coll_req);
121121
COLL_UCC_CHECK(mca_coll_ucc_allreduce_init_common(sbuf, rbuf, count, dtype, op,
122122
true, ucc_module, &req, coll_req));

ompi/mca/coll/ucc/coll_ucc_alltoall.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ int mca_coll_ucc_ialltoall(const void *sbuf, size_t scount, struct ompi_datatype
9999
mca_coll_ucc_req_t *coll_req = NULL;
100100

101101
UCC_VERBOSE(3, "running ucc ialltoall");
102-
COLL_UCC_GET_REQ(coll_req);
102+
COLL_UCC_GET_REQ(coll_req, comm);
103103
COLL_UCC_CHECK(mca_coll_ucc_alltoall_init_common(sbuf, scount, sdtype,
104104
rbuf, rcount, rdtype,
105105
false, ucc_module, &req, coll_req));
@@ -124,7 +124,7 @@ int mca_coll_ucc_alltoall_init(const void *sbuf, size_t scount, struct ompi_data
124124
ucc_coll_req_h req;
125125
mca_coll_ucc_req_t *coll_req = NULL;
126126

127-
COLL_UCC_GET_REQ_PERSISTENT(coll_req);
127+
COLL_UCC_GET_REQ_PERSISTENT(coll_req, comm);
128128
UCC_VERBOSE(3, "alltoall_init init %p", coll_req);
129129
COLL_UCC_CHECK(mca_coll_ucc_alltoall_init_common(sbuf, scount, sdtype,
130130
rbuf, rcount, rdtype,

ompi/mca/coll/ucc/coll_ucc_alltoallv.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ int mca_coll_ucc_ialltoallv(const void *sbuf, ompi_count_array_t scounts,
106106
mca_coll_ucc_req_t *coll_req = NULL;
107107

108108
UCC_VERBOSE(3, "running ucc ialltoallv");
109-
COLL_UCC_GET_REQ(coll_req);
109+
COLL_UCC_GET_REQ(coll_req, comm);
110110
COLL_UCC_CHECK(mca_coll_ucc_alltoallv_init_common(sbuf, scounts, sdisps, sdtype,
111111
rbuf, rcounts, rdisps, rdtype,
112112
false, ucc_module, &req, coll_req));
@@ -134,7 +134,7 @@ int mca_coll_ucc_alltoallv_init(const void *sbuf, ompi_count_array_t scounts,
134134
ucc_coll_req_h req;
135135
mca_coll_ucc_req_t *coll_req = NULL;
136136

137-
COLL_UCC_GET_REQ_PERSISTENT(coll_req);
137+
COLL_UCC_GET_REQ_PERSISTENT(coll_req, comm);
138138
UCC_VERBOSE(3, "alltoallv_init init %p", coll_req);
139139
COLL_UCC_CHECK(mca_coll_ucc_alltoallv_init_common(sbuf, scounts, sdisps, sdtype,
140140
rbuf, rcounts, rdisps, rdtype,

ompi/mca/coll/ucc/coll_ucc_barrier.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ int mca_coll_ucc_ibarrier(struct ompi_communicator_t *comm,
5454
mca_coll_ucc_req_t *coll_req = NULL;
5555

5656
UCC_VERBOSE(3, "running ucc ibarrier");
57-
COLL_UCC_GET_REQ(coll_req);
57+
COLL_UCC_GET_REQ(coll_req, comm);
5858
COLL_UCC_CHECK(mca_coll_ucc_barrier_init_common(false, ucc_module, &req, coll_req));
5959
COLL_UCC_POST_AND_CHECK(req);
6060
*request = &coll_req->super;
@@ -75,7 +75,7 @@ int mca_coll_ucc_barrier_init(struct ompi_communicator_t *comm, struct ompi_info
7575
ucc_coll_req_h req;
7676
mca_coll_ucc_req_t *coll_req = NULL;
7777

78-
COLL_UCC_GET_REQ_PERSISTENT(coll_req);
78+
COLL_UCC_GET_REQ_PERSISTENT(coll_req, comm);
7979
UCC_VERBOSE(3, "barrier_init init %p", coll_req);
8080
COLL_UCC_CHECK(mca_coll_ucc_barrier_init_common(true, ucc_module, &req, coll_req));
8181
*request = &coll_req->super;

ompi/mca/coll/ucc/coll_ucc_bcast.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ int mca_coll_ucc_ibcast(void *buf, size_t count, struct ompi_datatype_t *dtype,
7272
mca_coll_ucc_req_t *coll_req = NULL;
7373

7474
UCC_VERBOSE(3, "running ucc ibcast");
75-
COLL_UCC_GET_REQ(coll_req);
75+
COLL_UCC_GET_REQ(coll_req, comm);
7676
COLL_UCC_CHECK(mca_coll_ucc_bcast_init_common(buf, count, dtype, root,
7777
false, ucc_module, &req, coll_req));
7878
COLL_UCC_POST_AND_CHECK(req);
@@ -95,7 +95,7 @@ int mca_coll_ucc_bcast_init(void *buf, size_t count, struct ompi_datatype_t *dty
9595
ucc_coll_req_h req;
9696
mca_coll_ucc_req_t *coll_req = NULL;
9797

98-
COLL_UCC_GET_REQ_PERSISTENT(coll_req);
98+
COLL_UCC_GET_REQ_PERSISTENT(coll_req, comm);
9999
UCC_VERBOSE(3, "bcast_init init %p", coll_req);
100100
COLL_UCC_CHECK(mca_coll_ucc_bcast_init_common(buf, count, dtype, root,
101101
true, ucc_module, &req, coll_req));

ompi/mca/coll/ucc/coll_ucc_common.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
} \
2727
} while(0)
2828

29-
#define COLL_UCC_GET_REQ(_coll_req) do { \
29+
#define COLL_UCC_GET_REQ(_coll_req, _comm) do { \
3030
opal_free_list_item_t *item; \
3131
item = opal_free_list_wait (&mca_coll_ucc_component.requests); \
3232
if (OPAL_UNLIKELY(NULL == item)) { \
@@ -41,9 +41,10 @@
4141
_coll_req->super.req_state = OMPI_REQUEST_ACTIVE; \
4242
_coll_req->super.req_free = mca_coll_ucc_req_free; \
4343
_coll_req->super.req_type = OMPI_REQUEST_COLL; \
44+
_coll_req->super.req_mpi_object.comm = _comm; \
4445
} while(0)
4546

46-
#define COLL_UCC_GET_REQ_PERSISTENT(_coll_req) \
47+
#define COLL_UCC_GET_REQ_PERSISTENT(_coll_req, _comm) \
4748
do { \
4849
opal_free_list_item_t *item; \
4950
item = opal_free_list_wait(&mca_coll_ucc_component.requests); \
@@ -59,6 +60,7 @@
5960
_coll_req->super.req_free = mca_coll_ucc_req_free; \
6061
_coll_req->super.req_start = mca_coll_ucc_req_start; \
6162
_coll_req->super.req_type = OMPI_REQUEST_COLL; \
63+
_coll_req->super.req_mpi_object.comm = _comm; \
6264
_coll_req->ucc_req = NULL; \
6365
} while (0)
6466

ompi/mca/coll/ucc/coll_ucc_gather.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ int mca_coll_ucc_igather(const void *sbuf, size_t scount, struct ompi_datatype_t
116116
mca_coll_ucc_req_t *coll_req = NULL;
117117

118118
UCC_VERBOSE(3, "running ucc igather");
119-
COLL_UCC_GET_REQ(coll_req);
119+
COLL_UCC_GET_REQ(coll_req, comm);
120120
COLL_UCC_CHECK(mca_coll_ucc_gather_init_common(sbuf, scount, sdtype, rbuf, rcount,
121121
rdtype, root, false, ucc_module,
122122
&req, coll_req));
@@ -142,7 +142,7 @@ int mca_coll_ucc_gather_init(const void *sbuf, size_t scount, struct ompi_dataty
142142
ucc_coll_req_h req;
143143
mca_coll_ucc_req_t *coll_req = NULL;
144144

145-
COLL_UCC_GET_REQ_PERSISTENT(coll_req);
145+
COLL_UCC_GET_REQ_PERSISTENT(coll_req, comm);
146146
UCC_VERBOSE(3, "gather_init init %p", coll_req);
147147
COLL_UCC_CHECK(mca_coll_ucc_gather_init_common(sbuf, scount, sdtype, rbuf, rcount,
148148
rdtype, root, true, ucc_module,

0 commit comments

Comments
 (0)