Skip to content

Commit 7d96f12

Browse files
committed
pml/yalla: fix yalla performance regression
It was introduced in PR #1228 in particular in commit 041a6a9. Original solution was using "flexible array member" called "mxm_base" to "fall-through" to the "mxm" send/recv member that located in the outer structure. After changing number of elements in "mxm_base" from 0 to 1 we actually allocating 2 mxm_req_base_t elements which leads to increased overal size and harms cache performance. It also brakes "mca_pml_yalla_check_request_state" function.
1 parent c9ada8e commit 7d96f12

File tree

3 files changed

+15
-20
lines changed

3 files changed

+15
-20
lines changed

ompi/mca/pml/yalla/pml_yalla.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -688,6 +688,7 @@ int mca_pml_yalla_start(size_t count, ompi_request_t** requests)
688688

689689
for (i = 0; i < count; ++i) {
690690
req = (mca_pml_yalla_base_request_t *)requests[i];
691+
sreq = (mca_pml_yalla_send_request_t *)req;
691692

692693
if ((req == NULL) || (OMPI_REQUEST_PML != req->ompi.req_type)) {
693694
/* Skip irrelevant requests */
@@ -696,10 +697,9 @@ int mca_pml_yalla_start(size_t count, ompi_request_t** requests)
696697

697698
PML_YALLA_ASSERT(req->ompi.req_state != OMPI_REQUEST_INVALID);
698699
PML_YALLA_RESET_OMPI_REQ(&req->ompi, OMPI_REQUEST_ACTIVE);
699-
PML_YALLA_RESET_PML_REQ(req);
700+
PML_YALLA_RESET_PML_REQ(req, PML_YALLA_MXM_REQBASE(sreq));
700701

701702
if (req->flags & MCA_PML_YALLA_REQUEST_FLAG_SEND) {
702-
sreq = (mca_pml_yalla_send_request_t *)req;
703703
if (req->flags & MCA_PML_YALLA_REQUEST_FLAG_BSEND) {
704704
PML_YALLA_VERBOSE(8, "start bsend request %p", (void *)sreq);
705705
rc = mca_pml_yalla_bsend(&sreq->mxm);

ompi/mca/pml/yalla/pml_yalla_request.c

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,10 @@ static inline void mca_pml_yalla_request_release(mca_pml_yalla_base_request_t *r
3232
}
3333

3434
static inline int
35-
mca_pml_yalla_check_request_state(mca_pml_yalla_base_request_t *req)
35+
mca_pml_yalla_check_request_state(mca_pml_yalla_base_request_t *req, mxm_req_base_t *mxm_base)
3636
{
37-
if (req->mxm_base->state != MXM_REQ_COMPLETED) {
38-
PML_YALLA_VERBOSE(8, "request %p free called before completed", (void *)req);
37+
if ( mxm_base->state != MXM_REQ_COMPLETED) {
38+
PML_YALLA_VERBOSE(8, "request %p free called before completed", (void*)req);
3939
req->flags |= MCA_PML_YALLA_REQUEST_FLAG_FREE_CALLED;
4040
return 0;
4141
}
@@ -45,11 +45,12 @@ mca_pml_yalla_check_request_state(mca_pml_yalla_base_request_t *req)
4545

4646
static int mca_pml_yalla_send_request_free(ompi_request_t **request)
4747
{
48-
mca_pml_yalla_base_request_t *req = (mca_pml_yalla_base_request_t*)(*request);
48+
mca_pml_yalla_send_request_t *sreq = (mca_pml_yalla_send_request_t*)(*request);
49+
mca_pml_yalla_base_request_t *req = (mca_pml_yalla_base_request_t*)sreq;
4950

5051
PML_YALLA_VERBOSE(9, "free send request *%p=%p", (void *)request, (void *)*request);
5152

52-
if (mca_pml_yalla_check_request_state(req)) {
53+
if (mca_pml_yalla_check_request_state(req, PML_YALLA_MXM_REQBASE(sreq))) {
5354
mca_pml_yalla_request_release(req, &ompi_pml_yalla.send_reqs);
5455
}
5556

@@ -84,11 +85,12 @@ static int mca_pml_yalla_send_request_cancel(ompi_request_t *request, int flag)
8485

8586
static int mca_pml_yalla_recv_request_free(ompi_request_t **request)
8687
{
87-
mca_pml_yalla_base_request_t *req = (mca_pml_yalla_base_request_t*)(*request);
88+
mca_pml_yalla_recv_request_t *rreq = (mca_pml_yalla_recv_request_t*)(*request);
89+
mca_pml_yalla_base_request_t *req = (mca_pml_yalla_base_request_t*)rreq;
8890

8991
PML_YALLA_VERBOSE(9, "free receive request *%p=%p", (void *)request, (void *)*request);
9092

91-
if (mca_pml_yalla_check_request_state(req)) {
93+
if (mca_pml_yalla_check_request_state(req, PML_YALLA_MXM_REQBASE(rreq))) {
9294
mca_pml_yalla_request_release(req, &ompi_pml_yalla.recv_reqs);
9395
}
9496

ompi/mca/pml/yalla/pml_yalla_request.h

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -25,15 +25,6 @@ struct pml_yalla_base_request {
2525
ompi_request_t ompi;
2626
mca_pml_yalla_convertor_t *convertor;
2727
int flags;
28-
/* overlaps with base of send/recv
29-
* In ISO C90, you would have to give contents a length of 1,
30-
* which means either you waste space or complicate the argument to malloc.
31-
* Note:
32-
* - 1 was the portable way to go, though it was rather strange
33-
* - 0 was better at indicating intent, but not legal as far as
34-
* the Standard was concerned and supported as an extension by some compilers (including gcc)
35-
*/
36-
mxm_req_base_t mxm_base[1];
3728
};
3829

3930
struct pml_yalla_send_request {
@@ -58,6 +49,8 @@ OBJ_CLASS_DECLARATION(mca_pml_yalla_recv_request_t);
5849

5950
void mca_pml_yalla_init_reqs(void);
6051

52+
#define PML_YALLA_MXM_REQBASE( x ) ( &((x)->mxm.base) )
53+
6154
#define PML_YALLA_RESET_OMPI_REQ(_ompi_req, _state) \
6255
{ \
6356
(_ompi_req)->req_state = _state; \
@@ -72,9 +65,9 @@ void mca_pml_yalla_init_reqs(void);
7265
OBJ_RETAIN(_comm); \
7366
}
7467

75-
#define PML_YALLA_RESET_PML_REQ(_pml_req) \
68+
#define PML_YALLA_RESET_PML_REQ(_pml_req, mxm_base) \
7669
{ \
77-
(_pml_req)->mxm_base[0].state = MXM_REQ_NEW; \
70+
mxm_base->state = MXM_REQ_NEW; \
7871
PML_YALLA_RESET_PML_REQ_DATA(_pml_req); \
7972
}
8073

0 commit comments

Comments
 (0)