Skip to content

Commit b9eeabd

Browse files
committed
udcm: fix bugs
This commit fixes the following bugs:
- On send failure, release the newly allocated message.
- In the destructor for udcm_message_sent_t, always remove the send timeout event from the event base. Failing to do so can lead to memory corruption, since the destructor may be called from an event callback.

Signed-off-by: Nathan Hjelm <[email protected]>
(cherry picked from open-mpi/ompi@e10afcd)
1 parent ada7bed commit b9eeabd

File tree

1 file changed

+16
-17
lines changed

1 file changed

+16
-17
lines changed

opal/mca/btl/openib/connect/btl_openib_connect_udcm.c

Lines changed: 16 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -306,7 +306,7 @@ static void *udcm_cq_event_dispatch(int fd, int flags, void *context);
306306
static void *udcm_message_callback (void *context);
307307

308308
static void udcm_set_message_timeout (udcm_message_sent_t *message);
309-
static void udcm_cancel_message_timeout (udcm_message_sent_t *message);
309+
static void udcm_free_message (udcm_message_sent_t *message);
310310

311311
static int udcm_module_init (udcm_module_t *m, mca_btl_openib_module_t *btl);
312312

@@ -1679,7 +1679,7 @@ static int udcm_send_request (mca_btl_base_endpoint_t *lcl_ep,
16791679
if (0 != (rc = udcm_post_send (lcl_ep, msg->data, m->msg_length, 0))) {
16801680
BTL_VERBOSE(("error posting REQ"));
16811681

1682-
udcm_cancel_message_timeout (msg);
1682+
udcm_free_message (msg);
16831683

16841684
return rc;
16851685
}
@@ -1702,7 +1702,7 @@ static int udcm_send_complete (mca_btl_base_endpoint_t *lcl_ep,
17021702
if (0 != rc) {
17031703
BTL_VERBOSE(("error posting complete"));
17041704

1705-
udcm_cancel_message_timeout (msg);
1705+
udcm_free_message (msg);
17061706

17071707
return rc;
17081708
}
@@ -1728,7 +1728,7 @@ static int udcm_send_reject (mca_btl_base_endpoint_t *lcl_ep,
17281728
if (0 != rc) {
17291729
BTL_VERBOSE(("error posting rejection"));
17301730

1731-
udcm_cancel_message_timeout (msg);
1731+
udcm_free_message (msg);
17321732

17331733
return rc;
17341734
}
@@ -2216,10 +2216,8 @@ static void udcm_sent_message_destructor (udcm_message_sent_t *message)
22162216
free (message->data);
22172217
}
22182218

2219-
if (message->event_active) {
2220-
opal_event_evtimer_del (&message->event);
2221-
message->event_active = false;
2222-
}
2219+
opal_event_evtimer_del (&message->event);
2220+
message->event_active = false;
22232221
}
22242222

22252223
/* mark: message timeout code */
@@ -2298,21 +2296,22 @@ static void udcm_set_message_timeout (udcm_message_sent_t *message)
22982296
opal_mutex_unlock (&m->cm_timeout_lock);
22992297
}
23002298

2301-
static void udcm_cancel_message_timeout (udcm_message_sent_t *message)
2299+
static void udcm_free_message (udcm_message_sent_t *message)
23022300
{
23032301
udcm_module_t *m = UDCM_ENDPOINT_MODULE(message->endpoint);
23042302

2305-
BTL_VERBOSE(("cancelling timeout for message %p", (void *) message));
2303+
BTL_VERBOSE(("releasing message %p", (void *) message));
23062304

23072305
opal_mutex_lock (&m->cm_timeout_lock);
23082306

2309-
opal_list_remove_item (&m->flying_messages, &message->super);
2310-
2311-
/* start the event */
2312-
opal_event_evtimer_del (&message->event);
2313-
message->event_active = false;
2307+
if (message->event_active) {
2308+
opal_list_remove_item (&m->flying_messages, &message->super);
2309+
message->event_active = false;
2310+
}
23142311

23152312
opal_mutex_unlock (&m->cm_timeout_lock);
2313+
2314+
OBJ_RELEASE(message);
23162315
}
23172316

23182317
/* mark: xrc connection support */
@@ -2830,7 +2829,7 @@ static int udcm_xrc_send_request (mca_btl_base_endpoint_t *lcl_ep, mca_btl_base_
28302829
if (0 != (rc = udcm_post_send (lcl_ep, msg->data, sizeof (udcm_msg_hdr_t), 0))) {
28312830
BTL_VERBOSE(("error posting XREQ"));
28322831

2833-
udcm_cancel_message_timeout (msg);
2832+
udcm_free_message (msg);
28342833

28352834
return rc;
28362835
}
@@ -2883,7 +2882,7 @@ static int udcm_xrc_send_xresponse (mca_btl_base_endpoint_t *lcl_ep, mca_btl_bas
28832882
if (0 != rc) {
28842883
BTL_VERBOSE(("error posting complete"));
28852884

2886-
udcm_cancel_message_timeout (msg);
2885+
udcm_free_message (msg);
28872886

28882887
return rc;
28892888
}

0 commit comments

Comments
 (0)