Skip to content

Commit 4b90310

Browse files
author
Ruizhe Zhou
committed
perftest: Fix double frees during RDMA CM connection retry
When running perftest with RDMA CM enabled (-R), if the connection request is rejected (RDMA_CM_EVENT_REJECTED), the client enters a retry loop in `rdma_cm_client_connection`. However, the previous retry logic contained multiple flaws causing segmentation faults, double frees, and heap corruption. The following specific issues were identified and fixed: 1. Double Free of Event Channel: The `rdma_destroy_event_channel()` was called inside `rdma_cm_destroy_cma()` during internal cleanup, but the pointer was not cleared. The caller function `create_rdma_cm_connection()` would then attempt to destroy the same channel again in its error path. Fix: Set the channel pointer to NULL after destruction and check for NULL before attempting to destroy it. The event channel and cm nodes will be reallocated when entering another retry attempt. 2. Heap Corruption via Index Overflow: The `ctx->cma_master.connection_index` was incremented on every connection attempt but was never reset upon failure. During retries, this index would exceed the bounds of the `nodes` array, leading to out-of-bound writes and heap metadata corruption. Similar things would happen for other fields of cma_master. Fix: Complete reset for fields of cma_master in `rdma_cm_destroy_cma()`. 3. Context Corruption and Leaks: `rdma_cm_route_handler()` unconditionally called `ctx_init()` and `create_qp_main()` on every retry attempt. This overwrote existing pointers (PD, MR, Buffers) in the context structure without releasing the old resources, causing memory leaks and "Bad file descriptor" errors during final cleanup. Recreating old qp would cause qp creation error causing retry to fail. Fix: Add a check in `rdma_cm_route_handler()` to ensure `ctx_init()` and `create_qp_main` are only called if the context has not been initialized yet. 4. Similar issues happened on server side retry in `rdma_cm_connection_request_handler`. Apply same fix as in `rdma_cm_route_handler()`. Signed-off-by: Ruizhe Zhou <[email protected]>
1 parent 07d75db commit 4b90310

File tree

2 files changed

+67
-18
lines changed

2 files changed

+67
-18
lines changed

src/perftest_communication.c

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2416,7 +2416,7 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
24162416
connection_index = ctx->cma_master.connection_index;
24172417

24182418
// Initialization of client contexts in case of first connection:
2419-
if (connection_index == 0) {
2419+
if ((user_param->use_event && !ctx->send_channel)|| !ctx->pd) {
24202420
rc = ctx_init(ctx, user_param);
24212421
if (rc) {
24222422
error_message = "Failed to initialize RDMA contexts.";
@@ -2425,10 +2425,15 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
24252425
}
24262426

24272427
ctx->cm_id = cma_id;
2428-
rc = create_qp_main(ctx, user_param, connection_index);
2429-
if (rc) {
2430-
error_message = "Failed to create QP.";
2431-
goto error;
2428+
2429+
/* Only create qp when it's not available
2430+
* (i.e. avoid recreating qp during retry) */
2431+
if(!ctx->qp[connection_index]) {
2432+
rc = create_qp_main(ctx, user_param, connection_index);
2433+
if (rc) {
2434+
error_message = "Failed to create QP.";
2435+
goto error;
2436+
}
24322437
}
24332438

24342439
memset(&conn_param, 0, sizeof conn_param);
@@ -2446,6 +2451,9 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
24462451
rc = rdma_connect(cma_id, &conn_param);
24472452
if (rc) {
24482453
error_message = "Failed to connect through RDMA CM.";
2454+
/* IB core will destroy cm id if failed.
2455+
* Set cma_id to NULL to avoid double free.*/
2456+
cma_id = NULL;
24492457
goto error;
24502458
}
24512459

@@ -2484,7 +2492,7 @@ int rdma_cm_connection_request_handler(struct pingpong_context *ctx,
24842492

24852493
ctx->context = cma_id->verbs;
24862494
// Initialization of server contexts in case of first connection:
2487-
if (connection_index == 0) {
2495+
if ((user_param->use_event && !ctx->send_channel)|| !ctx->pd) {
24882496
rc = ctx_init(ctx, user_param);
24892497
if (rc) {
24902498
error_message = "Failed to initialize RDMA contexts.";
@@ -2493,10 +2501,15 @@ int rdma_cm_connection_request_handler(struct pingpong_context *ctx,
24932501
}
24942502

24952503
ctx->cm_id = cm_node->cma_id;
2496-
rc = create_qp_main(ctx, user_param, connection_index);
2497-
if (rc) {
2498-
error_message = "Failed to create QP.";
2499-
goto error_2;
2504+
2505+
/* Only create qp when it's not available
2506+
* (i.e. avoid recreating qp during retry) */
2507+
if(!ctx->qp[connection_index]) {
2508+
rc = create_qp_main(ctx, user_param, connection_index);
2509+
if (rc) {
2510+
error_message = "Failed to create QP.";
2511+
goto error_2;
2512+
}
25002513
}
25012514

25022515
memset(&conn_param, 0, sizeof(conn_param));
@@ -2859,6 +2872,20 @@ int rdma_cm_client_connection(struct pingpong_context *ctx,
28592872
char error_message[ERROR_MSG_SIZE] = "";
28602873

28612874
for (i = 0; i < max_retries; i++) {
2875+
if (i > 0 && ctx->cma_master.channel == NULL) {
2876+
ctx->cma_master.channel = rdma_create_event_channel();
2877+
if (!ctx->cma_master.channel) {
2878+
sprintf(error_message,
2879+
"Failed to recreate RDMA CM event channel during retry.");
2880+
goto error;
2881+
}
2882+
rc = rdma_cm_allocate_nodes(ctx, user_param, hints);
2883+
if (rc) {
2884+
sprintf(error_message,
2885+
"Failed to reallocate RDMA CM nodes during retry.");
2886+
goto error;
2887+
}
2888+
}
28622889
rc = _rdma_cm_client_connection(ctx, user_param, hints);
28632890
if (!rc) {
28642891
return rc;
@@ -2945,7 +2972,10 @@ int create_rdma_cm_connection(struct pingpong_context *ctx,
29452972
free(hints.ai_src_addr);
29462973

29472974
destroy_event_channel:
2948-
rdma_destroy_event_channel(ctx->cma_master.channel);
2975+
if (ctx->cma_master.channel) {
2976+
rdma_destroy_event_channel(ctx->cma_master.channel);
2977+
ctx->cma_master.channel = NULL;
2978+
}
29492979

29502980
error:
29512981
return error_handler(error_message);

src/perftest_resources.c

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2665,11 +2665,14 @@ xrcd: __attribute__((unused))
26652665
#endif
26662666

26672667
ibv_dealloc_pd(ctx->pd);
2668+
ctx->pd = NULL;
26682669

26692670
comp_channel:
26702671
if (user_param->use_event) {
26712672
ibv_destroy_comp_channel(ctx->send_channel);
26722673
ibv_destroy_comp_channel(ctx->recv_channel);
2674+
ctx->send_channel = NULL;
2675+
ctx->recv_channel = NULL;
26732676
}
26742677

26752678
return FAILURE;
@@ -6154,20 +6157,36 @@ int rdma_cm_destroy_cma(struct pingpong_context *ctx,
61546157

61556158
for (i = 0; i < user_param->num_of_qps; i++) {
61566159
cm_node = &ctx->cma_master.nodes[i];
6157-
rc = rdma_destroy_id(cm_node->cma_id);
6158-
if (rc) {
6159-
sprintf(error_message,
6160-
"Failed to destroy RDMA CM ID number %d.", i);
6161-
goto error;
6160+
if(cm_node && cm_node->cma_id)
6161+
{
6162+
rc = rdma_destroy_id(cm_node->cma_id);
6163+
if (rc) {
6164+
sprintf(error_message,
6165+
"Failed to destroy RDMA CM ID number %d.",
6166+
i);
6167+
goto error;
6168+
}
6169+
cm_node->cma_id = NULL;
61626170
}
61636171
}
61646172

6165-
rdma_destroy_event_channel(ctx->cma_master.channel);
6173+
if (ctx->cma_master.channel) {
6174+
rdma_destroy_event_channel(ctx->cma_master.channel);
6175+
ctx->cma_master.channel = NULL;
6176+
}
61666177
if (ctx->cma_master.rai) {
61676178
rdma_freeaddrinfo(ctx->cma_master.rai);
6179+
ctx->cma_master.rai = NULL;
61686180
}
61696181

6170-
free(ctx->cma_master.nodes);
6182+
ctx->cma_master.connects_left = user_param->num_of_qps;
6183+
ctx->cma_master.connection_index = 0;
6184+
ctx->cma_master.disconnects_left = 0;
6185+
6186+
if (ctx->cma_master.nodes) {
6187+
free(ctx->cma_master.nodes);
6188+
ctx->cma_master.nodes = NULL;
6189+
}
61716190
return rc;
61726191

61736192
error:

0 commit comments

Comments
 (0)