Skip to content

Commit e9714a2

Browse files
author
Ruizhe Zhou
committed
perftest: Fix double frees during RDMA CM connection retry
When running perftest with RDMA CM enabled (-R), if the connection request is rejected (RDMA_CM_EVENT_REJECTED), the client enters a retry loop in `rdma_cm_client_connection`. However, the previous retry logic contained multiple flaws causing segmentation faults, double frees, and heap corruption. The following specific issues were identified and fixed: 1. Double Free of Event Channel: The `rdma_destroy_event_channel()` was called inside `rdma_cm_destroy_cma()` during internal cleanup, but the pointer was not cleared. The caller function `create_rdma_cm_connection()` would then attempt to destroy the same channel again in its error path. Fix: Set the channel pointer to NULL after destruction and check for NULL before attempting to destroy it. The event channel and cm nodes will be reallocated when entering another retry attempt. 2. Heap Corruption via Index Overflow: The `ctx->cma_master.connection_index` was incremented on every connection attempt but was never reset upon failure. During retries, this index would exceed the bounds of the `nodes` array, leading to out-of-bound writes and heap metadata corruption. Similar things would happen for other fields of cma_master. Fix: Complete reset for fields of cma_master in `rdma_cm_destroy_cma()`. 3. Context Corruption and Leaks: `rdma_cm_route_handler()` unconditionally called `ctx_init()` and `create_qp_main()` on every retry attempt. This overwrote existing pointers (PD, MR, Buffers) in the context structure without releasing the old resources, causing memory leaks and "Bad file descriptor" errors during final cleanup. Recreating old qp would cause qp creation error causing retry to fail. Fix: Add a check in `rdma_cm_route_handler()` to ensure `ctx_init()` and `create_qp_main` are only called if the context has not been initialized yet. Signed-off-by: Ruizhe Zhou <[email protected]>
1 parent 07d75db commit e9714a2

File tree

2 files changed

+54
-13
lines changed

2 files changed

+54
-13
lines changed

src/perftest_communication.c

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2416,7 +2416,7 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
24162416
connection_index = ctx->cma_master.connection_index;
24172417

24182418
// Initialization of client contexts in case of first connection:
2419-
if (connection_index == 0) {
2419+
if (!ctx->pd) {
24202420
rc = ctx_init(ctx, user_param);
24212421
if (rc) {
24222422
error_message = "Failed to initialize RDMA contexts.";
@@ -2425,10 +2425,15 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
24252425
}
24262426

24272427
ctx->cm_id = cma_id;
2428-
rc = create_qp_main(ctx, user_param, connection_index);
2429-
if (rc) {
2430-
error_message = "Failed to create QP.";
2431-
goto error;
2428+
2429+
/* Only create qp when it's not available
2430+
* (i.e. avoid recreating qp during retry) */
2431+
if(!ctx->qp[connection_index]) {
2432+
rc = create_qp_main(ctx, user_param, connection_index);
2433+
if (rc) {
2434+
error_message = "Failed to create QP.";
2435+
goto error;
2436+
}
24322437
}
24332438

24342439
memset(&conn_param, 0, sizeof conn_param);
@@ -2446,6 +2451,9 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
24462451
rc = rdma_connect(cma_id, &conn_param);
24472452
if (rc) {
24482453
error_message = "Failed to connect through RDMA CM.";
2454+
/* IB core will destroy cm id if failed.
2455+
* Set cma_id to NULL to avoid double free.*/
2456+
cma_id = NULL;
24492457
goto error;
24502458
}
24512459

@@ -2859,6 +2867,20 @@ int rdma_cm_client_connection(struct pingpong_context *ctx,
28592867
char error_message[ERROR_MSG_SIZE] = "";
28602868

28612869
for (i = 0; i < max_retries; i++) {
2870+
if (i > 0 && ctx->cma_master.channel == NULL) {
2871+
ctx->cma_master.channel = rdma_create_event_channel();
2872+
if (!ctx->cma_master.channel) {
2873+
sprintf(error_message,
2874+
"Failed to recreate RDMA CM event channel during retry.");
2875+
goto error;
2876+
}
2877+
rc = rdma_cm_allocate_nodes(ctx, user_param, hints);
2878+
if (rc) {
2879+
sprintf(error_message,
2880+
"Failed to reallocate RDMA CM nodes during retry.");
2881+
goto error;
2882+
}
2883+
}
28622884
rc = _rdma_cm_client_connection(ctx, user_param, hints);
28632885
if (!rc) {
28642886
return rc;
@@ -2945,7 +2967,10 @@ int create_rdma_cm_connection(struct pingpong_context *ctx,
29452967
free(hints.ai_src_addr);
29462968

29472969
destroy_event_channel:
2948-
rdma_destroy_event_channel(ctx->cma_master.channel);
2970+
if (ctx->cma_master.channel) {
2971+
rdma_destroy_event_channel(ctx->cma_master.channel);
2972+
ctx->cma_master.channel = NULL;
2973+
}
29492974

29502975
error:
29512976
return error_handler(error_message);

src/perftest_resources.c

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6154,20 +6154,36 @@ int rdma_cm_destroy_cma(struct pingpong_context *ctx,
61546154

61556155
for (i = 0; i < user_param->num_of_qps; i++) {
61566156
cm_node = &ctx->cma_master.nodes[i];
6157-
rc = rdma_destroy_id(cm_node->cma_id);
6158-
if (rc) {
6159-
sprintf(error_message,
6160-
"Failed to destroy RDMA CM ID number %d.", i);
6161-
goto error;
6157+
if(cm_node && cm_node->cma_id)
6158+
{
6159+
rc = rdma_destroy_id(cm_node->cma_id);
6160+
if (rc) {
6161+
sprintf(error_message,
6162+
"Failed to destroy RDMA CM ID number %d.",
6163+
i);
6164+
goto error;
6165+
}
6166+
cm_node->cma_id = NULL;
61626167
}
61636168
}
61646169

6165-
rdma_destroy_event_channel(ctx->cma_master.channel);
6170+
if (ctx->cma_master.channel) {
6171+
rdma_destroy_event_channel(ctx->cma_master.channel);
6172+
ctx->cma_master.channel = NULL;
6173+
}
61666174
if (ctx->cma_master.rai) {
61676175
rdma_freeaddrinfo(ctx->cma_master.rai);
6176+
ctx->cma_master.rai = NULL;
61686177
}
61696178

6170-
free(ctx->cma_master.nodes);
6179+
ctx->cma_master.connects_left = user_param->num_of_qps;
6180+
ctx->cma_master.connection_index = 0;
6181+
ctx->cma_master.disconnects_left = 0;
6182+
6183+
if (ctx->cma_master.nodes) {
6184+
free(ctx->cma_master.nodes);
6185+
ctx->cma_master.nodes = NULL;
6186+
}
61716187
return rc;
61726188

61736189
error:

0 commit comments

Comments
 (0)