Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 71 additions & 21 deletions src/perftest_communication.c
Original file line number Diff line number Diff line change
Expand Up @@ -2416,7 +2416,7 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
connection_index = ctx->cma_master.connection_index;

// Initialization of client contexts in case of first connection:
if (connection_index == 0) {
if ((user_param->use_event && !ctx->send_channel)|| !ctx->pd) {
rc = ctx_init(ctx, user_param);
if (rc) {
error_message = "Failed to initialize RDMA contexts.";
Expand All @@ -2425,10 +2425,15 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
}

ctx->cm_id = cma_id;
rc = create_qp_main(ctx, user_param, connection_index);
if (rc) {
error_message = "Failed to create QP.";
goto error;

/* Create the QP only if one does not already exist for this connection
* (i.e. avoid recreating the QP during a retry). */
if(!ctx->qp[connection_index]) {
rc = create_qp_main(ctx, user_param, connection_index);
if (rc) {
error_message = "Failed to create QP.";
goto error;
}
}

memset(&conn_param, 0, sizeof conn_param);
Expand All @@ -2446,6 +2451,10 @@ int rdma_cm_route_handler(struct pingpong_context *ctx,
rc = rdma_connect(cma_id, &conn_param);
if (rc) {
error_message = "Failed to connect through RDMA CM.";
/* The IB core destroys the CM ID when the connect attempt fails.
* Clear our references to it to avoid a double free during cleanup. */
ctx->cm_id = NULL;
ctx->cma_master.nodes[connection_index].cma_id = NULL;
goto error;
}

Expand Down Expand Up @@ -2484,7 +2493,7 @@ int rdma_cm_connection_request_handler(struct pingpong_context *ctx,

ctx->context = cma_id->verbs;
// Initialization of server contexts in case of first connection:
if (connection_index == 0) {
if ((user_param->use_event && !ctx->send_channel)|| !ctx->pd) {
rc = ctx_init(ctx, user_param);
if (rc) {
error_message = "Failed to initialize RDMA contexts.";
Expand All @@ -2493,10 +2502,15 @@ int rdma_cm_connection_request_handler(struct pingpong_context *ctx,
}

ctx->cm_id = cm_node->cma_id;
rc = create_qp_main(ctx, user_param, connection_index);
if (rc) {
error_message = "Failed to create QP.";
goto error_2;

/* Create the QP only if one does not already exist for this connection
* (i.e. avoid recreating the QP during a retry). */
if(!ctx->qp[connection_index]) {
rc = create_qp_main(ctx, user_param, connection_index);
if (rc) {
error_message = "Failed to create QP.";
goto error_2;
}
}

memset(&conn_param, 0, sizeof(conn_param));
Expand Down Expand Up @@ -2819,19 +2833,24 @@ int _rdma_cm_client_connection(struct pingpong_context *ctx,
int i, rc;
char error_message[ERROR_MSG_SIZE] = "";

rc = rdma_cm_get_rdma_address(user_param, hints, &ctx->cma_master.rai);
if (rc) {
sprintf(error_message,
"Failed to get RDMA CM address - Error: %s.", gai_strerror(rc));
goto error;
if (!ctx->cma_master.rai) {
rc = rdma_cm_get_rdma_address(user_param, hints, &ctx->cma_master.rai);
if (rc) {
sprintf(error_message,
"Failed to get RDMA CM address - Error: %s.", gai_strerror(rc));
goto error;
}
}

for (i = 0; i < user_param->num_of_qps; i++) {
if (ctx->cma_master.nodes[i].connected) {
continue;
}
rc = rdma_resolve_addr(ctx->cma_master.nodes[i].cma_id,
ctx->cma_master.rai->ai_src_addr,
ctx->cma_master.rai->ai_dst_addr, 2000);
if (rc) {
sprintf(error_message, "Failed to resolve RDMA CM address.");
sprintf(error_message, "Failed to resolve RDMA CM address for cm node %d.", i);
rdma_cm_connect_error(ctx);
goto error;
}
Expand Down Expand Up @@ -2859,6 +2878,14 @@ int rdma_cm_client_connection(struct pingpong_context *ctx,
char error_message[ERROR_MSG_SIZE] = "";

for (i = 0; i < max_retries; i++) {
if (i > 0) {
rc = rdma_cm_allocate_nodes(ctx, user_param, hints, true);
if (rc) {
sprintf(error_message,
"Failed to reallocate RDMA CM nodes during retry.");
goto error;
}
}
rc = _rdma_cm_client_connection(ctx, user_param, hints);
if (!rc) {
return rc;
Expand Down Expand Up @@ -2899,7 +2926,7 @@ int create_rdma_cm_connection(struct pingpong_context *ctx,
goto error;
}

rc = rdma_cm_allocate_nodes(ctx, user_param, &hints);
rc = rdma_cm_allocate_nodes(ctx, user_param, &hints, false);
if (rc) {
error_message = "Failed to allocate RDMA CM nodes.";
goto destroy_event_channel;
Expand All @@ -2921,7 +2948,7 @@ int create_rdma_cm_connection(struct pingpong_context *ctx,
if (rc) {
error_message = "Failed to create RDMA CM connection.";
free(hints.ai_src_addr);
goto destroy_event_channel;
goto destroy_rdma_id;
}

rc = ctx_hand_shake(comm, &my_dest[0], &rem_dest[0]);
Expand All @@ -2938,14 +2965,37 @@ int create_rdma_cm_connection(struct pingpong_context *ctx,

destroy_rdma_id:
if (user_param->machine == CLIENT) {
for (i = 0; i < user_param->num_of_qps; i++)
rdma_destroy_id(ctx->cma_master.nodes[i].cma_id);
for (i = 0; i < user_param->num_of_qps; i++) {
struct cma_node * cm_node = &ctx->cma_master.nodes[i];
if(cm_node && cm_node->cma_id)
{
if (ctx->qp && ctx->qp[i]) {
ibv_destroy_qp(ctx->qp[i]);
ctx->qp[i] = NULL;
}
if (user_param->ah_allocated && ctx->ah && ctx->ah[i]) {
ibv_destroy_ah(ctx->ah[i]);
ctx->ah[i] = NULL;
}
rc = rdma_destroy_id(cm_node->cma_id);
if (rc) {
sprintf(error_message,
"Failed to destroy RDMA CM ID number %d.",
i);
goto error;
}
cm_node->cma_id = NULL;
}
}
}
free(ctx->cma_master.nodes);
free(hints.ai_src_addr);

destroy_event_channel:
rdma_destroy_event_channel(ctx->cma_master.channel);
if (ctx->cma_master.channel) {
rdma_destroy_event_channel(ctx->cma_master.channel);
ctx->cma_master.channel = NULL;
}

error:
return error_handler(error_message);
Expand Down
72 changes: 52 additions & 20 deletions src/perftest_resources.c
Original file line number Diff line number Diff line change
Expand Up @@ -1168,6 +1168,7 @@ int alloc_ctx(struct pingpong_context *ctx,struct perftest_parameters *user_para
ALLOC(user_param->tcompleted, cycles_t, 1);

ALLOC(ctx->qp, struct ibv_qp*, user_param->num_of_qps);
memset(ctx->qp, 0, user_param->num_of_qps * sizeof (struct ibv_qp*));
#ifdef HAVE_IBV_WR_API
ALLOC(ctx->qpx, struct ibv_qp_ex*, user_param->num_of_qps);
#ifdef HAVE_MLX5DV
Expand Down Expand Up @@ -2665,11 +2666,14 @@ xrcd: __attribute__((unused))
#endif

ibv_dealloc_pd(ctx->pd);
ctx->pd = NULL;

comp_channel:
if (user_param->use_event) {
ibv_destroy_comp_channel(ctx->send_channel);
ibv_destroy_comp_channel(ctx->recv_channel);
ctx->send_channel = NULL;
ctx->recv_channel = NULL;
}

return FAILURE;
Expand Down Expand Up @@ -6059,39 +6063,49 @@ int run_iter_fs(struct pingpong_context *ctx, struct perftest_parameters *user_p
*
******************************************************************************/
int rdma_cm_allocate_nodes(struct pingpong_context *ctx,
struct perftest_parameters *user_param, struct rdma_addrinfo *hints)
struct perftest_parameters *user_param, struct rdma_addrinfo *hints, bool retry)
{
int rc = SUCCESS, i = 0;
char *error_message;
char error_message[ERROR_MSG_SIZE] = "";

if (user_param->connection_type == UD
|| user_param->connection_type == RawEth)
hints->ai_port_space = RDMA_PS_UDP;
else
hints->ai_port_space = RDMA_PS_TCP;

ALLOCATE(ctx->cma_master.nodes, struct cma_node, user_param->num_of_qps);
if (!ctx->cma_master.nodes) {
error_message = "Failed to allocate memory for RDMA CM nodes.";
goto error;
if(!retry){
ALLOCATE(ctx->cma_master.nodes, struct cma_node, user_param->num_of_qps);
if (!ctx->cma_master.nodes) {
sprintf(error_message, "Failed to allocate memory for RDMA CM nodes.");
goto error;
}

memset(ctx->cma_master.nodes, 0,
(sizeof *ctx->cma_master.nodes) * user_param->num_of_qps);
}

memset(ctx->cma_master.nodes, 0,
(sizeof *ctx->cma_master.nodes) * user_param->num_of_qps);

for (i = 0; i < user_param->num_of_qps; i++) {
if (ctx->cma_master.nodes[i].cma_id) {
continue;
}
ctx->cma_master.nodes[i].id = i;
if (user_param->machine == CLIENT) {
rc = rdma_create_id(ctx->cma_master.channel,
&ctx->cma_master.nodes[i].cma_id, NULL, hints->ai_port_space);
if (rc) {
error_message = "Failed to create RDMA CM ID.";
sprintf(error_message, "Failed to create RDMA CM ID on cm_node %d.", i);
goto error;
}
}
}

if (user_param->has_source_ip) {
if (retry && hints->ai_src_addr) {
free(hints->ai_src_addr);
hints->ai_src_addr = NULL;
}
if (AF_INET == user_param->ai_family) {
struct sockaddr_in *source_addr;
source_addr = calloc(1, sizeof(*source_addr));
Expand Down Expand Up @@ -6131,7 +6145,7 @@ int rdma_cm_allocate_nodes(struct pingpong_context *ctx,
while (--i >= 0) {
rc = rdma_destroy_id(ctx->cma_master.nodes[i].cma_id);
if (rc) {
error_message = "Failed to destroy RDMA CM ID.";
sprintf(error_message, "Failed to destroy RDMA CM ID for node %d.", i);
break;
}
}
Expand Down Expand Up @@ -6168,20 +6182,38 @@ int rdma_cm_destroy_cma(struct pingpong_context *ctx,

for (i = 0; i < user_param->num_of_qps; i++) {
cm_node = &ctx->cma_master.nodes[i];
rc = rdma_destroy_id(cm_node->cma_id);
if (rc) {
sprintf(error_message,
"Failed to destroy RDMA CM ID number %d.", i);
goto error;
if(cm_node && cm_node->cma_id)
{
if (cm_node->connected) {
continue;
}

if (ctx->qp && ctx->qp[i]) {
ibv_destroy_qp(ctx->qp[i]);
ctx->qp[i] = NULL;
}
if (user_param->ah_allocated && ctx->ah && ctx->ah[i]) {
ibv_destroy_ah(ctx->ah[i]);
ctx->ah[i] = NULL;
}
rc = rdma_destroy_id(cm_node->cma_id);
if (rc) {
sprintf(error_message,
"Failed to destroy RDMA CM ID number %d.",
i);
goto error;
}
cm_node->cma_id = NULL;
}
}

rdma_destroy_event_channel(ctx->cma_master.channel);
if (ctx->cma_master.rai) {
rdma_freeaddrinfo(ctx->cma_master.rai);
int connected_count = 0;
for (i = 0; i < user_param->num_of_qps; i++) {
if (ctx->cma_master.nodes[i].connected) {
connected_count++;
}
}

free(ctx->cma_master.nodes);
ctx->cma_master.connects_left = user_param->num_of_qps - connected_count;
return rc;

error:
Expand Down
3 changes: 2 additions & 1 deletion src/perftest_resources.h
Original file line number Diff line number Diff line change
Expand Up @@ -1027,13 +1027,14 @@ int run_iter_fs(struct pingpong_context *ctx, struct perftest_parameters *user_p
* ctx - Application contexts.
* user_param - User parameters from the parser.
* hints - RDMA address information.
* retry - Whether this is a retry attempt.
*
* Return value:
* rc - On success: SUCCESS(0), on failure: FAILURE(1).
*
*/
int rdma_cm_allocate_nodes(struct pingpong_context *ctx,
struct perftest_parameters *user_param, struct rdma_addrinfo *hints);
struct perftest_parameters *user_param, struct rdma_addrinfo *hints, bool retry);

/* rdma_cm_destroy_qps:
*
Expand Down