diff --git a/src/perftest_communication.c b/src/perftest_communication.c index 8c22824f..8826757c 100755 --- a/src/perftest_communication.c +++ b/src/perftest_communication.c @@ -2416,7 +2416,7 @@ int rdma_cm_route_handler(struct pingpong_context *ctx, connection_index = ctx->cma_master.connection_index; // Initialization of client contexts in case of first connection: - if (connection_index == 0) { + if ((user_param->use_event && !ctx->send_channel)|| !ctx->pd) { rc = ctx_init(ctx, user_param); if (rc) { error_message = "Failed to initialize RDMA contexts."; @@ -2425,10 +2425,15 @@ int rdma_cm_route_handler(struct pingpong_context *ctx, } ctx->cm_id = cma_id; - rc = create_qp_main(ctx, user_param, connection_index); - if (rc) { - error_message = "Failed to create QP."; - goto error; + + /* Only create qp when it's not available + * (i.e. avoid recreating qp during retry) */ + if(!ctx->qp[connection_index]) { + rc = create_qp_main(ctx, user_param, connection_index); + if (rc) { + error_message = "Failed to create QP."; + goto error; + } } memset(&conn_param, 0, sizeof conn_param); @@ -2446,6 +2451,10 @@ int rdma_cm_route_handler(struct pingpong_context *ctx, rc = rdma_connect(cma_id, &conn_param); if (rc) { error_message = "Failed to connect through RDMA CM."; + /* IB core will destroy cm id if failed. + * Set cma_id to NULL to avoid double free.*/ + ctx->cm_id = NULL; + ctx->cma_master.nodes[connection_index].cma_id = NULL; goto error; } @@ -2484,7 +2493,7 @@ int rdma_cm_connection_request_handler(struct pingpong_context *ctx, ctx->context = cma_id->verbs; // Initialization of server contexts in case of first connection: - if (connection_index == 0) { + if ((user_param->use_event && !ctx->send_channel)|| !ctx->pd) { rc = ctx_init(ctx, user_param); if (rc) { error_message = "Failed to initialize RDMA contexts."; @@ -2493,10 +2502,15 @@ int rdma_cm_connection_request_handler(struct pingpong_context *ctx, } ctx->cm_id = cm_node->cma_id; - rc = create_qp_main(ctx, user_param, connection_index); - if (rc) { - error_message = "Failed to create QP."; - goto error_2; + + /* Only create qp when it's not available + * (i.e. avoid recreating qp during retry) */ + if(!ctx->qp[connection_index]) { + rc = create_qp_main(ctx, user_param, connection_index); + if (rc) { + error_message = "Failed to create QP."; + goto error_2; + } } memset(&conn_param, 0, sizeof(conn_param)); @@ -2819,19 +2833,24 @@ int _rdma_cm_client_connection(struct pingpong_context *ctx, int i, rc; char error_message[ERROR_MSG_SIZE] = ""; - rc = rdma_cm_get_rdma_address(user_param, hints, &ctx->cma_master.rai); - if (rc) { - sprintf(error_message, - "Failed to get RDMA CM address - Error: %s.", gai_strerror(rc)); - goto error; + if (!ctx->cma_master.rai) { + rc = rdma_cm_get_rdma_address(user_param, hints, &ctx->cma_master.rai); + if (rc) { + sprintf(error_message, + "Failed to get RDMA CM address - Error: %s.", gai_strerror(rc)); + goto error; + } } for (i = 0; i < user_param->num_of_qps; i++) { + if (ctx->cma_master.nodes[i].connected) { + continue; + } rc = rdma_resolve_addr(ctx->cma_master.nodes[i].cma_id, ctx->cma_master.rai->ai_src_addr, ctx->cma_master.rai->ai_dst_addr, 2000); if (rc) { - sprintf(error_message, "Failed to resolve RDMA CM address."); + sprintf(error_message, "Failed to resolve RDMA CM address for cm node %d.", i); rdma_cm_connect_error(ctx); goto error; } @@ -2859,6 +2878,14 @@ int rdma_cm_client_connection(struct pingpong_context *ctx, char error_message[ERROR_MSG_SIZE] = ""; for (i = 0; i < max_retries; i++) { + if (i > 0) { + rc = rdma_cm_allocate_nodes(ctx, user_param, hints, true); + if (rc) { + sprintf(error_message, + "Failed to reallocate RDMA CM nodes during retry."); + goto error; + } + } rc = _rdma_cm_client_connection(ctx, user_param, hints); if (!rc) { return rc; @@ -2899,7 +2926,7 @@ int create_rdma_cm_connection(struct pingpong_context *ctx, goto error; } - rc = rdma_cm_allocate_nodes(ctx, user_param, &hints); + rc = rdma_cm_allocate_nodes(ctx, user_param, &hints, false); if (rc) { error_message = "Failed to allocate RDMA CM nodes."; goto destroy_event_channel; @@ -2921,7 +2948,7 @@ int create_rdma_cm_connection(struct pingpong_context *ctx, if (rc) { error_message = "Failed to create RDMA CM connection."; free(hints.ai_src_addr); - goto destroy_event_channel; + goto destroy_rdma_id; } rc = ctx_hand_shake(comm, &my_dest[0], &rem_dest[0]); @@ -2938,14 +2965,37 @@ int create_rdma_cm_connection(struct pingpong_context *ctx, destroy_rdma_id: if (user_param->machine == CLIENT) { - for (i = 0; i < user_param->num_of_qps; i++) - rdma_destroy_id(ctx->cma_master.nodes[i].cma_id); + for (i = 0; i < user_param->num_of_qps; i++) { + struct cma_node * cm_node = &ctx->cma_master.nodes[i]; + if(cm_node && cm_node->cma_id) + { + if (ctx->qp && ctx->qp[i]) { + ibv_destroy_qp(ctx->qp[i]); + ctx->qp[i] = NULL; + } + if (user_param->ah_allocated && ctx->ah && ctx->ah[i]) { + ibv_destroy_ah(ctx->ah[i]); + ctx->ah[i] = NULL; + } + rc = rdma_destroy_id(cm_node->cma_id); + if (rc) { + sprintf(error_message, + "Failed to destroy RDMA CM ID number %d.", + i); + goto error; + } + cm_node->cma_id = NULL; + } + } } free(ctx->cma_master.nodes); free(hints.ai_src_addr); destroy_event_channel: - rdma_destroy_event_channel(ctx->cma_master.channel); + if (ctx->cma_master.channel) { + rdma_destroy_event_channel(ctx->cma_master.channel); + ctx->cma_master.channel = NULL; + } error: return error_handler(error_message); diff --git a/src/perftest_resources.c b/src/perftest_resources.c index c10933dc..3cace096 100755 --- a/src/perftest_resources.c +++ b/src/perftest_resources.c @@ -1168,6 +1168,7 @@ int alloc_ctx(struct pingpong_context *ctx,struct perftest_parameters *user_para ALLOC(user_param->tcompleted, cycles_t, 1); ALLOC(ctx->qp, struct ibv_qp*, user_param->num_of_qps); + memset(ctx->qp, 0, user_param->num_of_qps * sizeof (struct ibv_qp*)); #ifdef HAVE_IBV_WR_API ALLOC(ctx->qpx, struct ibv_qp_ex*, user_param->num_of_qps); #ifdef HAVE_MLX5DV @@ -2665,11 +2666,14 @@ xrcd: __attribute__((unused)) #endif ibv_dealloc_pd(ctx->pd); + ctx->pd = NULL; comp_channel: if (user_param->use_event) { ibv_destroy_comp_channel(ctx->send_channel); ibv_destroy_comp_channel(ctx->recv_channel); + ctx->send_channel = NULL; + ctx->recv_channel = NULL; } return FAILURE; @@ -6059,10 +6063,10 @@ int run_iter_fs(struct pingpong_context *ctx, struct perftest_parameters *user_p * ******************************************************************************/ int rdma_cm_allocate_nodes(struct pingpong_context *ctx, - struct perftest_parameters *user_param, struct rdma_addrinfo *hints) + struct perftest_parameters *user_param, struct rdma_addrinfo *hints, bool retry) { int rc = SUCCESS, i = 0; - char *error_message; + char error_message[ERROR_MSG_SIZE] = ""; if (user_param->connection_type == UD || user_param->connection_type == RawEth) @@ -6070,28 +6074,38 @@ int rdma_cm_allocate_nodes(struct pingpong_context *ctx, else hints->ai_port_space = RDMA_PS_TCP; - ALLOCATE(ctx->cma_master.nodes, struct cma_node, user_param->num_of_qps); - if (!ctx->cma_master.nodes) { - error_message = "Failed to allocate memory for RDMA CM nodes."; - goto error; + if(!retry){ + ALLOCATE(ctx->cma_master.nodes, struct cma_node, user_param->num_of_qps); + if (!ctx->cma_master.nodes) { + sprintf(error_message, "Failed to allocate memory for RDMA CM nodes."); + goto error; + } + + memset(ctx->cma_master.nodes, 0, + (sizeof *ctx->cma_master.nodes) * user_param->num_of_qps); } - memset(ctx->cma_master.nodes, 0, - (sizeof *ctx->cma_master.nodes) * user_param->num_of_qps); for (i = 0; i < user_param->num_of_qps; i++) { + if (ctx->cma_master.nodes[i].cma_id) { + continue; + } ctx->cma_master.nodes[i].id = i; if (user_param->machine == CLIENT) { rc = rdma_create_id(ctx->cma_master.channel, &ctx->cma_master.nodes[i].cma_id, NULL, hints->ai_port_space); if (rc) { - error_message = "Failed to create RDMA CM ID."; + sprintf(error_message, "Failed to create RDMA CM ID on cm_node %d.", i); goto error; } } } if (user_param->has_source_ip) { + if (retry && hints->ai_src_addr) { + free(hints->ai_src_addr); + hints->ai_src_addr = NULL; + } if (AF_INET == user_param->ai_family) { struct sockaddr_in *source_addr; source_addr = calloc(1, sizeof(*source_addr)); @@ -6131,7 +6145,7 @@ int rdma_cm_allocate_nodes(struct pingpong_context *ctx, while (--i >= 0) { rc = rdma_destroy_id(ctx->cma_master.nodes[i].cma_id); if (rc) { - error_message = "Failed to destroy RDMA CM ID."; + sprintf(error_message, "Failed to destroy RDMA CM ID for node %d.", i); break; } } @@ -6168,20 +6182,38 @@ int rdma_cm_destroy_cma(struct pingpong_context *ctx, for (i = 0; i < user_param->num_of_qps; i++) { cm_node = &ctx->cma_master.nodes[i]; - rc = rdma_destroy_id(cm_node->cma_id); - if (rc) { - sprintf(error_message, - "Failed to destroy RDMA CM ID number %d.", i); - goto error; + if(cm_node && cm_node->cma_id) + { + if (cm_node->connected) { + continue; + } + + if (ctx->qp && ctx->qp[i]) { + ibv_destroy_qp(ctx->qp[i]); + ctx->qp[i] = NULL; + } + if (user_param->ah_allocated && ctx->ah && ctx->ah[i]) { + ibv_destroy_ah(ctx->ah[i]); + ctx->ah[i] = NULL; + } + rc = rdma_destroy_id(cm_node->cma_id); + if (rc) { + sprintf(error_message, + "Failed to destroy RDMA CM ID number %d.", + i); + goto error; + } + cm_node->cma_id = NULL; } } - rdma_destroy_event_channel(ctx->cma_master.channel); - if (ctx->cma_master.rai) { - rdma_freeaddrinfo(ctx->cma_master.rai); + int connected_count = 0; + for (i = 0; i < user_param->num_of_qps; i++) { + if (ctx->cma_master.nodes[i].connected) { + connected_count++; + } } - - free(ctx->cma_master.nodes); + ctx->cma_master.connects_left = user_param->num_of_qps - connected_count; return rc; error: diff --git a/src/perftest_resources.h b/src/perftest_resources.h index 3bdc7952..aaf90336 100644 --- a/src/perftest_resources.h +++ b/src/perftest_resources.h @@ -1027,13 +1027,14 @@ int run_iter_fs(struct pingpong_context *ctx, struct perftest_parameters *user_p * ctx - Application contexts. * user_param - User parameters from the parser. * hints - RDMA address information. +* retry - Whether this is a retry attempt. * * Return value: * rc - On success: SUCCESS(0), on failure: FAILURE(1). * */ int rdma_cm_allocate_nodes(struct pingpong_context *ctx, - struct perftest_parameters *user_param, struct rdma_addrinfo *hints); + struct perftest_parameters *user_param, struct rdma_addrinfo *hints, bool retry); /* rdma_cm_destroy_qps: *