Skip to content

Commit bff1fa5

Browse files
authored
Merge pull request open-mpi#1219 from hjelmn/v2.x_cq_count
btl/openib: fix cq resize calculation
2 parents bd6fbe7 + 9af7741 commit bff1fa5

File tree

2 files changed

+40
-26
lines changed

2 files changed

+40
-26
lines changed

opal/mca/btl/openib/btl_openib.c

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -421,13 +421,20 @@ static int openib_btl_prepare(struct mca_btl_openib_module_t* openib_btl)
421421
static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl)
422422
{
423423
uint32_t send_cqes, recv_cqes;
424-
int rc = OPAL_SUCCESS, qp;
424+
int rc = OPAL_SUCCESS;
425425
mca_btl_openib_device_t *device = openib_btl->device;
426+
uint32_t requested[BTL_OPENIB_MAX_CQ];
427+
bool need_resize = false;
426428

427429
opal_mutex_lock(&openib_btl->ib_lock);
430+
431+
for (int cq = 0 ; cq < BTL_OPENIB_MAX_CQ ; ++cq) {
432+
requested[cq] = 0;
433+
}
434+
428435
/* figure out reasonable sizes for completion queues */
429-
for(qp = 0; qp < mca_btl_openib_component.num_qps; qp++) {
430-
if(BTL_OPENIB_QP_TYPE_SRQ(qp)) {
436+
for (int qp = 0 ; qp < mca_btl_openib_component.num_qps ; qp++) {
437+
if (BTL_OPENIB_QP_TYPE_SRQ(qp)) {
431438
send_cqes = mca_btl_openib_component.qp_infos[qp].u.srq_qp.sd_max;
432439
recv_cqes = mca_btl_openib_component.qp_infos[qp].rd_num;
433440
} else {
@@ -436,24 +443,30 @@ static int openib_btl_size_queues(struct mca_btl_openib_module_t* openib_btl)
436443
recv_cqes = send_cqes;
437444
}
438445

439-
opal_mutex_lock(&openib_btl->device->device_lock);
440-
openib_btl->device->cq_size[qp_cq_prio(qp)] += recv_cqes;
441-
openib_btl->device->cq_size[BTL_OPENIB_LP_CQ] += send_cqes;
442-
opal_mutex_unlock(&openib_btl->device->device_lock);
446+
requested[qp_cq_prio(qp)] += recv_cqes;
447+
requested[BTL_OPENIB_LP_CQ] += send_cqes;
443448
}
444449

445-
rc = adjust_cq(device, BTL_OPENIB_HP_CQ);
446-
if (OPAL_SUCCESS != rc) {
447-
goto out;
448-
}
450+
opal_mutex_lock (&openib_btl->device->device_lock);
451+
for (int cq = 0 ; cq < BTL_OPENIB_MAX_CQ ; ++cq) {
452+
if (requested[cq] < mca_btl_openib_component.ib_cq_size[cq]) {
453+
requested[cq] = mca_btl_openib_component.ib_cq_size[cq];
454+
} else if (requested[cq] > openib_btl->device->ib_dev_attr.max_cqe) {
455+
requested[cq] = openib_btl->device->ib_dev_attr.max_cqe;
456+
}
449457

450-
rc = adjust_cq(device, BTL_OPENIB_LP_CQ);
451-
if (OPAL_SUCCESS != rc) {
452-
goto out;
453-
}
458+
if (openib_btl->device->cq_size[cq] < requested[cq]) {
459+
openib_btl->device->cq_size[cq] = requested[cq];
454460

455-
out:
461+
rc = adjust_cq (device, cq);
462+
if (OPAL_SUCCESS != rc) {
463+
break;
464+
}
465+
}
466+
}
467+
opal_mutex_unlock (&openib_btl->device->device_lock);
456468
opal_mutex_unlock(&openib_btl->ib_lock);
469+
457470
return rc;
458471
}
459472

@@ -1081,7 +1094,7 @@ int mca_btl_openib_add_procs(
10811094
}
10821095

10831096
if (nprocs_new) {
1084-
OPAL_THREAD_ADD32(&openib_btl->num_peers, nprocs_new);
1097+
opal_atomic_add_32 (&openib_btl->num_peers, nprocs_new);
10851098

10861099
/* adjust cq sizes given the new procs */
10871100
rc = openib_btl_size_queues (openib_btl);
@@ -1191,7 +1204,7 @@ struct mca_btl_base_endpoint_t *mca_btl_openib_get_ep (struct mca_btl_base_modul
11911204

11921205
/* this is a new process to this openib btl
11931206
* account for this proc if needed */
1194-
OPAL_THREAD_ADD32(&openib_btl->num_peers, 1);
1207+
opal_atomic_add_32 (&openib_btl->num_peers, 1);
11951208
rc = openib_btl_size_queues(openib_btl);
11961209
if (OPAL_SUCCESS != rc) {
11971210
BTL_ERROR(("error creating cqs"));

opal/mca/btl/openib/btl_openib.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,12 @@ BEGIN_C_DECLS
8383
* Infiniband (IB) BTL component.
8484
*/
8585

86+
enum {
87+
BTL_OPENIB_HP_CQ,
88+
BTL_OPENIB_LP_CQ,
89+
BTL_OPENIB_MAX_CQ,
90+
};
91+
8692
typedef enum {
8793
MCA_BTL_OPENIB_TRANSPORT_IB,
8894
MCA_BTL_OPENIB_TRANSPORT_IWARP,
@@ -204,7 +210,7 @@ struct mca_btl_openib_component_t {
204210
uint32_t reg_mru_len; /**< Length of the registration cache most recently used list */
205211
uint32_t use_srq; /**< Use the Shared Receive Queue (SRQ mode) */
206212

207-
uint32_t ib_cq_size[2]; /**< Max outstanding CQE on the CQ */
213+
uint32_t ib_cq_size[BTL_OPENIB_MAX_CQ]; /**< Max outstanding CQE on the CQ */
208214

209215
int ib_max_inline_data; /**< Max size of inline data */
210216
unsigned int ib_pkey_val;
@@ -374,8 +380,8 @@ typedef struct mca_btl_openib_device_t {
374380
#endif
375381
struct ibv_device_attr ib_dev_attr;
376382
struct ibv_pd *ib_pd;
377-
struct ibv_cq *ib_cq[2];
378-
uint32_t cq_size[2];
383+
struct ibv_cq *ib_cq[BTL_OPENIB_MAX_CQ];
384+
uint32_t cq_size[BTL_OPENIB_MAX_CQ];
379385
mca_mpool_base_module_t *mpool;
380386
/* MTU for this device */
381387
uint32_t mtu;
@@ -848,11 +854,6 @@ extern void mca_btl_openib_frag_progress_pending_put_get(
848854
*/
849855
void mca_btl_openib_show_init_error(const char *file, int line,
850856
const char *func, const char *dev);
851-
852-
#define BTL_OPENIB_HP_CQ 0
853-
#define BTL_OPENIB_LP_CQ 1
854-
855-
856857
/**
857858
* Post to Shared Receive Queue with certain priority
858859
*

0 commit comments

Comments
 (0)