Skip to content

Commit 5133a6b

Browse files
UCT/GDA: Collapsed CQ
1 parent 5fe5daf commit 5133a6b

File tree

3 files changed

+42
-46
lines changed

3 files changed

+42
-46
lines changed

src/uct/ib/mlx5/gdaki/gdaki.c

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,7 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
8383
return status;
8484
}
8585

86-
init_attr.cq_len[UCT_IB_DIR_TX] = iface->super.super.config.tx_qp_len *
87-
UCT_IB_MLX5_MAX_BB;
86+
init_attr.cq_len[UCT_IB_DIR_TX] = 1;
8887
uct_ib_mlx5_cq_calc_sizes(&iface->super.super.super, UCT_IB_DIR_TX,
8988
&init_attr, 0, &cq_attr);
9089
uct_rc_iface_fill_attr(&iface->super.super, &qp_attr.super,
@@ -178,18 +177,11 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
178177
qp_attr.umem_offset);
179178
dev_ep.sq_wqe_num = qp_attr.max_tx;
180179
dev_ep.sq_dbrec = &self->ep_gpu->qp_dbrec[MLX5_SND_DBR];
180+
dev_ep.sq_fc_mask = (qp_attr.max_tx >> 1) - 1;
181181
dev_ep.cqe_daddr = UCS_PTR_BYTE_OFFSET(self->ep_gpu, cq_attr.umem_offset);
182182
dev_ep.cqe_num = cq_attr.cq_size;
183183
dev_ep.sq_db = self->sq_db;
184184

185-
status = UCT_CUDADRV_FUNC_LOG_ERR(
186-
cuMemsetD8((CUdeviceptr)UCS_PTR_BYTE_OFFSET(self->ep_gpu,
187-
cq_attr.umem_offset),
188-
0xff, cq_attr.umem_len));
189-
if (status != UCS_OK) {
190-
goto err_dev_ep;
191-
}
192-
193185
status = UCT_CUDADRV_FUNC_LOG_ERR(
194186
cuMemcpyHtoD((CUdeviceptr)self->ep_gpu, &dev_ep, sizeof(dev_ep)));
195187
if (status != UCS_OK) {

src/uct/ib/mlx5/gdaki/gdaki.cuh

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_max_alloc_wqe_base(
100100
when processing a new completion */
101101
uint64_t pi = doca_gpu_dev_verbs_atomic_read<uint64_t,
102102
DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(&ep->sq_wqe_pi);
103-
return pi + ep->sq_wqe_num - count;
103+
return pi + ep->sq_wqe_num + 1 - count;
104104
}
105105

106106
UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_reserv_wqe_thread(
@@ -231,6 +231,12 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
231231
&ep->sq_lock);
232232
}
233233

234+
UCS_F_DEVICE bool
235+
uct_rc_mlx5_gda_fc(const uct_rc_gdaki_dev_ep_t *ep, uint16_t wqe_idx)
236+
{
237+
return !(wqe_idx & ep->sq_fc_mask);
238+
}
239+
234240
template<ucs_device_level_t level>
235241
UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_single(
236242
uct_rc_gdaki_dev_ep_t *ep, const uct_device_mem_element_t *tl_mem_elem,
@@ -245,18 +251,16 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_single(
245251
uint64_t wqe_idx;
246252
unsigned lane_id;
247253
unsigned num_lanes;
248-
uint32_t fc;
249254

250255
uct_rc_mlx5_gda_exec_init<level>(lane_id, num_lanes);
251256
uct_rc_mlx5_gda_reserv_wqe<level>(ep, 1, lane_id, wqe_base);
252257
if (wqe_base == UCT_RC_GDA_RESV_WQE_NO_RESOURCE) {
253258
return UCS_ERR_NO_RESOURCE;
254259
}
255260

256-
fc = doca_gpu_dev_verbs_wqe_idx_inc_mask(ep->sq_wqe_pi, ep->sq_wqe_num / 2);
257261
wqe_idx = wqe_base & 0xffff;
258262
if (lane_id == 0) {
259-
if ((comp != nullptr) || (wqe_idx == fc)) {
263+
if ((comp != nullptr) || uct_rc_mlx5_gda_fc(ep, wqe_idx)) {
260264
cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE;
261265
if (comp != nullptr) {
262266
comp->wqe_idx = wqe_base;
@@ -333,7 +337,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(
333337
unsigned cflag;
334338
unsigned lane_id;
335339
unsigned num_lanes;
336-
uint32_t fc;
337340
uint64_t wqe_base;
338341
size_t length;
339342
void *address;
@@ -357,7 +360,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(
357360
return UCS_ERR_NO_RESOURCE;
358361
}
359362

360-
fc = doca_gpu_dev_verbs_wqe_idx_inc_mask(ep->sq_wqe_pi, ep->sq_wqe_num / 2);
361363
wqe_idx = doca_gpu_dev_verbs_wqe_idx_inc_mask(wqe_base, lane_id);
362364
for (uint32_t i = lane_id; i < count; i += num_lanes) {
363365
if (i == counter_index) {
@@ -379,7 +381,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(
379381

380382
cflag = 0;
381383
if (((comp != nullptr) && (i == count - 1)) ||
382-
((comp == nullptr) && (wqe_idx == fc))) {
384+
((comp == nullptr) && uct_rc_mlx5_gda_fc(ep, wqe_idx))) {
383385
cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE;
384386
if (comp != nullptr) {
385387
comp->wqe_idx = wqe_base;
@@ -426,7 +428,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
426428
unsigned lane_id;
427429
unsigned num_lanes;
428430
unsigned cflag;
429-
uint32_t fc;
430431
uint64_t wqe_base;
431432
size_t length;
432433
void *address;
@@ -451,7 +452,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
451452
return UCS_ERR_NO_RESOURCE;
452453
}
453454

454-
fc = doca_gpu_dev_verbs_wqe_idx_inc_mask(ep->sq_wqe_pi, ep->sq_wqe_num / 2);
455455
wqe_idx = doca_gpu_dev_verbs_wqe_idx_inc_mask(wqe_base, lane_id);
456456
for (uint32_t i = lane_id; i < count; i += num_lanes) {
457457
if (i == mem_list_count) {
@@ -475,7 +475,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
475475

476476
cflag = 0;
477477
if (((comp != nullptr) && (i == count - 1)) ||
478-
((comp == nullptr) && (wqe_idx == fc))) {
478+
((comp == nullptr) && uct_rc_mlx5_gda_fc(ep, wqe_idx))) {
479479
cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE;
480480
if (comp != nullptr) {
481481
comp->wqe_idx = wqe_base;
@@ -533,52 +533,55 @@ uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len)
533533
}
534534
}
535535

536+
UCS_F_DEVICE int uct_rc_mlx5_gda_trylock(int *lock) {
537+
if (atomicCAS(lock, 0, 1) == 0) {
538+
doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>();
539+
return 1;
540+
}
541+
542+
return 0;
543+
}
544+
545+
UCS_F_DEVICE void uct_rc_mlx5_gda_unlock(int *lock) {
546+
cuda::atomic_ref<int, cuda::thread_scope_device> lock_aref(*lock);
547+
lock_aref.store(0, cuda::std::memory_order_release);
548+
}
549+
536550
UCS_F_DEVICE void uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep)
537551
{
538-
void *cqe = ep->cqe_daddr;
539-
size_t cqe_num = ep->cqe_num;
540-
uint64_t cqe_idx = ep->cqe_ci;
541-
const size_t cqe_sz = DOCA_GPUNETIO_VERBS_CQE_SIZE;
542-
uint32_t idx = cqe_idx & (cqe_num - 1);
543-
void *curr_cqe = (uint8_t*)cqe + idx * cqe_sz;
544-
auto *cqe64 = reinterpret_cast<mlx5_cqe64*>(curr_cqe);
545-
uint8_t op_owner;
546-
547-
op_owner = READ_ONCE(cqe64->op_own);
548-
if ((op_owner & MLX5_CQE_OWNER_MASK) ^ !!(cqe_idx & cqe_num)) {
552+
if (!uct_rc_mlx5_gda_trylock(&ep->cq_lock)) {
549553
return;
550554
}
551555

552-
cuda::atomic_ref<uint64_t, cuda::thread_scope_device> ref(ep->cqe_ci);
553-
if (!ref.compare_exchange_strong(cqe_idx, cqe_idx + 1,
554-
cuda::std::memory_order_relaxed)) {
555-
return;
556-
}
556+
void *cqe = ep->cqe_daddr;
557+
auto *cqe64 = reinterpret_cast<mlx5_cqe64*>(cqe);
557558

558-
uint8_t opcode = op_owner >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
559+
uint8_t opcode = cqe64->op_own >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
559560
uint16_t wqe_cnt = uct_rc_mlx5_gda_bswap16(cqe64->wqe_counter);
560561
uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1);
561562

562-
cuda::atomic_ref<uint64_t, cuda::thread_scope_device> pi_ref(ep->sq_wqe_pi);
563563
uint64_t sq_wqe_pi = ep->sq_wqe_pi;
564-
sq_wqe_pi = ((wqe_cnt - sq_wqe_pi) & 0xffff) + sq_wqe_pi + 1;
564+
sq_wqe_pi = ((wqe_cnt - sq_wqe_pi) & 0xffff) + sq_wqe_pi;
565565

566-
if (opcode == MLX5_CQE_REQ) {
567-
pi_ref.fetch_max(sq_wqe_pi);
566+
if (opcode != MLX5_CQE_REQ_ERR) {
567+
ep->sq_wqe_pi = sq_wqe_pi;
568+
uct_rc_mlx5_gda_unlock(&ep->cq_lock);
568569
return;
569570
}
570571

571572
auto err_cqe = reinterpret_cast<mlx5_err_cqe_ex*>(cqe64);
572573
auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx);
573-
ucs_device_error("CQE[%d] with syndrome:%x vendor:%x hw:%x "
574+
ucs_device_error("CQE with syndrome:%x vendor:%x hw:%x "
574575
"wqe_idx:0x%x qp:0x%x",
575-
idx, err_cqe->syndrome, err_cqe->vendor_err_synd,
576+
err_cqe->syndrome, err_cqe->vendor_err_synd,
576577
err_cqe->hw_err_synd, wqe_idx,
577578
doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) &
578579
0xffffff);
579580
uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64);
580581
uct_rc_mlx5_gda_qedump("CQE", cqe64, 64);
581-
pi_ref.fetch_max(sq_wqe_pi | UCT_RC_GDA_WQE_ERR);
582+
ep->sq_wqe_pi = sq_wqe_pi | UCT_RC_GDA_WQE_ERR;
583+
584+
uct_rc_mlx5_gda_unlock(&ep->cq_lock);
582585
}
583586

584587
template<ucs_device_level_t level>
@@ -604,7 +607,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_check_completion(
604607
uct_rc_gda_completion_t *comp = &tl_comp->rc_gda;
605608
uint64_t sq_wqe_pi = ep->sq_wqe_pi;
606609

607-
if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) <= comp->wqe_idx) {
610+
if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) < comp->wqe_idx) {
608611
return UCS_INPROGRESS;
609612
}
610613

src/uct/ib/mlx5/gdaki/gdaki_dev.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ typedef struct {
2020
uint64_t sq_rsvd_index;
2121
uint64_t sq_ready_index;
2222
uint64_t sq_wqe_pi;
23-
uint64_t cqe_ci;
2423
int sq_lock;
24+
int cq_lock;
2525

2626
uint8_t *sq_wqe_daddr;
2727
uint32_t *sq_dbrec;
@@ -30,6 +30,7 @@ typedef struct {
3030
uint32_t cqe_num;
3131
uint16_t sq_wqe_num;
3232
uint32_t sq_num;
33+
uint16_t sq_fc_mask;
3334
} uct_rc_gdaki_dev_ep_t;
3435

3536

0 commit comments

Comments
 (0)