@@ -100,7 +100,7 @@ UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_max_alloc_wqe_base(
100100 when processing a new completion */
101101 uint64_t pi = doca_gpu_dev_verbs_atomic_read<uint64_t ,
102102 DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(&ep->sq_wqe_pi );
103- return pi + ep->sq_wqe_num - count;
103+ return pi + ep->sq_wqe_num + 1 - count;
104104}
105105
106106UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_reserv_wqe_thread (
@@ -231,6 +231,12 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
231231 &ep->sq_lock );
232232}
233233
/*
 * Tell whether the WQE at index wqe_idx is selected by the endpoint's
 * sq_fc_mask.
 *
 * @param ep       Device endpoint; only ep->sq_fc_mask is read.
 * @param wqe_idx  16-bit WQE index to test.
 *
 * @return true when wqe_idx has none of the bits of ep->sq_fc_mask set,
 *         false otherwise.
 *
 * The callers visible in this change use a true result to set
 * DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE on the WQE being posted.
 * NOTE(review): presumably sq_fc_mask is (power-of-two period - 1), so that
 * one WQE per period is selected - confirm where sq_fc_mask is initialized.
 */
234+ UCS_F_DEVICE bool
235+ uct_rc_mlx5_gda_fc (const uct_rc_gdaki_dev_ep_t *ep, uint16_t wqe_idx)
236+ {
237+ return !(wqe_idx & ep->sq_fc_mask );
238+ }
239+
234240template <ucs_device_level_t level>
235241UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_single (
236242 uct_rc_gdaki_dev_ep_t *ep, const uct_device_mem_element_t *tl_mem_elem,
@@ -245,18 +251,16 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_single(
245251 uint64_t wqe_idx;
246252 unsigned lane_id;
247253 unsigned num_lanes;
248- uint32_t fc;
249254
250255 uct_rc_mlx5_gda_exec_init<level>(lane_id, num_lanes);
251256 uct_rc_mlx5_gda_reserv_wqe<level>(ep, 1 , lane_id, wqe_base);
252257 if (wqe_base == UCT_RC_GDA_RESV_WQE_NO_RESOURCE) {
253258 return UCS_ERR_NO_RESOURCE;
254259 }
255260
256- fc = doca_gpu_dev_verbs_wqe_idx_inc_mask (ep->sq_wqe_pi , ep->sq_wqe_num / 2 );
257261 wqe_idx = wqe_base & 0xffff ;
258262 if (lane_id == 0 ) {
259- if ((comp != nullptr ) || (wqe_idx == fc )) {
263+ if ((comp != nullptr ) || uct_rc_mlx5_gda_fc (ep, wqe_idx )) {
260264 cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE;
261265 if (comp != nullptr ) {
262266 comp->wqe_idx = wqe_base;
@@ -333,7 +337,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(
333337 unsigned cflag;
334338 unsigned lane_id;
335339 unsigned num_lanes;
336- uint32_t fc;
337340 uint64_t wqe_base;
338341 size_t length;
339342 void *address;
@@ -357,7 +360,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(
357360 return UCS_ERR_NO_RESOURCE;
358361 }
359362
360- fc = doca_gpu_dev_verbs_wqe_idx_inc_mask (ep->sq_wqe_pi , ep->sq_wqe_num / 2 );
361363 wqe_idx = doca_gpu_dev_verbs_wqe_idx_inc_mask (wqe_base, lane_id);
362364 for (uint32_t i = lane_id; i < count; i += num_lanes) {
363365 if (i == counter_index) {
@@ -379,7 +381,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi(
379381
380382 cflag = 0 ;
381383 if (((comp != nullptr ) && (i == count - 1 )) ||
382- ((comp == nullptr ) && (wqe_idx == fc ))) {
384+ ((comp == nullptr ) && uct_rc_mlx5_gda_fc (ep, wqe_idx ))) {
383385 cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE;
384386 if (comp != nullptr ) {
385387 comp->wqe_idx = wqe_base;
@@ -426,7 +428,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
426428 unsigned lane_id;
427429 unsigned num_lanes;
428430 unsigned cflag;
429- uint32_t fc;
430431 uint64_t wqe_base;
431432 size_t length;
432433 void *address;
@@ -451,7 +452,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
451452 return UCS_ERR_NO_RESOURCE;
452453 }
453454
454- fc = doca_gpu_dev_verbs_wqe_idx_inc_mask (ep->sq_wqe_pi , ep->sq_wqe_num / 2 );
455455 wqe_idx = doca_gpu_dev_verbs_wqe_idx_inc_mask (wqe_base, lane_id);
456456 for (uint32_t i = lane_id; i < count; i += num_lanes) {
457457 if (i == mem_list_count) {
@@ -475,7 +475,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
475475
476476 cflag = 0 ;
477477 if (((comp != nullptr ) && (i == count - 1 )) ||
478- ((comp == nullptr ) && (wqe_idx == fc ))) {
478+ ((comp == nullptr ) && uct_rc_mlx5_gda_fc (ep, wqe_idx ))) {
479479 cflag = DOCA_GPUNETIO_MLX5_WQE_CTRL_CQ_UPDATE;
480480 if (comp != nullptr ) {
481481 comp->wqe_idx = wqe_base;
@@ -533,52 +533,55 @@ uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len)
533533 }
534534}
535535
/*
 * Try once to take the lock word at *lock, without spinning.
 *
 * @param lock  Lock word; 0 is treated as unlocked and 1 as locked.
 *
 * @return 1 if this call changed *lock from 0 to 1, 0 if the compare-and-swap
 *         observed a non-zero value.
 */
536+ UCS_F_DEVICE int uct_rc_mlx5_gda_trylock (int *lock) {
537+ if (atomicCAS (lock, 0 , 1 ) == 0 ) {
/* The acquire fence is issued only on the successful path, after the CAS. */
538+ doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>();
539+ return 1 ;
540+ }
541+
542+ return 0 ;
543+ }
544+
/*
 * Release the lock word at *lock by storing 0 with release ordering through
 * a device-scope atomic reference.
 *
 * @param lock  Lock word to clear. The function does not check that the
 *              caller actually holds the lock.
 */
545+ UCS_F_DEVICE void uct_rc_mlx5_gda_unlock (int *lock) {
546+ cuda::atomic_ref<int , cuda::thread_scope_device> lock_aref (*lock);
547+ lock_aref.store (0 , cuda::std::memory_order_release);
548+ }
549+
/*
 * Process a completion for the endpoint and update ep->sq_wqe_pi.
 *
 * This span is a diff: lines marked '-' are the removed implementation
 * (CQE selected by ep->cqe_ci, ownership-bit check, compare-exchange on
 * cqe_ci, fetch_max on sq_wqe_pi); lines marked '+' are the replacement,
 * which does the whole update while holding ep->cq_lock.
 *
 * @param ep  Device endpoint whose completion state is processed.
 *
 * In the replacement, if the lock cannot be taken the function returns
 * without doing anything. On an error CQE (opcode MLX5_CQE_REQ_ERR) the
 * CQE and WQE are dumped and UCT_RC_GDA_WQE_ERR is OR-ed into the stored
 * sq_wqe_pi.
 */
536550UCS_F_DEVICE void uct_rc_mlx5_gda_progress_thread (uct_rc_gdaki_dev_ep_t *ep)
537551{
538- void *cqe = ep->cqe_daddr ;
539- size_t cqe_num = ep->cqe_num ;
540- uint64_t cqe_idx = ep->cqe_ci ;
541- const size_t cqe_sz = DOCA_GPUNETIO_VERBS_CQE_SIZE;
542- uint32_t idx = cqe_idx & (cqe_num - 1 );
543- void *curr_cqe = (uint8_t *)cqe + idx * cqe_sz;
544- auto *cqe64 = reinterpret_cast <mlx5_cqe64*>(curr_cqe);
545- uint8_t op_owner;
546-
547- op_owner = READ_ONCE (cqe64->op_own );
548- if ((op_owner & MLX5_CQE_OWNER_MASK) ^ !!(cqe_idx & cqe_num)) {
/* Replacement guard: return immediately when cq_lock is not acquired. */
552+ if (!uct_rc_mlx5_gda_trylock (&ep->cq_lock )) {
549553 return ;
550554 }
551555
552- cuda::atomic_ref<uint64_t , cuda::thread_scope_device> ref (ep->cqe_ci );
553- if (!ref.compare_exchange_strong (cqe_idx, cqe_idx + 1 ,
554- cuda::std::memory_order_relaxed)) {
555- return ;
556- }
/* The replacement always reads the CQE located at ep->cqe_daddr itself. */
/* NOTE(review): the added code reads op_own without READ_ONCE and without
 * the ownership-bit check of the removed code - presumably the CQ is set up
 * so that this single CQE always holds the latest completion; confirm. */
556+ void *cqe = ep->cqe_daddr ;
557+ auto *cqe64 = reinterpret_cast <mlx5_cqe64*>(cqe);
557558

558- uint8_t opcode = op_owner >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
559+ uint8_t opcode = cqe64-> op_own >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
559560 uint16_t wqe_cnt = uct_rc_mlx5_gda_bswap16 (cqe64->wqe_counter );
560561 uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1 );
561562

562- cuda::atomic_ref<uint64_t , cuda::thread_scope_device> pi_ref (ep->sq_wqe_pi );
563563 uint64_t sq_wqe_pi = ep->sq_wqe_pi ;
/* Add the 16-bit distance from the stored counter to wqe_cnt back onto the
 * stored 64-bit counter. The removed line additionally added 1; the added
 * line does not. */
564- sq_wqe_pi = ((wqe_cnt - sq_wqe_pi) & 0xffff ) + sq_wqe_pi + 1 ;
564+ sq_wqe_pi = ((wqe_cnt - sq_wqe_pi) & 0xffff ) + sq_wqe_pi;
565565

/* Any opcode other than MLX5_CQE_REQ_ERR: store the counter, unlock, done. */
566- if (opcode == MLX5_CQE_REQ) {
567- pi_ref.fetch_max (sq_wqe_pi);
566+ if (opcode != MLX5_CQE_REQ_ERR) {
567+ ep->sq_wqe_pi = sq_wqe_pi;
568+ uct_rc_mlx5_gda_unlock (&ep->cq_lock );
568569 return ;
569570 }
570571

/* Error path: log the syndromes, then dump 64 bytes of the WQE and CQE. */
571572 auto err_cqe = reinterpret_cast <mlx5_err_cqe_ex*>(cqe64);
572573 auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr (ep, wqe_idx);
573- ucs_device_error (" CQE[%d] with syndrome:%x vendor:%x hw:%x "
574+ ucs_device_error (" CQE with syndrome:%x vendor:%x hw:%x "
574575 " wqe_idx:0x%x qp:0x%x" ,
575- idx, err_cqe->syndrome , err_cqe->vendor_err_synd ,
576+ err_cqe->syndrome , err_cqe->vendor_err_synd ,
576577 err_cqe->hw_err_synd , wqe_idx,
577578 doca_gpu_dev_verbs_bswap32 (err_cqe->s_wqe_opcode_qpn ) &
578579 0xffffff );
579580 uct_rc_mlx5_gda_qedump (" WQE" , wqe_ptr, 64 );
580581 uct_rc_mlx5_gda_qedump (" CQE" , cqe64, 64 );
/* Store the counter with the error flag set, then release cq_lock. */
581- pi_ref.fetch_max (sq_wqe_pi | UCT_RC_GDA_WQE_ERR);
582+ ep->sq_wqe_pi = sq_wqe_pi | UCT_RC_GDA_WQE_ERR;
583+
584+ uct_rc_mlx5_gda_unlock (&ep->cq_lock );
582585}
583586
584587template <ucs_device_level_t level>
@@ -604,7 +607,7 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_check_completion(
604607 uct_rc_gda_completion_t *comp = &tl_comp->rc_gda ;
605608 uint64_t sq_wqe_pi = ep->sq_wqe_pi ;
606609
607- if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) <= comp->wqe_idx ) {
610+ if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) < comp->wqe_idx ) {
608611 return UCS_INPROGRESS;
609612 }
610613
0 commit comments