@@ -93,13 +93,49 @@ template<ucs_device_level_t level> UCS_F_DEVICE void uct_rc_mlx5_gda_sync(void)
    }
}

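+/* Byte-swap a 16-bit value with the PTX prmt (byte-permute) instruction:
+ * selector 0x1 places source byte 1 in result byte 0 and source byte 0 in
+ * result byte 1; the upper result bytes are discarded by the return type. */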
+UCS_F_DEVICE uint16_t uct_rc_mlx5_gda_bswap16(uint16_t x)
+{
+    uint32_t ret;
+    asm volatile("{\n\t"
+                 ".reg .b32 mask;\n\t"
+                 ".reg .b32 ign;\n\t"
+                 "mov.b32 mask, 0x1;\n\t"
+                 "prmt.b32 %0, %1, ign, mask;\n\t"
+                 "}"
+                 : "=r"(ret)
+                 : "r"((uint32_t)x));
+    return ret;
+}
+
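+/* Fetch the CQE wqe_counter and opcode with one 32-bit load: the load spans
+ * the big-endian 16-bit wqe_counter and the op_own byte, whose high nibble
+ * holds the opcode (bits 31:28 of the loaded word). */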
+UCS_F_DEVICE void uct_rc_mlx5_gda_read_cqe(uct_rc_gdaki_dev_ep_t *ep,
+                                           uint16_t *wqe_cnt, uint8_t *opcode)
+{
+    auto *cqe64 = reinterpret_cast<mlx5_cqe64*>(ep->cqe_daddr);
+    uint32_t *data_ptr = (uint32_t*)&cqe64->wqe_counter;
+    uint32_t data = READ_ONCE(*data_ptr);
+
+    *wqe_cnt = uct_rc_mlx5_gda_bswap16(data);
+    if (opcode != NULL) {
+        *opcode = data >> 28;
+    }
+}
+
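+/* Widen the 16-bit CQE counter to a full 64-bit producer index: the counter
+ * trails sq_rsvd_index by fewer than 2^16 entries, so subtracting the 16-bit
+ * distance from the reserved index recovers the full value. */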
+UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_calc_pi(uct_rc_gdaki_dev_ep_t *ep,
+                                              uint16_t wqe_cnt)
+{
+    uint64_t rsvd_idx = READ_ONCE(ep->sq_rsvd_index);
+    return rsvd_idx - ((rsvd_idx - wqe_cnt) & 0xffff);
+}
+
+
UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_max_alloc_wqe_base(
        uct_rc_gdaki_dev_ep_t *ep, unsigned count)
{
-    /* TODO optimize by including sq_wqe_num in qp->sq_wqe_pi and updating it
-       when processing a new completion */
-    uint64_t pi = doca_gpu_dev_verbs_atomic_read<uint64_t,
-            DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(&ep->sq_wqe_pi);
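+    /* Recover the producer index from the CQE's 16-bit counter instead of
+       tracking it in a shared sq_wqe_pi variable */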
+    uint16_t wqe_cnt;
+    uint64_t pi;
+
+    uct_rc_mlx5_gda_read_cqe(ep, &wqe_cnt, NULL);
+    pi = uct_rc_mlx5_gda_calc_pi(ep, wqe_cnt);
    return pi + ep->sq_wqe_num + 1 - count;
}

@@ -501,20 +537,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
    return UCS_INPROGRESS;
}

-UCS_F_DEVICE uint16_t uct_rc_mlx5_gda_bswap16(uint16_t x)
-{
-    uint32_t ret;
-    asm volatile("{\n\t"
-                 ".reg .b32 mask;\n\t"
-                 ".reg .b32 ign;\n\t"
-                 "mov.b32 mask, 0x1;\n\t"
-                 "prmt.b32 %0, %1, ign, mask;\n\t"
-                 "}"
-                 : "=r"(ret)
-                 : "r"((uint32_t)x));
-    return ret;
-}
-
UCS_F_DEVICE void
uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len)
{
@@ -532,70 +554,9 @@ uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len)
    }
}

-UCS_F_DEVICE int uct_rc_mlx5_gda_trylock(int *lock) {
-    if (atomicCAS(lock, 0, 1) == 0) {
-        doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>();
-        return 1;
-    }
-
-    return 0;
-}
-
-UCS_F_DEVICE void uct_rc_mlx5_gda_unlock(int *lock) {
-    cuda::atomic_ref<int, cuda::thread_scope_device> lock_aref(*lock);
-    lock_aref.store(0, cuda::std::memory_order_release);
-}
-
-UCS_F_DEVICE void uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep)
-{
-    if (!uct_rc_mlx5_gda_trylock(&ep->cq_lock)) {
-        return;
-    }
-
-    void *cqe = ep->cqe_daddr;
-    auto *cqe64 = reinterpret_cast<mlx5_cqe64*>(cqe);
-
-    uint8_t opcode = cqe64->op_own >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
-    uint16_t wqe_cnt = uct_rc_mlx5_gda_bswap16(cqe64->wqe_counter);
-    uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1);
-
-    uint64_t sq_wqe_pi = ep->sq_wqe_pi;
-    sq_wqe_pi = ((wqe_cnt - sq_wqe_pi) & 0xffff) + sq_wqe_pi;
-
-    if (opcode != MLX5_CQE_REQ_ERR) {
-        ep->sq_wqe_pi = sq_wqe_pi;
-        uct_rc_mlx5_gda_unlock(&ep->cq_lock);
-        return;
-    }
-
-    auto err_cqe = reinterpret_cast<mlx5_err_cqe_ex*>(cqe64);
-    auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx);
-    ucs_device_error("CQE with syndrome:%x vendor:%x hw:%x "
-                     "wqe_idx:0x%x qp:0x%x",
-                     err_cqe->syndrome, err_cqe->vendor_err_synd,
-                     err_cqe->hw_err_synd, wqe_idx,
-                     doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) &
-                     0xffffff);
-    uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64);
-    uct_rc_mlx5_gda_qedump("CQE", cqe64, 64);
-    ep->sq_wqe_pi = sq_wqe_pi | UCT_RC_GDA_WQE_ERR;
-
-    uct_rc_mlx5_gda_unlock(&ep->cq_lock);
-}
-
template<ucs_device_level_t level>
UCS_F_DEVICE void uct_rc_mlx5_gda_ep_progress(uct_device_ep_h tl_ep)
{
-    uct_rc_gdaki_dev_ep_t *ep = (uct_rc_gdaki_dev_ep_t*)tl_ep;
-    unsigned num_lanes;
-    unsigned lane_id;
-
-    uct_rc_mlx5_gda_exec_init<level>(lane_id, num_lanes);
-    if (lane_id == 0) {
-        uct_rc_mlx5_gda_progress_thread(ep);
-    }
-
-    uct_rc_mlx5_gda_sync<level>();
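+    /* Intentionally left empty: completions are consumed directly by
+       uct_rc_mlx5_gda_ep_check_completion */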
}

template<ucs_device_level_t level>
@@ -604,13 +565,22 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_check_completion(
{
    uct_rc_gdaki_dev_ep_t *ep = reinterpret_cast<uct_rc_gdaki_dev_ep_t*>(tl_ep);
    uct_rc_gda_completion_t *comp = &tl_comp->rc_gda;
-    uint64_t sq_wqe_pi = ep->sq_wqe_pi;
+    uint16_t wqe_cnt;
+    uint8_t opcode;
+    uint64_t pi;
+
+    uct_rc_mlx5_gda_read_cqe(ep, &wqe_cnt, &opcode);
+    pi = uct_rc_mlx5_gda_calc_pi(ep, wqe_cnt);

-    if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) < comp->wqe_idx) {
+    if (pi < comp->wqe_idx) {
        return UCS_INPROGRESS;
    }

-    if (sq_wqe_pi & UCT_RC_GDA_WQE_ERR) {
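+    /* On a requester error, dump the failing WQE and the CQE for debugging */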
+    if (opcode == MLX5_CQE_REQ_ERR) {
+        uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1);
+        auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx);
+        uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64);
+        uct_rc_mlx5_gda_qedump("CQE", ep->cqe_daddr, 64);
        return UCS_ERR_IO_ERROR;
    }
