@@ -263,24 +263,26 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
263263{
264264 cuda::atomic_ref<uint64_t , cuda::thread_scope_device> ref (
265265 ep->sq_ready_index );
266- uint64_t wqe_base_orig = wqe_base;
266+ const uint64_t wqe_num = __ldg (&ep->sq_wqe_num );
267+ const uint64_t wqe_next = wqe_base + count;
268+ const uint64_t wqe_base_orig = wqe_base;
267269
268270 __threadfence ();
269- while (!ref.compare_exchange_strong (wqe_base, wqe_base + count ,
271+ while (!ref.compare_exchange_strong (wqe_base, wqe_next ,
270272 cuda::std::memory_order_relaxed)) {
271273 wqe_base = wqe_base_orig;
272274 }
273275
274- if (!(flags & UCT_DEVICE_FLAG_NODELAY) &&
275- !((wqe_base ^ (wqe_base + count)) & 128 )) {
276- return ;
276+ if (READ_ONCE (ep->sq_ready_index ) == wqe_next) {
277+ uct_rc_mlx5_gda_lock (&ep->sq_lock );
278+ const uint64_t ready_index = READ_ONCE (ep->sq_ready_index );
279+ if (ready_index == wqe_next) {
280+ uct_rc_mlx5_gda_ring_db (ep, ready_index);
281+ uct_rc_mlx5_gda_update_dbr (ep, ready_index);
282+ uct_rc_mlx5_gda_ring_db (ep, ready_index);
283+ }
284+ uct_rc_mlx5_gda_unlock (&ep->sq_lock );
277285 }
278-
279- uct_rc_mlx5_gda_lock (&ep->sq_lock );
280- uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
281- uct_rc_mlx5_gda_update_dbr (ep, ep->sq_ready_index );
282- uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
283- uct_rc_mlx5_gda_unlock (&ep->sq_lock );
284286}
285287
286288UCS_F_DEVICE bool
0 commit comments