@@ -263,24 +263,25 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
263263{
264264 cuda::atomic_ref<uint64_t , cuda::thread_scope_device> ref (
265265 ep->sq_ready_index );
266- uint64_t wqe_base_orig = wqe_base;
266+ const uint64_t wqe_next = wqe_base + count;
267+ const bool skip_db = !(flags & UCT_DEVICE_FLAG_NODELAY) &&
268+ !((wqe_base ^ wqe_next) & 128 );
267269
268270 __threadfence ();
269- while (!ref.compare_exchange_strong (wqe_base, wqe_base + count,
270- cuda::std::memory_order_relaxed)) {
271- wqe_base = wqe_base_orig;
272- }
273-
274- if (!(flags & UCT_DEVICE_FLAG_NODELAY) &&
275- !((wqe_base ^ (wqe_base + count)) & 128 )) {
276- return ;
271+ if (skip_db) {
272+ const uint64_t wqe_base_orig = wqe_base;
273+ while (!ref.compare_exchange_strong (wqe_base, wqe_next,
274+ cuda::std::memory_order_relaxed)) {
275+ wqe_base = wqe_base_orig;
276+ }
277+ } else {
278+ while (READ_ONCE (ep->sq_ready_index ) != wqe_base) {
279+ }
280+ uct_rc_mlx5_gda_ring_db (ep, wqe_next);
281+ uct_rc_mlx5_gda_update_dbr (ep, wqe_next);
282+ uct_rc_mlx5_gda_ring_db (ep, wqe_next);
283+ ref.store (wqe_next, cuda::std::memory_order_release);
277284 }
278-
279- uct_rc_mlx5_gda_lock (&ep->sq_lock );
280- uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
281- uct_rc_mlx5_gda_update_dbr (ep, ep->sq_ready_index );
282- uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
283- uct_rc_mlx5_gda_unlock (&ep->sq_lock );
284285}
285286
286287UCS_F_DEVICE bool
0 commit comments