@@ -258,24 +258,25 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
258258{
259259 cuda::atomic_ref<uint64_t , cuda::thread_scope_device> ref (
260260 ep->sq_ready_index );
261- uint64_t wqe_base_orig = wqe_base;
261+ const uint64_t wqe_next = wqe_base + count;
262+ const bool skip_db = !(flags & UCT_DEVICE_FLAG_NODELAY) &&
263+ !((wqe_base ^ wqe_next) & 128 );
262264
263265 __threadfence ();
264- while (!ref.compare_exchange_strong (wqe_base, wqe_base + count,
265- cuda::std::memory_order_relaxed)) {
266- wqe_base = wqe_base_orig;
267- }
268-
269- if (!(flags & UCT_DEVICE_FLAG_NODELAY) &&
270- !((wqe_base ^ (wqe_base + count)) & 128 )) {
271- return ;
266+ if (skip_db) {
267+ const uint64_t wqe_base_orig = wqe_base;
268+ while (!ref.compare_exchange_strong (wqe_base, wqe_next,
269+ cuda::std::memory_order_relaxed)) {
270+ wqe_base = wqe_base_orig;
271+ }
272+ } else {
273+ while (READ_ONCE (ep->sq_ready_index ) != wqe_base) {
274+ }
275+ uct_rc_mlx5_gda_ring_db (ep, wqe_next);
276+ uct_rc_mlx5_gda_update_dbr (ep, wqe_next);
277+ uct_rc_mlx5_gda_ring_db (ep, wqe_next);
278+ ref.store (wqe_next, cuda::std::memory_order_release);
272279 }
273-
274- uct_rc_mlx5_gda_lock (&ep->sq_lock );
275- uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
276- uct_rc_mlx5_gda_update_dbr (ep, ep->sq_ready_index );
277- uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
278- uct_rc_mlx5_gda_unlock (&ep->sq_lock );
279280}
280281
281282UCS_F_DEVICE bool
0 commit comments