Skip to content

Commit 1e9db41

Browse files
committed
UCT/DEVICE: optimize doorbell
1 parent 0f0e3ec commit 1e9db41

File tree

1 file changed

+13
-11
lines changed

1 file changed

+13
-11
lines changed

src/uct/ib/mlx5/gdaki/gdaki.cuh

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -263,24 +263,26 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
263263
{
264264
cuda::atomic_ref<uint64_t, cuda::thread_scope_device> ref(
265265
ep->sq_ready_index);
266-
uint64_t wqe_base_orig = wqe_base;
266+
const uint64_t wqe_num = __ldg(&ep->sq_wqe_num);
267+
const uint64_t wqe_next = wqe_base + count;
268+
const uint64_t wqe_base_orig = wqe_base;
267269

268270
__threadfence();
269-
while (!ref.compare_exchange_strong(wqe_base, wqe_base + count,
271+
while (!ref.compare_exchange_strong(wqe_base, wqe_next,
270272
cuda::std::memory_order_relaxed)) {
271273
wqe_base = wqe_base_orig;
272274
}
273275

274-
if (!(flags & UCT_DEVICE_FLAG_NODELAY) &&
275-
!((wqe_base ^ (wqe_base + count)) & 128)) {
276-
return;
276+
if (READ_ONCE(ep->sq_ready_index) == wqe_next) {
277+
uct_rc_mlx5_gda_lock(&ep->sq_lock);
278+
const uint64_t ready_index = READ_ONCE(ep->sq_ready_index);
279+
if (ready_index == wqe_next) {
280+
uct_rc_mlx5_gda_ring_db(ep, ready_index);
281+
uct_rc_mlx5_gda_update_dbr(ep, ready_index);
282+
uct_rc_mlx5_gda_ring_db(ep, ready_index);
283+
}
284+
uct_rc_mlx5_gda_unlock(&ep->sq_lock);
277285
}
278-
279-
uct_rc_mlx5_gda_lock(&ep->sq_lock);
280-
uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index);
281-
uct_rc_mlx5_gda_update_dbr(ep, ep->sq_ready_index);
282-
uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index);
283-
uct_rc_mlx5_gda_unlock(&ep->sq_lock);
284286
}
285287

286288
UCS_F_DEVICE bool

0 commit comments

Comments
 (0)