@@ -240,6 +240,23 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_wqe_prepare_put_or_atomic(
240240 doca_gpu_dev_verbs_store_wqe_seg (dseg_ptr, (uint64_t *)&(dseg));
241241}
242242
/**
 * Acquire a device-scope spin lock.
 *
 * Busy-waits until the lock word is atomically changed from 0 (free) to
 * 1 (held). The successful compare-exchange has acquire semantics at device
 * scope, so it pairs with the release store in uct_rc_mlx5_gda_unlock():
 * memory accesses in the critical section cannot be observed before the lock
 * is taken, and writes made by the previous holder are visible.
 *
 * @param lock  Pointer to the lock word; 0 means free, 1 means held.
 */
UCS_F_DEVICE void uct_rc_mlx5_gda_lock(int *lock)
{
    cuda::atomic_ref<int, cuda::thread_scope_device> lock_aref(*lock);
    int expected = 0;

    /* The acquire ordering is attached to the atomic on the lock word itself
     * rather than to a separate fence or a load from an unrelated address, so
     * the synchronization with the release store in unlock is well defined
     * and the compiler cannot hoist critical-section accesses above it. */
    while (!lock_aref.compare_exchange_strong(expected, 1,
                                              cuda::std::memory_order_acquire,
                                              cuda::std::memory_order_relaxed)) {
        /* On failure 'expected' is overwritten with the observed value;
         * reset it so the next attempt again tries the 0 -> 1 transition. */
        expected = 0;
    }
}
254+
/**
 * Release a lock previously taken with uct_rc_mlx5_gda_lock().
 *
 * Writes 0 to the lock word with release ordering at device scope.
 *
 * @param lock  Pointer to the lock word to reset to 0.
 */
UCS_F_DEVICE void uct_rc_mlx5_gda_unlock(int *lock)
{
    using device_atomic_int_ref =
            cuda::atomic_ref<int, cuda::thread_scope_device>;

    /* Release store: marks the lock word as free (0) */
    device_atomic_int_ref(*lock).store(0, cuda::std::memory_order_release);
}
259+
243260UCS_F_DEVICE void uct_rc_mlx5_gda_db (uct_rc_gdaki_dev_ep_t *ep,
244261 uint64_t wqe_base, unsigned count,
245262 uint64_t flags)
@@ -259,13 +276,11 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
259276 return ;
260277 }
261278
262- doca_gpu_dev_verbs_lock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
263- &ep->sq_lock );
279+ uct_rc_mlx5_gda_lock (&ep->sq_lock );
264280 uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
265281 uct_rc_mlx5_gda_update_dbr (ep, ep->sq_ready_index );
266282 uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
267- doca_gpu_dev_verbs_unlock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
268- &ep->sq_lock );
283+ uct_rc_mlx5_gda_unlock (&ep->sq_lock );
269284}
270285
271286UCS_F_DEVICE bool
0 commit comments