@@ -165,6 +165,23 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_wqe_prepare_put_or_atomic(
165165 doca_gpu_dev_verbs_store_wqe_seg (dseg_ptr, (uint64_t *)&(dseg));
166166}
167167
/*
 * Acquire the spinlock stored at @a lock.
 *
 * Busy-waits on atomicCAS() until the lock word is swapped from 0 (free) to
 * 1 (held), then issues a GPU-scope acquire operation so that accesses made
 * inside the critical section are ordered after the lock has been taken.
 *
 * @param lock  Pointer to the lock word; 0 means free, 1 means held.
 */
UCS_F_DEVICE void uct_rc_mlx5_gda_lock(int *lock)
{
    while (atomicCAS(lock, 0, 1) != 0) {
        /* Spin until this thread is the one that swaps 0 -> 1 */
    }

    /*
     * The "memory" clobber makes each asm statement a compiler-level barrier
     * as well: "volatile" alone does not prevent the compiler from moving
     * ordinary loads/stores of the protected data across the asm statement.
     */
#ifdef DOCA_GPUNETIO_VERBS_HAS_FENCE_ACQUIRE_RELEASE_PTX
    asm volatile("fence.acquire.gpu;" ::: "memory");
#else
    /* Fallback when the acquire fence is not available: perform an acquire
     * load from a dummy local. The loaded value is never used. */
    uint32_t dummy;
    uint32_t UCS_V_UNUSED val;
    asm volatile("ld.acquire.gpu.b32 %0, [%1];"
                 : "=r"(val)
                 : "l"(&dummy)
                 : "memory");
#endif
}
179+
/*
 * Release the spinlock stored at @a lock.
 *
 * Writes 0 to the lock word through a device-scope atomic reference using
 * release memory ordering.
 *
 * @param lock  Pointer to the lock word to reset to 0.
 */
UCS_F_DEVICE void uct_rc_mlx5_gda_unlock(int *lock)
{
    using lock_ref_t = cuda::atomic_ref<int, cuda::thread_scope_device>;

    lock_ref_t lock_word{*lock};
    lock_word.store(0, cuda::std::memory_order_release);
}
184+
168185UCS_F_DEVICE void uct_rc_mlx5_gda_db (uct_rc_gdaki_dev_ep_t *ep,
169186 uint64_t wqe_base, unsigned count,
170187 uint64_t flags)
@@ -184,13 +201,11 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
184201 return ;
185202 }
186203
187- doca_gpu_dev_verbs_lock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
188- &ep->sq_lock );
204+ uct_rc_mlx5_gda_lock (&ep->sq_lock );
189205 uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
190206 uct_rc_mlx5_gda_update_dbr (ep, ep->sq_ready_index );
191207 uct_rc_mlx5_gda_ring_db (ep, ep->sq_ready_index );
192- doca_gpu_dev_verbs_unlock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
193- &ep->sq_lock );
208+ uct_rc_mlx5_gda_unlock (&ep->sq_lock );
194209}
195210
196211UCS_F_DEVICE bool
0 commit comments