Skip to content

Commit d675046

Browse files
committed
UCP/PERF: Merge branch 'master' into ucp-perf-warp-fixes
2 parents 8d28eb6 + 122348a commit d675046

File tree

9 files changed

+88
-24
lines changed

9 files changed

+88
-24
lines changed

buildlib/az-helpers.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,14 +199,14 @@ try_load_cuda_env() {
199199
have_cuda="${cuda_local_dir}"
200200
else
201201
# Fallback to env module
202-
az_module_load dev/cuda13.0.0 || return 0
202+
az_module_load dev/cuda13.0.2 || return 0
203203
have_cuda=yes
204204
fi
205205

206206
# Check gdrcopy
207207
if [ -w "/dev/gdrdrv" ]
208208
then
209-
az_module_load dev/gdrcopy2.5.1_cuda13.0.0 && have_gdrcopy=yes
209+
az_module_load dev/gdrcopy2.5.1_cuda13.0.2 && have_gdrcopy=yes
210210
fi
211211
}
212212

buildlib/tools/common.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ WORKSPACE=${WORKSPACE:=$PWD}
44
# build in local directory which goes away when docker exits
55
ucx_build_dir=$HOME/${BUILD_ID}/build
66
ucx_inst=$ucx_build_dir/install
7-
CUDA_MODULE="dev/cuda13.0.0"
8-
GDRCOPY_MODULE="dev/gdrcopy2.5.1_cuda13.0.0"
7+
CUDA_MODULE="dev/cuda13.0.2"
8+
GDRCOPY_MODULE="dev/gdrcopy2.5.1_cuda13.0.2"
99
JDK_MODULE="dev/jdk"
1010
MVN_MODULE="dev/mvn"
1111
XPMEM_MODULE="dev/xpmem-90a95a4"

buildlib/tools/perf-common.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ steps:
1313
1414
case "${{ parameters.Name }}" in
1515
"Build-UCX")
16-
module="/hpc/local/etc/modulefiles/dev/cuda13.0.0"
16+
module="/hpc/local/etc/modulefiles/dev/cuda13.0.2"
1717
perfxParams=(--skip-run --source-branch $(Build.SourceBranch) --omb-cuda)
1818
;;
1919
"Perf-test-multi-node")

src/ucp/api/device/ucp_device_types.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,22 +63,22 @@ typedef struct ucp_device_mem_list_handle {
6363
/**
6464
* Array of local addresses for the device transfer operations.
6565
*/
66-
void **local_addrs;
66+
void **local_addrs;
6767

6868
/**
6969
* Array of remote addresses for the device transfer operations.
7070
*/
71-
uint64_t *remote_addrs;
72-
71+
uint64_t *remote_addrs;
72+
7373
/**
7474
* Array of lengths of the local buffers in bytes.
7575
*/
76-
size_t *lengths;
76+
size_t *lengths;
7777

7878
/**
7979
* Array of UCT memory element objects.
8080
*/
81-
void *uct_mem_elements;
81+
void *uct_mem_elements;
8282

8383
/**
8484
* local address, remote address, and length arrays, are allocated contiguously.

src/ucp/api/device/ucp_host.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ typedef struct ucp_device_mem_list_params {
152152
*
153153
* @param [in] ep Remote endpoint handle.
154154
* @param [in] params Parameters used to create the handle.
155-
* @param [out] handle Created descriptor list handle.
155+
* @param [out] handle Created descriptors list handle.
156156
*
157157
* @return Error code as defined by @ref ucs_status_t.
158158
* @retval UCS_ERR_NOT_CONNECTED if the endpoint is not connected yet.
@@ -164,6 +164,18 @@ ucp_device_mem_list_create(ucp_ep_h ep,
164164
ucp_device_mem_list_handle_h *handle);
165165

166166

167+
/**
168+
* @ingroup UCP_DEVICE
169+
* @brief Return the number of elements in the descriptors mem list handle.
170+
*
171+
* @param [in] handle Descriptors list handle created by @ref ucp_device_mem_list_create and not yet released.
172+
*
173+
* @return Number of elements the descriptors mem list was created with.
174+
*/
175+
uint32_t
176+
ucp_device_get_mem_list_length(const ucp_device_mem_list_handle_h handle);
177+
178+
167179
/**
168180
* @ingroup UCP_DEVICE
169181
* @brief Release function for a descriptor list handle.

src/ucp/core/ucp_device.c

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,17 @@
2121
#include "ucp_mm.inl"
2222

2323

24+
typedef struct { /* Value stored per handle in the ucp_device_handle_allocs hash */
25+
uct_allocated_memory_t mem; /* Copy of the allocation record passed at insert; returned by hash removal */
26+
uint32_t mem_list_length; /* Element count passed at insert; read by ucp_device_get_mem_list_length() */
27+
} ucp_device_handle_info_t;
28+
2429
KHASH_TYPE(ucp_device_handle_allocs, ucp_device_mem_list_handle_h,
25-
uct_allocated_memory_t);
30+
ucp_device_handle_info_t);
2631
#define ucp_device_handle_hash_key(_handle) \
2732
kh_int64_hash_func((uintptr_t)(_handle))
2833
KHASH_IMPL(ucp_device_handle_allocs, ucp_device_mem_list_handle_h,
29-
uct_allocated_memory_t, 1, ucp_device_handle_hash_key,
34+
ucp_device_handle_info_t, 1, ucp_device_handle_hash_key,
3035
kh_int64_hash_equal);
3136

3237
/* Hash to track handle allocator, used at release time */
@@ -50,11 +55,16 @@ void ucp_device_cleanup(void)
5055
}
5156

5257
static ucs_status_t
53-
ucp_device_mem_handle_hash_insert(uct_allocated_memory_t *mem_handle)
58+
ucp_device_mem_handle_hash_insert(const uct_allocated_memory_t *mem_handle,
59+
uint32_t mem_list_length)
5460
{
5561
ucs_status_t status;
5662
khiter_t iter;
5763
int ret;
64+
ucp_device_handle_info_t info;
65+
66+
info.mem = *mem_handle;
67+
info.mem_list_length = mem_list_length;
5868

5969
ucs_spin_lock(&ucp_device_handle_hash_lock);
6070
iter = kh_put(ucp_device_handle_allocs, &ucp_device_handle_hash,
@@ -66,7 +76,7 @@ ucp_device_mem_handle_hash_insert(uct_allocated_memory_t *mem_handle)
6676
ucs_error("handle=%p already found in hash", mem_handle->address);
6777
status = UCS_ERR_ALREADY_EXISTS;
6878
} else {
69-
kh_value(&ucp_device_handle_hash, iter) = *mem_handle;
79+
kh_value(&ucp_device_handle_hash, iter) = info;
7080
status = UCS_OK;
7181
}
7282

@@ -84,7 +94,7 @@ ucp_device_mem_handle_hash_remove(ucp_device_mem_list_handle_h handle)
8494
iter = kh_get(ucp_device_handle_allocs, &ucp_device_handle_hash, handle);
8595
ucs_assertv_always((iter != kh_end(&ucp_device_handle_hash)), "handle=%p",
8696
handle);
87-
mem = kh_value(&ucp_device_handle_hash, iter);
97+
mem = kh_value(&ucp_device_handle_hash, iter).mem;
8898
kh_del(ucp_device_handle_allocs, &ucp_device_handle_hash, iter);
8999
ucs_spin_unlock(&ucp_device_handle_hash_lock);
90100
return mem;
@@ -595,7 +605,7 @@ ucp_device_mem_list_create(ucp_ep_h ep,
595605
}
596606

597607
/* Track memory allocator for later release */
598-
status = ucp_device_mem_handle_hash_insert(&mem);
608+
status = ucp_device_mem_handle_hash_insert(&mem, params->num_elements);
599609
if (status != UCS_OK) {
600610
uct_mem_free(&mem);
601611
} else {
@@ -605,6 +615,24 @@ ucp_device_mem_list_create(ucp_ep_h ep,
605615
return status;
606616
}
607617

618+
uint32_t
619+
ucp_device_get_mem_list_length(const ucp_device_mem_list_handle_h handle)
620+
{ /* Return the mem_list_length recorded for handle in ucp_device_handle_hash */
621+
khiter_t iter;
622+
uint32_t length;
623+
624+
ucs_assert(handle != NULL);
625+
626+
ucs_spin_lock(&ucp_device_handle_hash_lock); /* lookup and read both happen under the hash lock */
627+
iter = kh_get(ucp_device_handle_allocs, &ucp_device_handle_hash, handle);
628+
ucs_assertv_always((iter != kh_end(&ucp_device_handle_hash)), "handle=%p", /* asserts that the handle was found in the hash */
629+
handle);
630+
length = kh_value(&ucp_device_handle_hash, iter).mem_list_length; /* copy the value out before unlocking */
631+
ucs_spin_unlock(&ucp_device_handle_hash_lock);
632+
633+
return length;
634+
}
635+
608636
void ucp_device_mem_list_release(ucp_device_mem_list_handle_h handle)
609637
{
610638
uct_allocated_memory_t mem = ucp_device_mem_handle_hash_remove(handle);

src/uct/ib/mlx5/gdaki/gdaki.cuh

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,23 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_wqe_prepare_put_or_atomic(
165165
doca_gpu_dev_verbs_store_wqe_seg(dseg_ptr, (uint64_t*)&(dseg));
166166
}
167167

168+
UCS_F_DEVICE void uct_rc_mlx5_gda_lock(int *lock) { /* Spin-acquire *lock: 0 = free, 1 = held */
169+
while (atomicCAS(lock, 0, 1) != 0) /* retry until the CAS observes 0 and swaps in 1 */
170+
;
171+
#ifdef DOCA_GPUNETIO_VERBS_HAS_FENCE_ACQUIRE_RELEASE_PTX
172+
asm volatile("fence.acquire.gpu;"); /* issue the PTX fence.acquire.gpu instruction once the lock is taken */
173+
#else
174+
uint32_t dummy; /* never initialized; only its address is used below */
175+
uint32_t UCS_V_UNUSED val; /* receives the loaded value, which is not used */
176+
asm volatile("ld.acquire.gpu.b32 %0, [%1];" : "=r"(val) : "l"(&dummy)); /* NOTE(review): presumably a stand-in for the fence above when that PTX is unavailable -- confirm */
177+
#endif
178+
}
179+
180+
UCS_F_DEVICE void uct_rc_mlx5_gda_unlock(int *lock) { /* Release *lock by storing 0 into it */
181+
cuda::atomic_ref<int, cuda::thread_scope_device> lock_aref(*lock); /* device-scope atomic reference to the lock word */
182+
lock_aref.store(0, cuda::std::memory_order_release); /* store 0 with release memory order */
183+
}
184+
168185
UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
169186
uint64_t wqe_base, unsigned count,
170187
uint64_t flags)
@@ -184,13 +201,11 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
184201
return;
185202
}
186203

187-
doca_gpu_dev_verbs_lock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
188-
&ep->sq_lock);
204+
uct_rc_mlx5_gda_lock(&ep->sq_lock);
189205
uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index);
190206
uct_rc_mlx5_gda_update_dbr(ep, ep->sq_ready_index);
191207
uct_rc_mlx5_gda_ring_db(ep, ep->sq_ready_index);
192-
doca_gpu_dev_verbs_unlock<DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(
193-
&ep->sq_lock);
208+
uct_rc_mlx5_gda_unlock(&ep->sq_lock);
194209
}
195210

196211
UCS_F_DEVICE bool

src/uct/ib/mlx5/gga/gga_mlx5.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -852,9 +852,11 @@ uct_gga_mlx5_query_tl_devices(uct_md_h md,
852852
return UCS_ERR_NO_DEVICE;
853853
}
854854

855-
ucs_assertv(mlx5_md->super.cap_flags & UCT_MD_FLAG_EXPORTED_MKEY,
856-
"md %p: cap_flags=0x%" PRIx64 " do not have EXPORTED_MKEY flag",
857-
mlx5_md, mlx5_md->super.cap_flags);
855+
if (!(mlx5_md->super.cap_flags & UCT_MD_FLAG_EXPORTED_MKEY)) {
856+
ucs_debug("md %p: cap_flags=0x%" PRIx64 " does not have EXPORTED_MKEY "
857+
"flag", mlx5_md, mlx5_md->super.cap_flags);
858+
return UCS_ERR_NO_DEVICE;
859+
}
858860

859861
ucs_assertv(ucs_test_all_flags(mlx5_md->flags, UCT_GGA_MLX5_MD_CAPS),
860862
"md %p: flags=0x%x do not have mandatory capabilities 0x%x",

test/gtest/ucp/test_ucp_device.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,13 @@ UCS_TEST_P(test_ucp_device, create_fail)
334334
EXPECT_EQ(nullptr, handle);
335335
}
336336

337+
UCS_TEST_P(test_ucp_device, get_mem_list_length)
338+
{ /* Expects ucp_device_get_mem_list_length() to return num_elements for the handle of a newly built mem_list */
339+
constexpr unsigned num_elements = 8;
340+
mem_list list(sender(), receiver(), 1 * UCS_KBYTE, num_elements); /* NOTE(review): mem_list constructor is outside this view; last argument is presumably the element count -- confirm */
341+
EXPECT_EQ(num_elements, ucp_device_get_mem_list_length(list.handle()));
342+
}
343+
337344
UCP_INSTANTIATE_TEST_CASE_TLS_GPU_AWARE(test_ucp_device, rc_gda, "rc,rc_gda")
338345

339346

0 commit comments

Comments
 (0)