Skip to content

Commit b03d005

Browse files
authored
UCP/GDA/TEST: Extend UCP device tests and fix issues found (#10903)
1 parent 77bfa25 commit b03d005

File tree

13 files changed

+573
-421
lines changed

13 files changed

+573
-421
lines changed

src/ucp/api/device/ucp_device_impl.h

Lines changed: 32 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,11 @@ UCS_F_DEVICE void ucp_device_request_init(uct_device_ep_t *device_ep,
6363
/**
6464
* Macro for device send operations (put and atomic add) with retry logic
6565
*/
66-
#define UCP_DEVICE_PUT_BLOCKING(_level, _uct_device_ep_put, _device_ep, ...) \
66+
#define UCP_DEVICE_SEND_BLOCKING(_level, _uct_device_ep_send, _device_ep, ...) \
6767
({ \
6868
ucs_status_t _status; \
6969
do { \
70-
_status = _uct_device_ep_put<_level>(_device_ep, __VA_ARGS__); \
70+
_status = _uct_device_ep_send<_level>(_device_ep, __VA_ARGS__); \
7171
if (_status != UCS_ERR_NO_RESOURCE) { \
7272
break; \
7373
} \
@@ -77,8 +77,8 @@ UCS_F_DEVICE void ucp_device_request_init(uct_device_ep_t *device_ep,
7777
})
7878

7979

80-
UCS_F_DEVICE ucs_status_t ucp_device_prepare_single(
81-
ucp_device_mem_list_handle_h mem_list_h, unsigned mem_list_index,
80+
UCS_F_DEVICE ucs_status_t ucp_device_prepare_send(
81+
ucp_device_mem_list_handle_h mem_list_h, unsigned first_mem_elem_index,
8282
ucp_device_request_t *req, uct_device_ep_t *&device_ep,
8383
const uct_device_mem_element_t *&uct_elem,
8484
uct_device_completion_t *&comp)
@@ -87,34 +87,14 @@ UCS_F_DEVICE ucs_status_t ucp_device_prepare_single(
8787
size_t elem_offset;
8888

8989
if ((mem_list_h->version != UCP_DEVICE_MEM_LIST_VERSION_V1) ||
90-
(mem_list_index >= mem_list_h->mem_list_length)) {
90+
(first_mem_elem_index >= mem_list_h->mem_list_length)) {
9191
return UCS_ERR_INVALID_PARAM;
9292
}
9393

9494
device_ep = mem_list_h->uct_device_eps[lane];
95-
elem_offset = mem_list_index * mem_list_h->uct_mem_element_size[lane];
96-
uct_elem = (uct_device_mem_element_t*)UCS_PTR_BYTE_OFFSET(mem_list_h + 1,
97-
elem_offset);
98-
ucp_device_request_init(device_ep, req, comp);
99-
100-
return UCS_OK;
101-
}
102-
103-
104-
UCS_F_DEVICE ucs_status_t
105-
ucp_device_prepare_multi(ucp_device_mem_list_handle_h mem_list_h,
106-
ucp_device_request_t *req, uct_device_ep_t *&device_ep,
107-
const uct_device_mem_element_t *&uct_mem_list,
108-
uct_device_completion_t *&comp)
109-
{
110-
const unsigned lane = 0;
111-
112-
if (mem_list_h->version != UCP_DEVICE_MEM_LIST_VERSION_V1) {
113-
return UCS_ERR_INVALID_PARAM;
114-
}
115-
116-
device_ep = mem_list_h->uct_device_eps[lane];
117-
uct_mem_list = (uct_device_mem_element_t*)(mem_list_h + 1);
95+
elem_offset = first_mem_elem_index * mem_list_h->uct_mem_element_size[lane];
96+
uct_elem = (uct_device_mem_element_t*)UCS_PTR_BYTE_OFFSET(mem_list_h + 1,
97+
elem_offset);
11898
ucp_device_request_init(device_ep, req, comp);
11999

120100
return UCS_OK;
@@ -161,15 +141,15 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_single(
161141
uct_device_ep_t *device_ep;
162142
ucs_status_t status;
163143

164-
status = ucp_device_prepare_single(mem_list_h, mem_list_index, req,
165-
device_ep, uct_elem, comp);
144+
status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep,
145+
uct_elem, comp);
166146
if (status != UCS_OK) {
167147
return status;
168148
}
169149

170-
return UCP_DEVICE_PUT_BLOCKING(level, uct_device_ep_put_single, device_ep,
171-
uct_elem, address, remote_address, length,
172-
flags, comp);
150+
return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_single, device_ep,
151+
uct_elem, address, remote_address, length,
152+
flags, comp);
173153
}
174154

175155

@@ -212,14 +192,15 @@ UCS_F_DEVICE ucs_status_t ucp_device_counter_inc(
212192
uct_device_ep_t *device_ep;
213193
ucs_status_t status;
214194

215-
status = ucp_device_prepare_single(mem_list_h, mem_list_index, req,
216-
device_ep, uct_elem, comp);
195+
status = ucp_device_prepare_send(mem_list_h, mem_list_index, req, device_ep,
196+
uct_elem, comp);
217197
if (status != UCS_OK) {
218198
return status;
219199
}
220200

221-
return uct_device_ep_atomic_add<level>(device_ep, uct_elem, inc_value,
222-
remote_address, flags, comp);
201+
return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_atomic_add, device_ep,
202+
uct_elem, inc_value, remote_address, flags,
203+
comp);
223204
}
224205

225206

@@ -275,17 +256,17 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi(
275256
uct_device_ep_t *device_ep;
276257
ucs_status_t status;
277258

278-
status = ucp_device_prepare_multi(mem_list_h, req, device_ep, uct_mem_list,
279-
comp);
259+
status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep,
260+
uct_mem_list, comp);
280261
if (status != UCS_OK) {
281262
return status;
282263
}
283264

284-
return UCP_DEVICE_PUT_BLOCKING(level, uct_device_ep_put_multi, device_ep,
285-
uct_mem_list, mem_list_h->mem_list_length,
286-
addresses, remote_addresses, lengths,
287-
counter_inc_value, counter_remote_address,
288-
flags, comp);
265+
return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_multi, device_ep,
266+
uct_mem_list, mem_list_h->mem_list_length,
267+
addresses, remote_addresses, lengths,
268+
counter_inc_value, counter_remote_address,
269+
flags, comp);
289270
}
290271

291272

@@ -350,17 +331,17 @@ UCS_F_DEVICE ucs_status_t ucp_device_put_multi_partial(
350331
uct_device_ep_t *device_ep;
351332
ucs_status_t status;
352333

353-
status = ucp_device_prepare_multi(mem_list_h, req, device_ep, uct_mem_list,
354-
comp);
334+
status = ucp_device_prepare_send(mem_list_h, 0, req, device_ep,
335+
uct_mem_list, comp);
355336
if (status != UCS_OK) {
356337
return status;
357338
}
358339

359-
return UCP_DEVICE_PUT_BLOCKING(level, uct_device_ep_put_multi_partial,
360-
device_ep, uct_mem_list, mem_list_indices,
361-
mem_list_count, addresses, remote_addresses,
362-
lengths, counter_index, counter_inc_value,
363-
counter_remote_address, flags, comp);
340+
return UCP_DEVICE_SEND_BLOCKING(level, uct_device_ep_put_multi_partial,
341+
device_ep, uct_mem_list, mem_list_indices,
342+
mem_list_count, addresses, remote_addresses,
343+
lengths, counter_index, counter_inc_value,
344+
counter_remote_address, flags, comp);
364345
}
365346

366347

src/ucp/core/ucp_device.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <ucp/api/device/ucp_host.h>
1515
#include <ucp/api/device/ucp_device_types.h>
1616
#include <ucs/type/param.h>
17+
#include <ucp/wireup/wireup_ep.h>
1718

1819
#include "ucp_worker.inl"
1920
#include "ucp_ep.inl"
@@ -237,7 +238,7 @@ static ucs_status_t ucp_device_mem_list_create_handle(
237238
{
238239
size_t handle_size = 0;
239240
size_t uct_elem_size[UCP_DEVICE_MEM_LIST_MAX_EPS];
240-
uint8_t i, j, num_uct_eps;
241+
unsigned i, j, num_uct_eps;
241242
uct_iface_attr_v2_t attr;
242243
ucs_status_t status;
243244
ucp_worker_iface_t *wiface;
@@ -253,6 +254,11 @@ static ucs_status_t ucp_device_mem_list_create_handle(
253254
for (i = 0;
254255
(i < UCP_DEVICE_MEM_LIST_MAX_EPS) && (lanes[i] != UCP_NULL_LANE);
255256
i++) {
257+
if (ucp_wireup_ep_test(ucp_ep_get_lane(ep, lanes[i]))) {
258+
/* TODO support proxy mem_element_pack() on wireup_ep */
259+
return UCS_ERR_NOT_CONNECTED;
260+
}
261+
256262
/* Query per transport UCT memory list element size */
257263
wiface = ucp_worker_iface(ep->worker,
258264
ucp_ep_get_rsc_index(ep, lanes[i]));

src/ucp/core/ucp_worker.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1821,6 +1821,7 @@ ucp_worker_print_used_tls(ucp_worker_h worker, ucp_worker_cfg_index_t cfg_index)
18211821
ucp_lane_map_t tag_lanes_map = 0;
18221822
ucp_lane_map_t rma_lanes_map = 0;
18231823
ucp_lane_map_t amo_lanes_map = 0;
1824+
ucp_lane_map_t device_lanes_map = 0;
18241825
ucp_lane_map_t stream_lanes_map = 0;
18251826
ucp_lane_map_t am_lanes_map = 0;
18261827
ucp_lane_map_t ka_lanes_map = 0;
@@ -1867,6 +1868,10 @@ ucp_worker_print_used_tls(ucp_worker_h worker, ucp_worker_cfg_index_t cfg_index)
18671868
ka_lanes_map |= UCS_BIT(lane);
18681869
}
18691870

1871+
if (key->lanes[lane].lane_types & UCS_BIT(UCP_LANE_TYPE_DEVICE)) {
1872+
device_lanes_map |= UCS_BIT(lane);
1873+
}
1874+
18701875
if ((ucp_ep_config_get_multi_lane_prio(key->rma_lanes, lane) >= 0)) {
18711876
rma_lanes_map |= UCS_BIT(lane);
18721877
}
@@ -1897,6 +1902,7 @@ ucp_worker_print_used_tls(ucp_worker_h worker, ucp_worker_cfg_index_t cfg_index)
18971902
!rma_emul ? "rma" : "rma_am", &strb);
18981903
ucp_worker_add_feature_rsc(context, key, amo_lanes_map,
18991904
!amo_emul ? "amo" : "amo_am", &strb);
1905+
ucp_worker_add_feature_rsc(context, key, device_lanes_map, "device", &strb);
19001906
ucp_worker_add_feature_rsc(context, key, am_lanes_map, "am", &strb);
19011907
ucp_worker_add_feature_rsc(context, key, stream_lanes_map, "stream", &strb);
19021908
ucp_worker_add_feature_rsc(context, key, ka_lanes_map, "ka", &strb);

src/ucs/sys/device_code.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020
#endif /* __NVCC__ */
2121

2222

23+
/* Number of threads in a warp */
24+
#define UCS_DEVICE_NUM_THREADS_IN_WARP 32
25+
26+
2327
/**
2428
* @brief Cooperation level when calling device functions.
2529
*/

src/uct/ib/mlx5/gdaki/configure.m4

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ AC_ARG_WITH([doca-gpunetio],
99
[with_doca_gpunetio=$withval],
1010
[with_doca_gpunetio=guess])
1111

12+
UCX_CHECK_CUDA
1213

1314
AS_IF([test "x$cuda_happy" = "xyes"],
1415
[
@@ -45,7 +46,7 @@ AS_IF([test "x$gpunetio_happy" = "xyes"],
4546
[
4647
# gpunetio was requested but not found
4748
AS_IF([test "x$with_doca_gpunetio" != "xno" -a "x$with_doca_gpunetio" != "xguess"],
48-
[AC_MSG_ERROR([doca_gpunetio not found])])
49+
[AC_MSG_ERROR([doca_gpunetio not found (cuda found: $cuda_happy)])])
4950
])
5051

5152
AM_CONDITIONAL([HAVE_GPUNETIO], [test x$gpunetio_happy = xyes])

src/uct/ib/mlx5/gdaki/gdaki.c

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,9 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
103103
cq_attr.umem_len,
104104
ucs_get_page_size());
105105

106+
/* Disable inline scatter to TX CQE */
107+
qp_attr.super.max_inl_cqe[UCT_IB_DIR_TX] = 0;
108+
106109
dev_ep_size = qp_attr.umem_offset + qp_attr.len;
107110
/*
108111
* dev_ep layout:
@@ -116,6 +119,12 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
116119
goto err_ctx;
117120
}
118121

122+
status = UCT_CUDADRV_FUNC_LOG_ERR(
123+
cuMemsetD8((CUdeviceptr)self->ep_gpu, 0, dev_ep_size));
124+
if (status != UCS_OK) {
125+
goto err_mem;
126+
}
127+
119128
/* TODO add dmabuf_fd support */
120129
self->umem = mlx5dv_devx_umem_reg(md->super.dev.ibv_context, self->ep_gpu,
121130
dev_ep_size, IBV_ACCESS_LOCAL_WRITE);
@@ -175,12 +184,6 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
175184
dev_ep.cqe_num = cq_attr.cq_size;
176185
dev_ep.sq_db = self->sq_db;
177186

178-
status = UCT_CUDADRV_FUNC_LOG_ERR(
179-
cuMemsetD8((CUdeviceptr)self->ep_gpu, 0, dev_ep_size));
180-
if (status != UCS_OK) {
181-
goto err_dev_ep;
182-
}
183-
184187
status = UCT_CUDADRV_FUNC_LOG_ERR(
185188
cuMemsetD8((CUdeviceptr)UCS_PTR_BYTE_OFFSET(self->ep_gpu,
186189
cq_attr.umem_offset),

0 commit comments

Comments
 (0)