Skip to content

Commit f00182e

Browse files
UCT/GDA: Collapsed CQ - 2
1 parent 53139a0 commit f00182e

File tree

6 files changed

+63
-93
lines changed

6 files changed

+63
-93
lines changed

src/uct/ib/mlx5/gdaki/gdaki.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,14 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
184184
dev_ep.cqe_num = cq_attr.cq_size;
185185
dev_ep.sq_db = self->sq_db;
186186

187+
status = UCT_CUDADRV_FUNC_LOG_ERR(
188+
cuMemsetD8((CUdeviceptr)UCS_PTR_BYTE_OFFSET(self->ep_gpu,
189+
cq_attr.umem_offset),
190+
0xff, cq_attr.umem_len));
191+
if (status != UCS_OK) {
192+
goto err_dev_ep;
193+
}
194+
187195
status = UCT_CUDADRV_FUNC_LOG_ERR(
188196
cuMemcpyHtoD((CUdeviceptr)self->ep_gpu, &dev_ep, sizeof(dev_ep)));
189197
if (status != UCS_OK) {

src/uct/ib/mlx5/gdaki/gdaki.cuh

Lines changed: 52 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -93,13 +93,49 @@ template<ucs_device_level_t level> UCS_F_DEVICE void uct_rc_mlx5_gda_sync(void)
9393
}
9494
}
9595

96+
UCS_F_DEVICE uint16_t uct_rc_mlx5_gda_bswap16(uint16_t x)
97+
{
98+
uint32_t ret;
99+
asm volatile("{\n\t"
100+
".reg .b32 mask;\n\t"
101+
".reg .b32 ign;\n\t"
102+
"mov.b32 mask, 0x1;\n\t"
103+
"prmt.b32 %0, %1, ign, mask;\n\t"
104+
"}"
105+
: "=r"(ret)
106+
: "r"((uint32_t)x));
107+
return ret;
108+
}
109+
110+
UCS_F_DEVICE void uct_rc_mlx5_gda_read_cqe(uct_rc_gdaki_dev_ep_t *ep,
111+
uint16_t *wqe_cnt, uint8_t *opcode)
112+
{
113+
auto *cqe64 = reinterpret_cast<mlx5_cqe64*>(ep->cqe_daddr);
114+
uint32_t *data_ptr = (uint32_t*)&cqe64->wqe_counter;
115+
uint32_t data = READ_ONCE(*data_ptr);
116+
117+
*wqe_cnt = uct_rc_mlx5_gda_bswap16(data);
118+
if (opcode != NULL) {
119+
*opcode = data >> 28;
120+
}
121+
}
122+
123+
UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_calc_pi(uct_rc_gdaki_dev_ep_t *ep,
124+
uint16_t wqe_cnt)
125+
{
126+
uint64_t rsvd_idx = READ_ONCE(ep->sq_rsvd_index);
127+
return rsvd_idx - ((rsvd_idx - wqe_cnt) & 0xffff);
128+
}
129+
130+
96131
UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_max_alloc_wqe_base(
97132
uct_rc_gdaki_dev_ep_t *ep, unsigned count)
98133
{
99-
/* TODO optimize by including sq_wqe_num in qp->sq_wqe_pi and updating it
100-
when processing a new completion */
101-
uint64_t pi = doca_gpu_dev_verbs_atomic_read<uint64_t,
102-
DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU>(&ep->sq_wqe_pi);
134+
uint16_t wqe_cnt;
135+
uint64_t pi;
136+
137+
uct_rc_mlx5_gda_read_cqe(ep, &wqe_cnt, NULL);
138+
pi = uct_rc_mlx5_gda_calc_pi(ep, wqe_cnt);
103139
return pi + ep->sq_wqe_num + 1 - count;
104140
}
105141

@@ -501,20 +537,6 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_put_multi_partial(
501537
return UCS_INPROGRESS;
502538
}
503539

504-
UCS_F_DEVICE uint16_t uct_rc_mlx5_gda_bswap16(uint16_t x)
505-
{
506-
uint32_t ret;
507-
asm volatile("{\n\t"
508-
".reg .b32 mask;\n\t"
509-
".reg .b32 ign;\n\t"
510-
"mov.b32 mask, 0x1;\n\t"
511-
"prmt.b32 %0, %1, ign, mask;\n\t"
512-
"}"
513-
: "=r"(ret)
514-
: "r"((uint32_t)x));
515-
return ret;
516-
}
517-
518540
UCS_F_DEVICE void
519541
uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len)
520542
{
@@ -532,70 +554,9 @@ uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len)
532554
}
533555
}
534556

535-
UCS_F_DEVICE int uct_rc_mlx5_gda_trylock(int *lock) {
536-
if (atomicCAS(lock, 0, 1) == 0) {
537-
doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>();
538-
return 1;
539-
}
540-
541-
return 0;
542-
}
543-
544-
UCS_F_DEVICE void uct_rc_mlx5_gda_unlock(int *lock) {
545-
cuda::atomic_ref<int, cuda::thread_scope_device> lock_aref(*lock);
546-
lock_aref.store(0, cuda::std::memory_order_release);
547-
}
548-
549-
UCS_F_DEVICE void uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep)
550-
{
551-
if (!uct_rc_mlx5_gda_trylock(&ep->cq_lock)) {
552-
return;
553-
}
554-
555-
void *cqe = ep->cqe_daddr;
556-
auto *cqe64 = reinterpret_cast<mlx5_cqe64*>(cqe);
557-
558-
uint8_t opcode = cqe64->op_own >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
559-
uint16_t wqe_cnt = uct_rc_mlx5_gda_bswap16(cqe64->wqe_counter);
560-
uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1);
561-
562-
uint64_t sq_wqe_pi = ep->sq_wqe_pi;
563-
sq_wqe_pi = ((wqe_cnt - sq_wqe_pi) & 0xffff) + sq_wqe_pi;
564-
565-
if (opcode != MLX5_CQE_REQ_ERR) {
566-
ep->sq_wqe_pi = sq_wqe_pi;
567-
uct_rc_mlx5_gda_unlock(&ep->cq_lock);
568-
return;
569-
}
570-
571-
auto err_cqe = reinterpret_cast<mlx5_err_cqe_ex*>(cqe64);
572-
auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx);
573-
ucs_device_error("CQE with syndrome:%x vendor:%x hw:%x "
574-
"wqe_idx:0x%x qp:0x%x",
575-
err_cqe->syndrome, err_cqe->vendor_err_synd,
576-
err_cqe->hw_err_synd, wqe_idx,
577-
doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) &
578-
0xffffff);
579-
uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64);
580-
uct_rc_mlx5_gda_qedump("CQE", cqe64, 64);
581-
ep->sq_wqe_pi = sq_wqe_pi | UCT_RC_GDA_WQE_ERR;
582-
583-
uct_rc_mlx5_gda_unlock(&ep->cq_lock);
584-
}
585-
586557
template<ucs_device_level_t level>
587558
UCS_F_DEVICE void uct_rc_mlx5_gda_ep_progress(uct_device_ep_h tl_ep)
588559
{
589-
uct_rc_gdaki_dev_ep_t *ep = (uct_rc_gdaki_dev_ep_t*)tl_ep;
590-
unsigned num_lanes;
591-
unsigned lane_id;
592-
593-
uct_rc_mlx5_gda_exec_init<level>(lane_id, num_lanes);
594-
if (lane_id == 0) {
595-
uct_rc_mlx5_gda_progress_thread(ep);
596-
}
597-
598-
uct_rc_mlx5_gda_sync<level>();
599560
}
600561

601562
template<ucs_device_level_t level>
@@ -604,13 +565,22 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_check_completion(
604565
{
605566
uct_rc_gdaki_dev_ep_t *ep = reinterpret_cast<uct_rc_gdaki_dev_ep_t*>(tl_ep);
606567
uct_rc_gda_completion_t *comp = &tl_comp->rc_gda;
607-
uint64_t sq_wqe_pi = ep->sq_wqe_pi;
568+
uint16_t wqe_cnt;
569+
uint8_t opcode;
570+
uint64_t pi;
571+
572+
uct_rc_mlx5_gda_read_cqe(ep, &wqe_cnt, &opcode);
573+
pi = uct_rc_mlx5_gda_calc_pi(ep, wqe_cnt);
608574

609-
if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) < comp->wqe_idx) {
575+
if (pi < comp->wqe_idx) {
610576
return UCS_INPROGRESS;
611577
}
612578

613-
if (sq_wqe_pi & UCT_RC_GDA_WQE_ERR) {
579+
if (opcode == MLX5_CQE_REQ_ERR) {
580+
uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1);
581+
auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx);
582+
uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64);
583+
uct_rc_mlx5_gda_qedump("CQE", ep->cqe_daddr, 64);
614584
return UCS_ERR_IO_ERROR;
615585
}
616586

src/uct/ib/mlx5/gdaki/gdaki_dev.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,7 @@ typedef struct {
1919

2020
uint64_t sq_rsvd_index;
2121
uint64_t sq_ready_index;
22-
uint64_t sq_wqe_pi;
2322
int sq_lock;
24-
int cq_lock;
2523

2624
uint8_t *sq_wqe_daddr;
2725
uint32_t *sq_dbrec;

test/gtest/ucp/cuda/test_kernels.cu

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ ucp_test_kernel_get_state(const test_ucp_device_kernel_params_t &params,
140140
(device_ep->uct_tl_id == UCT_DEVICE_TL_RC_MLX5_GDA)) {
141141
uct_rc_gdaki_dev_ep_t *ep =
142142
reinterpret_cast<uct_rc_gdaki_dev_ep_t*>(device_ep);
143-
result.producer_index = ep->sq_wqe_pi - result.producer_index;
144143
result.ready_index = ep->sq_ready_index - result.ready_index;
145144
}
146145
}
@@ -239,7 +238,6 @@ launch_test_ucp_device_kernel(const test_ucp_device_kernel_params_t &params)
239238

240239
ucx_cuda::device_result_ptr<test_ucp_device_kernel_result_t> result;
241240
result->status = UCS_ERR_NOT_IMPLEMENTED;
242-
result->producer_index = 0;
243241
result->ready_index = 0;
244242

245243
switch (params.level) {

test/gtest/ucp/cuda/test_kernels.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ typedef struct {
6666

6767
struct test_ucp_device_kernel_result_t {
6868
ucs_status_t status;
69-
uint64_t producer_index;
7069
uint64_t ready_index;
7170
};
7271

test/gtest/ucp/test_ucp_device.cc

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@
1212
#include <common/cuda.h>
1313
#include "cuda/test_kernels.h"
1414

15-
/* TODO: Enable these tests in CI */
16-
#define DISABLE_STRESS true
17-
1815
class test_ucp_device : public ucp_test {
1916
public:
2017
static void get_test_variants(std::vector<ucp_test_variant> &variants);
@@ -321,7 +318,6 @@ class test_ucp_device_kernel : public test_ucp_device {
321318

322319
uint64_t expected = params.num_iters * num_threads * count;
323320
EXPECT_UCS_OK(result.status);
324-
EXPECT_EQ(expected - 1, result.producer_index);
325321
EXPECT_EQ(expected, result.ready_index);
326322
}
327323
};
@@ -484,8 +480,9 @@ UCS_TEST_P(test_ucp_device_xfer, put_single)
484480
list.dst_pattern_check(mem_list_index + 1, mem_list::SEED_DST);
485481
}
486482

483+
/* TODO: Enable these tests in CI */
487484
UCS_TEST_SKIP_COND_P(test_ucp_device_xfer, put_single_stress_test,
488-
RUNNING_ON_VALGRIND || DISABLE_STRESS)
485+
RUNNING_ON_VALGRIND || true)
489486
{
490487
#ifdef __SANITIZE_ADDRESS__
491488
UCS_TEST_SKIP_R("Skipping stress test under ASAN");
@@ -538,7 +535,7 @@ UCS_TEST_P(test_ucp_device_xfer, put_multi)
538535
}
539536

540537
UCS_TEST_SKIP_COND_P(test_ucp_device_xfer, put_multi_stress_test,
541-
RUNNING_ON_VALGRIND || DISABLE_STRESS)
538+
RUNNING_ON_VALGRIND || true)
542539
{
543540
#ifdef __SANITIZE_ADDRESS__
544541
UCS_TEST_SKIP_R("Skipping stress test under ASAN");

0 commit comments

Comments
 (0)