
Commit ea65cc6

UCT/GDA: Collapsed CQ (#10959)
* UCT/GDA: Collapsed CQ
* UCT/GDA: Collapsed CQ - 2
* UCT/GDA: Fix locking
* UCT/GDA: Collapsed CQ - 3
1 parent e29e5f5 commit ea65cc6

File tree

7 files changed (+103, -107 lines)


src/uct/ib/mlx5/dv/ib_mlx5_dv.c

Lines changed: 1 addition & 0 deletions
@@ -606,6 +606,7 @@ uct_ib_mlx5_devx_create_cq_common(uct_ib_iface_t *iface, uct_ib_dir_t dir,
 
     UCT_IB_MLX5DV_SET(cqc, cqctx, log_cq_size, log_cq_size);
     UCT_IB_MLX5DV_SET(cqc, cqctx, cqe_sz, (attr->cqe_size == 128) ? 1 : 0);
+    UCT_IB_MLX5DV_SET(cqc, cqctx, cc, (attr->cq_size == 1) ? 1 : 0);
 
     if (attr->flags & UCT_IB_MLX5_CQ_CQE_ZIP) {
         UCT_IB_MLX5DV_SET(cqc, cqctx, cqe_comp_en, 1);
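Note: the `cc` field in the CQ context requests a "collapsed" CQ when the caller asks for a single-entry CQ (`attr->cq_size == 1`); the HCA then overwrites that one CQE on every completion instead of advancing through a ring. The snippet below is a minimal, hypothetical illustration of that polling model (the `toy_*` names are invented for this note and are not UCX code); the real device-side reader added by this commit is `uct_rc_mlx5_gda_parse_cqe()` in gdaki.cuh.

#include <stdint.h>

/* Hypothetical sketch: a collapsed CQ has exactly one slot, so the consumer
 * never advances an index -- it re-reads slot 0, which the hardware
 * overwrites with the most recent completion. */
typedef struct {
    volatile uint32_t wqe_counter_and_op; /* stand-in for the CQE fields read */
} toy_collapsed_cq_t;

static inline uint32_t toy_poll_collapsed_cq(const toy_collapsed_cq_t *cq)
{
    return cq->wqe_counter_and_op; /* always the same (only) entry */
}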

src/uct/ib/mlx5/gdaki/gdaki.c

Lines changed: 4 additions & 10 deletions
@@ -83,8 +83,7 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
         return status;
     }
 
-    init_attr.cq_len[UCT_IB_DIR_TX] = iface->super.super.config.tx_qp_len *
-                                      UCT_IB_MLX5_MAX_BB;
+    init_attr.cq_len[UCT_IB_DIR_TX] = 1;
     uct_ib_mlx5_cq_calc_sizes(&iface->super.super.super, UCT_IB_DIR_TX,
                               &init_attr, 0, &cq_attr);
     uct_rc_iface_fill_attr(&iface->super.super, &qp_attr.super,
@@ -176,16 +175,11 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_ep_t, const uct_ep_params_t *params)
     dev_ep.sq_num       = self->qp.super.qp_num;
     dev_ep.sq_wqe_daddr = UCS_PTR_BYTE_OFFSET(self->ep_gpu,
                                               qp_attr.umem_offset);
-    dev_ep.sq_rsvd_index  = 0;
-    dev_ep.sq_ready_index = 0;
-    dev_ep.sq_wqe_pi      = 0;
-    dev_ep.sq_wqe_num     = qp_attr.max_tx;
+    dev_ep.sq_wqe_num   = qp_attr.max_tx;
+    dev_ep.sq_dbrec     = &self->ep_gpu->qp_dbrec[MLX5_SND_DBR];
     /* FC mask is used to determine if WQE should be posted with completion.
      * qp_attr.max_tx must be a power of 2. */
-    dev_ep.sq_fc_mask   = (qp_attr.max_tx >> 1) - 1;
-    dev_ep.avail_count  = qp_attr.max_tx;
-    dev_ep.sq_dbrec     = &self->ep_gpu->qp_dbrec[MLX5_SND_DBR];
-
+    dev_ep.sq_fc_mask   = (qp_attr.max_tx >> 1) - 1;
     dev_ep.cqe_daddr    = UCS_PTR_BYTE_OFFSET(self->ep_gpu, cq_attr.umem_offset);
     dev_ep.cqe_num      = cq_attr.cq_size;
     dev_ep.sq_db        = self->sq_db;

src/uct/ib/mlx5/gdaki/gdaki.cuh

Lines changed: 94 additions & 89 deletions
@@ -93,29 +93,99 @@ template<ucs_device_level_t level> UCS_F_DEVICE void uct_rc_mlx5_gda_sync(void)
     }
 }
 
-UCS_F_DEVICE uint64_t
-uct_rc_mlx5_gda_reserv_wqe_thread(uct_rc_gdaki_dev_ep_t *ep, unsigned count)
+UCS_F_DEVICE uint16_t uct_rc_mlx5_gda_bswap16(uint16_t x)
 {
-    /* Try to reserve optimistically */
-    int32_t prev = atomicAdd(&ep->avail_count, -(int32_t)count);
-    if (prev < (int32_t)count) {
-        /* Rollback */
-        atomicAdd(&ep->avail_count, count);
+    uint32_t ret;
+    asm volatile("{\n\t"
+                 ".reg .b32 mask;\n\t"
+                 ".reg .b32 ign;\n\t"
+                 "mov.b32 mask, 0x1;\n\t"
+                 "prmt.b32 %0, %1, ign, mask;\n\t"
+                 "}"
+                 : "=r"(ret)
+                 : "r"((uint32_t)x));
+    return ret;
+}
+
+UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_parse_cqe(uct_rc_gdaki_dev_ep_t *ep,
+                                                uint16_t *wqe_cnt,
+                                                uint8_t *opcode)
+{
+    auto *cqe64        = reinterpret_cast<mlx5_cqe64*>(ep->cqe_daddr);
+    uint32_t *data_ptr = (uint32_t*)&cqe64->wqe_counter;
+    uint32_t data      = READ_ONCE(*data_ptr);
+    uint64_t rsvd_idx  = READ_ONCE(ep->sq_rsvd_index);
+
+    *wqe_cnt = uct_rc_mlx5_gda_bswap16(data);
+    if (opcode != nullptr) {
+        *opcode = data >> 28;
+    }
+
+    return rsvd_idx - ((rsvd_idx - *wqe_cnt) & 0xffff);
+}
+
+UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_max_alloc_wqe_base(
+        uct_rc_gdaki_dev_ep_t *ep, unsigned count)
+{
+    uint16_t wqe_cnt;
+    uint64_t pi;
+
+    pi = uct_rc_mlx5_gda_parse_cqe(ep, &wqe_cnt, nullptr);
+    return pi + ep->sq_wqe_num + 1 - count;
+}
+
+UCS_F_DEVICE uint64_t uct_rc_mlx5_gda_reserv_wqe_thread(
+        uct_rc_gdaki_dev_ep_t *ep, unsigned count)
+{
+    /* Do not attempt to reserve if the available space is less than the
+     * requested count, to avoid starvation of threads trying to rollback the
+     * reservation with atomicCAS. */
+    uint64_t max_wqe_base = uct_rc_mlx5_gda_max_alloc_wqe_base(ep, count);
+    if (ep->sq_rsvd_index > max_wqe_base) {
         return UCT_RC_GDA_RESV_WQE_NO_RESOURCE;
     }
 
-    /* We own count elements, now can safely increment the reserved index */
-    return atomicAdd(reinterpret_cast<unsigned long long*>(&ep->sq_rsvd_index),
-                     count);
+    uint64_t wqe_base = atomicAdd(reinterpret_cast<unsigned long long*>(
+                                          &ep->sq_rsvd_index),
+                                  static_cast<unsigned long long>(count));
+
+    /*
+     * Attempt to reserve 'count' WQEs by atomically incrementing the reserved
+     * index. If the reservation exceeds the available space in the work queue,
+     * enter a rollback loop.
+     *
+     * Rollback Logic:
+     * - Calculate the next potential index (wqe_next) after attempting the
+     *   reservation.
+     * - Use atomic CAS to check if the current reserved index matches wqe_next.
+     *   If it does, revert the reservation by resetting the reserved index to
+     *   wqe_base.
+     * - A successful CAS indicates no other thread has modified the reserved
+     *   index, allowing the rollback to complete, and the function returns
+     *   UCT_RC_GDA_RESV_WQE_NO_RESOURCE to signal insufficient resources.
+     * - If CAS fails, it means another thread has modified the reserved index.
+     *   The loop continues to reevaluate resource availability to determine if
+     *   the reservation can now be satisfied, possibly due to other operations
+     *   freeing up resources.
+     */
+    while (wqe_base > max_wqe_base) {
+        uint64_t wqe_next = wqe_base + count;
+        if (atomicCAS(reinterpret_cast<unsigned long long*>(&ep->sq_rsvd_index),
+                      wqe_next, wqe_base) == wqe_next) {
+            return UCT_RC_GDA_RESV_WQE_NO_RESOURCE;
+        }
+
+        max_wqe_base = uct_rc_mlx5_gda_max_alloc_wqe_base(ep, count);
+    }
+
+    return wqe_base;
 }
 
 template<ucs_device_level_t level>
 UCS_F_DEVICE void
 uct_rc_mlx5_gda_reserv_wqe(uct_rc_gdaki_dev_ep_t *ep, unsigned count,
                            unsigned lane_id, uint64_t &wqe_base)
 {
-    wqe_base = 0;
-
     if (lane_id == 0) {
         wqe_base = uct_rc_mlx5_gda_reserv_wqe_thread(ep, count);
     }
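The key arithmetic above is in `uct_rc_mlx5_gda_parse_cqe()`: the hardware reports only a 16-bit `wqe_counter` in the collapsed CQE, and the 64-bit `sq_rsvd_index` supplies the missing upper bits via `rsvd_idx - ((rsvd_idx - wqe_cnt) & 0xffff)`. `uct_rc_mlx5_gda_max_alloc_wqe_base()` then bounds a reservation of `count` WQEs to `pi + sq_wqe_num + 1 - count` (for example, with pi = 100, 256 WQEs and count = 4, bases up to 353 are allowed). Below is a small host-side sketch of the counter-extension step only; the helper name `extend_wqe_counter` and the test values are invented for this note, not part of the patch.

#include <assert.h>
#include <stdint.h>

/* Hypothetical standalone demo of the wrap-around extension used by
 * uct_rc_mlx5_gda_parse_cqe(): recover a 64-bit completed position from the
 * 16-bit hardware counter, using the 64-bit reserved index as upper bound. */
static uint64_t extend_wqe_counter(uint64_t rsvd_idx, uint16_t wqe_cnt)
{
    /* (rsvd_idx - wqe_cnt) & 0xffff is how far the hardware counter lags
     * behind the reserved index, modulo 2^16. */
    return rsvd_idx - ((rsvd_idx - wqe_cnt) & 0xffff);
}

int main(void)
{
    /* No wrap: reserved index 0x12345, hardware reports counter 0x2344. */
    assert(extend_wqe_counter(0x12345, 0x2344) == 0x12344);
    /* Wrap: the reserved index just crossed a 16-bit boundary. */
    assert(extend_wqe_counter(0x20001, 0xfffe) == 0x1fffe);
    return 0;
}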
@@ -211,7 +281,7 @@ UCS_F_DEVICE void uct_rc_mlx5_gda_db(uct_rc_gdaki_dev_ep_t *ep,
 UCS_F_DEVICE bool
 uct_rc_mlx5_gda_fc(const uct_rc_gdaki_dev_ep_t *ep, uint16_t wqe_idx)
 {
-    return (wqe_idx & ep->sq_fc_mask) == 1;
+    return !(wqe_idx & ep->sq_fc_mask);
 }
 
 template<ucs_device_level_t level>
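With `sq_fc_mask = (qp_attr.max_tx >> 1) - 1` (set in gdaki.c above), the new predicate requests a signaled completion whenever the WQE index is a multiple of half the queue depth, rather than only on indices whose masked low bits equal 1. A minimal host-side demo of the arithmetic (standalone sketch with example values, not UCX code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Example queue depth; must be a power of two, as the FC comment notes. */
    uint16_t max_tx  = 8;
    uint16_t fc_mask = (max_tx >> 1) - 1; /* 3 */

    /* Prints request_completion=1 for wqe_idx 0 and 4, i.e. every max_tx/2. */
    for (uint16_t wqe_idx = 0; wqe_idx < max_tx; wqe_idx++) {
        printf("wqe_idx=%u request_completion=%d\n", (unsigned)wqe_idx,
               !(wqe_idx & fc_mask));
    }
    return 0;
}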
@@ -494,82 +564,9 @@ uct_rc_mlx5_gda_qedump(const char *pfx, void *buff, ssize_t len)
     }
 }
 
-UCS_F_DEVICE void uct_rc_mlx5_gda_progress_thread(uct_rc_gdaki_dev_ep_t *ep)
-{
-    void *cqe        = ep->cqe_daddr;
-    size_t cqe_num   = ep->cqe_num;
-    uint64_t cqe_idx = ep->cqe_ci;
-    uint32_t idx     = cqe_idx & (cqe_num - 1);
-    void *curr_cqe   = (uint8_t*)cqe + (idx * DOCA_GPUNETIO_VERBS_CQE_SIZE);
-    auto *cqe64      = reinterpret_cast<mlx5_cqe64*>(curr_cqe);
-
-    /* Read last 3 fields with a single atomic operation */
-    uint32_t *data_ptr = (uint32_t *)&cqe64->wqe_counter;
-    uint32_t data      = READ_ONCE(*data_ptr);
-    uint8_t op_owner   = data >> 24;
-    if ((op_owner & MLX5_CQE_OWNER_MASK) ^ !!(cqe_idx & cqe_num)) {
-        return;
-    }
-
-    cuda::atomic_ref<uint64_t, cuda::thread_scope_device> ref(ep->cqe_ci);
-    if (!ref.compare_exchange_strong(cqe_idx, cqe_idx + 1,
-                                     cuda::std::memory_order_relaxed)) {
-        return;
-    }
-
-    uint8_t opcode    = op_owner >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
-    uint32_t data_cpu = doca_gpu_dev_verbs_bswap32(data);
-    uint16_t wqe_cnt  = (data_cpu >> 16) & 0xffff;
-    uint16_t wqe_idx  = wqe_cnt & (ep->sq_wqe_num - 1);
-
-    cuda::atomic_ref<uint64_t, cuda::thread_scope_device> pi_ref(ep->sq_wqe_pi);
-    uint64_t sq_wqe_pi = pi_ref.load(cuda::std::memory_order_relaxed);
-    uint64_t new_wqe_pi;
-
-    do {
-        /* Skip CQE if it's older than current producer index, could be already
-         * processed by another thread. This handles CQE wrap-around. */
-        if ((int16_t)(wqe_cnt - (uint16_t)sq_wqe_pi) < 0) {
-            return;
-        }
-
-        uint16_t completed_delta = wqe_cnt - (uint16_t)sq_wqe_pi;
-        new_wqe_pi = sq_wqe_pi + completed_delta + 1;
-    } while (!pi_ref.compare_exchange_weak(sq_wqe_pi, new_wqe_pi,
-                                           cuda::std::memory_order_release,
-                                           cuda::std::memory_order_relaxed));
-
-    if (opcode == MLX5_CQE_REQ) {
-        atomicAdd(&ep->avail_count, (int32_t)(new_wqe_pi - sq_wqe_pi));
-        return;
-    }
-
-    auto err_cqe = reinterpret_cast<mlx5_err_cqe_ex*>(cqe64);
-    auto wqe_ptr = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx);
-    ucs_device_error("CQE[%d] with syndrome:%x vendor:%x hw:%x "
-                     "wqe_idx:0x%x qp:0x%x",
-                     idx, err_cqe->syndrome, err_cqe->vendor_err_synd,
-                     err_cqe->hw_err_synd, wqe_idx,
-                     doca_gpu_dev_verbs_bswap32(err_cqe->s_wqe_opcode_qpn) &
-                     0xffffff);
-    uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64);
-    uct_rc_mlx5_gda_qedump("CQE", cqe64, 64);
-    pi_ref.fetch_max(sq_wqe_pi | UCT_RC_GDA_WQE_ERR);
-}
-
 template<ucs_device_level_t level>
 UCS_F_DEVICE void uct_rc_mlx5_gda_ep_progress(uct_device_ep_h tl_ep)
 {
-    uct_rc_gdaki_dev_ep_t *ep = (uct_rc_gdaki_dev_ep_t*)tl_ep;
-    unsigned num_lanes;
-    unsigned lane_id;
-
-    uct_rc_mlx5_gda_exec_init<level>(lane_id, num_lanes);
-    if (lane_id == 0) {
-        uct_rc_mlx5_gda_progress_thread(ep);
-    }
-
-    uct_rc_mlx5_gda_sync<level>();
 }
 
 template<ucs_device_level_t level>
@@ -578,13 +575,21 @@ UCS_F_DEVICE ucs_status_t uct_rc_mlx5_gda_ep_check_completion(
 {
     uct_rc_gdaki_dev_ep_t *ep = reinterpret_cast<uct_rc_gdaki_dev_ep_t*>(tl_ep);
     uct_rc_gda_completion_t *comp = &tl_comp->rc_gda;
-    uint64_t sq_wqe_pi = ep->sq_wqe_pi;
+    uint16_t wqe_cnt;
+    uint8_t opcode;
+    uint64_t pi;
+
+    pi = uct_rc_mlx5_gda_parse_cqe(ep, &wqe_cnt, &opcode);
 
-    if ((sq_wqe_pi & UCT_RC_GDA_WQE_MASK) <= comp->wqe_idx) {
+    if (pi < comp->wqe_idx) {
         return UCS_INPROGRESS;
     }
 
-    if (sq_wqe_pi & UCT_RC_GDA_WQE_ERR) {
+    if (opcode == MLX5_CQE_REQ_ERR) {
+        uint16_t wqe_idx = wqe_cnt & (ep->sq_wqe_num - 1);
+        auto wqe_ptr     = uct_rc_mlx5_gda_get_wqe_ptr(ep, wqe_idx);
+        uct_rc_mlx5_gda_qedump("WQE", wqe_ptr, 64);
+        uct_rc_mlx5_gda_qedump("CQE", ep->cqe_daddr, 64);
         return UCS_ERR_IO_ERROR;
     }
 
src/uct/ib/mlx5/gdaki/gdaki_dev.h

Lines changed: 0 additions & 3 deletions
@@ -19,8 +19,6 @@ typedef struct {
 
     uint64_t sq_rsvd_index;
     uint64_t sq_ready_index;
-    uint64_t sq_wqe_pi;
-    uint64_t cqe_ci;
     int sq_lock;
 
     uint8_t *sq_wqe_daddr;
@@ -31,7 +29,6 @@ typedef struct {
     uint16_t sq_wqe_num;
     uint32_t sq_num;
     uint16_t sq_fc_mask;
-    int32_t avail_count;
 } uct_rc_gdaki_dev_ep_t;
 
 
test/gtest/ucp/cuda/test_kernels.cu

Lines changed: 4 additions & 3 deletions
@@ -128,11 +128,13 @@ ucp_test_kernel_get_state(const test_ucp_device_kernel_params_t &params,
                                              uct_elem, comp);
         if ((status == UCS_OK) &&
             (device_ep->uct_tl_id == UCT_DEVICE_TL_RC_MLX5_GDA)) {
+            uint16_t wqe_cnt;
             uct_rc_gdaki_dev_ep_t *ep =
                     reinterpret_cast<uct_rc_gdaki_dev_ep_t*>(device_ep);
-            result.producer_index = ep->sq_wqe_pi - result.producer_index;
+            result.producer_index = uct_rc_mlx5_gda_parse_cqe(ep, &wqe_cnt,
+                                                              nullptr) +
+                                    1;
             result.ready_index = ep->sq_ready_index - result.ready_index;
-            result.avail_count = ep->avail_count - result.avail_count;
         }
     }
 
@@ -240,7 +242,6 @@ launch_test_ucp_device_kernel(const test_ucp_device_kernel_params_t &params)
     result->status         = UCS_ERR_NOT_IMPLEMENTED;
     result->producer_index = 0;
     result->ready_index    = 0;
-    result->avail_count    = 0;
 
     switch (params.level) {
     case UCS_DEVICE_LEVEL_THREAD:

test/gtest/ucp/cuda/test_kernels.h

Lines changed: 0 additions & 1 deletion
@@ -68,7 +68,6 @@ struct test_ucp_device_kernel_result_t {
     ucs_status_t status;
     uint64_t producer_index;
    uint64_t ready_index;
-    int32_t avail_count;
 };
 
 test_ucp_device_kernel_result_t

test/gtest/ucp/test_ucp_device.cc

Lines changed: 0 additions & 1 deletion
@@ -406,7 +406,6 @@ class test_ucp_device_kernel : public test_ucp_device {
         EXPECT_UCS_OK(result.status);
         EXPECT_EQ(expected, result.producer_index);
         EXPECT_EQ(expected, result.ready_index);
-        EXPECT_EQ(0, result.avail_count);
     }
 };
 