
Commit 2be3040

UCP/RNDV: Throttle rndv fragment requests (both pipeline and standalone mtype requests)
Parent: e8ab489

9 files changed: +208 -6 lines changed

src/ucp/core/ucp_context.c (10 additions, 0 deletions)

```diff
@@ -383,6 +383,16 @@ static ucs_config_field_t ucp_context_config_table[] = {
    "even if invalidation workflow isn't supported",
    ucs_offsetof(ucp_context_config_t, rndv_errh_ppln_enable), UCS_CONFIG_TYPE_BOOL},
 
+  {"RNDV_PIPELINE_WORKER_FC_ENABLE", "n",
+   "Enable worker-level flow control to limit total concurrent pipeline fragments\n"
+   "across all requests, preventing memory exhaustion",
+   ucs_offsetof(ucp_context_config_t, rndv_ppln_worker_fc_enable), UCS_CONFIG_TYPE_BOOL},
+
+  {"RNDV_PIPELINE_WORKER_MAX_FRAGS", "5000",
+   "Maximum number of concurrent pipeline fragments per worker\n"
+   "(only applies when RNDV_PIPELINE_WORKER_FC_ENABLE=y)",
+   ucs_offsetof(ucp_context_config_t, rndv_ppln_worker_max_frags), UCS_CONFIG_TYPE_ULUNITS},
+
   {"FLUSH_WORKER_EPS", "y",
    "Enable flushing the worker by flushing its endpoints. Allows completing\n"
    "the flush operation in a bounded time even if there are new requests on\n"
```

src/ucp/core/ucp_context.h (4 additions, 0 deletions)

```diff
@@ -98,6 +98,10 @@ typedef struct ucp_context_config {
     int rndv_shm_ppln_enable;
     /** Enable error handling for rndv pipeline protocol */
     int rndv_errh_ppln_enable;
+    /** Enable flow control for rndv pipeline fragments at worker level */
+    int rndv_ppln_worker_fc_enable;
+    /** Maximum number of concurrent pipeline fragments per worker */
+    size_t rndv_ppln_worker_max_frags;
     /** Threshold for using tag matching offload capabilities. Smaller buffers
      * will not be posted to the transport. */
     size_t tm_thresh;
```

src/ucp/core/ucp_request.h (4 additions, 1 deletion)

```diff
@@ -318,7 +318,10 @@ struct ucp_request {
             /* Used by rndv/send/ppln and rndv/recv/ppln */
             struct {
                 /* Size to send in ack message */
-                ssize_t ack_data_size;
+                ssize_t          ack_data_size;
+                /* Element in worker-level pending queue
+                 * for throttled ppln requests */
+                ucs_queue_elem_t queue_elem;
             } ppln;
 
             /* Used by rndv/rkey_ptr */
```

src/ucp/core/ucp_worker.c (4 additions, 0 deletions)

```diff
@@ -2519,6 +2519,10 @@ ucs_status_t ucp_worker_create(ucp_context_h context,
     worker->counters.ep_closures = 0;
     worker->counters.ep_failures = 0;
 
+    /* Initialize RNDV pipeline flow control */
+    worker->rndv_ppln_fc.active_frags = 0;
+    ucs_queue_head_init(&worker->rndv_ppln_fc.pending_q);
+
     /* Copy user flags, and mask-out unsupported flags for compatibility */
     worker->flags = UCP_PARAM_VALUE(WORKER, params, flags, FLAGS, 0) &
                     UCS_MASK(UCP_WORKER_INTERNAL_FLAGS_SHIFT);
```

src/ucp/core/ucp_worker.h (6 additions, 0 deletions)

```diff
@@ -393,6 +393,12 @@ typedef struct ucp_worker {
         uint64_t ep_failures;
     } counters;
 
+    struct {
+        /* Worker-level ppln fragment flow control */
+        size_t           active_frags; /* Current active fragments */
+        ucs_queue_head_t pending_q;    /* Queue of throttled ppln requests */
+    } rndv_ppln_fc;
+
     struct {
         /* Usage tracker handle */
         ucs_usage_tracker_h handle;
```
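Taken together, `active_frags` and `pending_q` implement a simple worker-wide credit scheme: each staged fragment consumes a credit, requests that cannot get a credit are parked in FIFO order, and each completed fragment returns a credit and wakes at most one parked request. Below is a stripped-down, self-contained sketch of that pattern, not the UCX code itself: plain C with a hand-rolled list standing in for `ucs_queue`, and direct resumption standing in for the progress callbackq used by the real implementation.

```c
/* Illustration only: per-worker fragment counter plus a FIFO of throttled
 * requests, resumed one at a time as fragments complete. */
#include <stdio.h>
#include <stddef.h>

#define MAX_FRAGS 2 /* stand-in for rndv_ppln_worker_max_frags */

typedef struct request {
    int             id;
    struct request *next; /* stand-in for ucs_queue_elem_t */
} request_t;

typedef struct worker {
    size_t     active_frags;          /* fragments currently in flight */
    request_t *pend_head, *pend_tail; /* FIFO of throttled requests */
} worker_t;

/* Try to start a fragment; if the worker limit is reached, park the request */
static void request_progress(worker_t *w, request_t *req)
{
    if (w->active_frags >= MAX_FRAGS) {
        req->next = NULL;
        if (w->pend_tail != NULL) {
            w->pend_tail->next = req;
        } else {
            w->pend_head = req;
        }
        w->pend_tail = req;
        printf("req %d throttled (active=%zu)\n", req->id, w->active_frags);
        return;
    }

    w->active_frags++;
    printf("req %d started a fragment (active=%zu)\n", req->id, w->active_frags);
}

/* A fragment finished: release the credit and resume one parked request */
static void fragment_complete(worker_t *w)
{
    request_t *req;

    w->active_frags--;
    if (w->pend_head == NULL) {
        return;
    }

    req          = w->pend_head;
    w->pend_head = req->next;
    if (w->pend_head == NULL) {
        w->pend_tail = NULL;
    }

    /* The real code defers this resumption to the worker progress queue */
    request_progress(w, req);
}

int main(void)
{
    worker_t  w  = {0, NULL, NULL};
    request_t r1 = {1, NULL}, r2 = {2, NULL}, r3 = {3, NULL};

    request_progress(&w, &r1);
    request_progress(&w, &r2);
    request_progress(&w, &r3); /* throttled: only MAX_FRAGS in flight */
    fragment_complete(&w);     /* r3 is resumed and starts its fragment */
    return 0;
}
```

The actual commit defers the resumed request to the worker progress queue via `ucs_callbackq_add_oneshot()`, as shown in the rndv_ppln.c diff below, rather than re-entering the progress function directly.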

src/ucp/rndv/rndv_get.c (42 additions, 1 deletion)

```diff
@@ -260,11 +260,18 @@ ucp_proto_rndv_get_mtype_unpack_completion(uct_completion_t *uct_comp)
 {
     ucp_request_t *req = ucs_container_of(uct_comp, ucp_request_t,
                                            send.state.uct_comp);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
 
     ucs_mpool_put_inline(req->send.rndv.mdesc);
     if (ucp_proto_rndv_request_is_ppln_frag(req)) {
         ucp_proto_rndv_ppln_recv_frag_complete(req, 1, 0);
     } else {
+        /* Decrement worker-level throttle counter for standalone mtype */
+        if (context->config.ext.rndv_ppln_worker_fc_enable) {
+            ucs_assert(worker->rndv_ppln_fc.active_frags > 0);
+            worker->rndv_ppln_fc.active_frags--;
+        }
         ucp_proto_rndv_recv_complete_with_ats(req,
                                               UCP_PROTO_RNDV_GET_STAGE_ATS);
     }
@@ -284,21 +291,45 @@ ucp_proto_rndv_get_mtype_fetch_completion(uct_completion_t *uct_comp)
 static ucs_status_t
 ucp_proto_rndv_get_mtype_fetch_progress(uct_pending_req_t *uct_req)
 {
-    ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, send.uct);
+    ucp_request_t *req    = ucs_container_of(uct_req, ucp_request_t, send.uct);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
     const ucp_proto_rndv_bulk_priv_t *rpriv;
     ucs_status_t status;
 
     /* coverity[tainted_data_downcast] */
     rpriv = req->send.proto_config->priv;
 
     if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) {
+        /* Check worker-level throttling limit before allocating fragment.
+         * Skip for ppln_frag requests since ppln already handles throttling */
+        if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+            context->config.ext.rndv_ppln_worker_fc_enable &&
+            (worker->rndv_ppln_fc.active_frags >=
+             context->config.ext.rndv_ppln_worker_max_frags)) {
+            ucs_trace_req("get_mtype_progress: worker throttle limit reached "
+                          "active_frags=%zu max=%zu, queuing request",
+                          worker->rndv_ppln_fc.active_frags,
+                          context->config.ext.rndv_ppln_worker_max_frags);
+            /* Queue to ppln pending_q and schedule for later retry */
+            ucs_queue_push(&worker->rndv_ppln_fc.pending_q,
+                           &req->send.rndv.ppln.queue_elem);
+            return UCS_OK;
+        }
+
         status = ucp_proto_rndv_mtype_request_init(req, rpriv->frag_mem_type,
                                                    rpriv->frag_sys_dev);
         if (status != UCS_OK) {
             ucp_proto_request_abort(req, status);
             return UCS_OK;
         }
 
+        /* Increment throttle counter after successful allocation */
+        if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+            context->config.ext.rndv_ppln_worker_fc_enable) {
+            worker->rndv_ppln_fc.active_frags++;
+        }
+
         ucp_proto_rndv_get_common_request_init(req);
         ucp_proto_completion_init(&req->send.state.uct_comp,
                                   ucp_proto_rndv_get_mtype_fetch_completion);
@@ -356,6 +387,9 @@ ucp_proto_rndv_get_mtype_query(const ucp_proto_query_params_t *params,
 
 static ucs_status_t ucp_proto_rndv_get_mtype_reset(ucp_request_t *req)
 {
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
+
     if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) {
         return UCS_OK;
     }
@@ -364,6 +398,13 @@ static ucs_status_t ucp_proto_rndv_get_mtype_reset(ucp_request_t *req)
     req->send.rndv.mdesc = NULL;
     req->flags &= ~UCP_REQUEST_FLAG_PROTO_INITIALIZED;
 
+    /* Decrement worker-level throttle counter */
+    if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+        context->config.ext.rndv_ppln_worker_fc_enable) {
+        ucs_assert(worker->rndv_ppln_fc.active_frags > 0);
+        worker->rndv_ppln_fc.active_frags--;
+    }
+
     if ((req->send.proto_stage != UCP_PROTO_RNDV_GET_STAGE_FETCH) &&
         (req->send.proto_stage != UCP_PROTO_RNDV_GET_STAGE_ATS)) {
         ucp_proto_fatal_invalid_stage(req, "reset");
```

src/ucp/rndv/rndv_ppln.c (58 additions, 3 deletions)

```diff
@@ -13,6 +13,7 @@
 #include <ucp/proto/proto_debug.h>
 #include <ucp/proto/proto_multi.inl>
 #include <ucp/proto/proto_init.h>
+#include <ucs/datastruct/callbackq.h>
 
 
 enum {
@@ -32,6 +33,17 @@ typedef struct {
     size_t frag_proto_min_length; /* Frag proto min length */
 } ucp_proto_rndv_ppln_priv_t;
 
+/* A callback to reschedule a throttled ppln request, that is called from the
+ * worker's progress queue */
+static unsigned ucp_proto_rndv_ppln_reschedule_progress(void *arg)
+{
+    ucp_request_t *req = arg;
+    ucs_trace_req("ppln reschedule progress for request %p", req);
+    ucp_request_send(req);
+
+    return 1;
+}
+
 static ucs_status_t
 ucp_proto_rndv_ppln_add_overhead(ucp_proto_perf_t *ppln_perf, size_t frag_size)
 {
@@ -209,12 +221,37 @@ ucp_proto_rndv_ppln_frag_complete(ucp_request_t *freq, int send_ack, int abort,
                                   ucp_proto_complete_cb_t complete_func,
                                   const char *title)
 {
-    ucp_request_t *req = ucp_request_get_super(freq);
+    ucp_request_t *req    = ucp_request_get_super(freq);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
+    int fc_enabled        = context->config.ext.rndv_ppln_worker_fc_enable;
 
     if (send_ack) {
         req->send.rndv.ppln.ack_data_size += freq->send.state.dt_iter.length;
     }
 
+    if (fc_enabled) {
+        ucs_assert(worker->rndv_ppln_fc.active_frags > 0);
+        worker->rndv_ppln_fc.active_frags--;
+
+        ucs_trace_req("%s frag complete, worker active_frags=%zu",
+                      title, worker->rndv_ppln_fc.active_frags);
+
+        /* Reschedule throttled requests to the progress queue, if any are
+         * pending. */
+        if (!ucs_queue_is_empty(&worker->rndv_ppln_fc.pending_q)) {
+            ucp_request_t *pending_req;
+            ucs_queue_elem_t *elem;
+
+            elem        = ucs_queue_pull(&worker->rndv_ppln_fc.pending_q);
+            pending_req = ucs_container_of(elem, ucp_request_t,
+                                           send.rndv.ppln.queue_elem);
+            ucs_callbackq_add_oneshot(&worker->uct->progress_q, pending_req,
+                                      ucp_proto_rndv_ppln_reschedule_progress,
+                                      pending_req);
+        }
+    }
+
     /* In case of abort we don't destroy super request until all fragments are
      * completed */
     if (!ucp_proto_rndv_frag_complete(req, freq, title)) {
@@ -252,8 +289,10 @@ void ucp_proto_rndv_ppln_recv_frag_complete(ucp_request_t *freq, int send_ack,
 
 static ucs_status_t ucp_proto_rndv_ppln_progress(uct_pending_req_t *uct_req)
 {
-    ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, send.uct);
-    ucp_worker_h worker = req->send.ep->worker;
+    ucp_request_t *req    = ucs_container_of(uct_req, ucp_request_t, send.uct);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
+    int fc_enabled        = context->config.ext.rndv_ppln_worker_fc_enable;
     const ucp_proto_rndv_ppln_priv_t *rpriv;
     ucp_datatype_iter_t next_iter;
     ucs_status_t status;
@@ -271,6 +310,22 @@ static ucs_status_t ucp_proto_rndv_ppln_progress(uct_pending_req_t *uct_req)
     rpriv = req->send.proto_config->priv;
 
     while (!ucp_datatype_iter_is_end(&req->send.state.dt_iter)) {
+        /* Check throttling limit */
+        if (fc_enabled &&
+            (worker->rndv_ppln_fc.active_frags >=
+             context->config.ext.rndv_ppln_worker_max_frags)) {
+
+            /* Add request to the pending queue. It will be rescheduled
+             * when other fragments complete. */
+            ucs_queue_push(&worker->rndv_ppln_fc.pending_q,
+                           &req->send.rndv.ppln.queue_elem);
+            return UCS_OK;
+        }
+
+        if (fc_enabled) {
+            worker->rndv_ppln_fc.active_frags++;
+        }
+
         status = ucp_proto_rndv_frag_request_alloc(worker, req, &freq);
         if (status != UCS_OK) {
             ucp_proto_request_abort(req, status);
```

src/ucp/rndv/rndv_put.c (34 additions, 0 deletions)

```diff
@@ -520,18 +520,43 @@ ucp_proto_rndv_put_mtype_copy_progress(uct_pending_req_t *uct_req)
     ucp_request_t *req = ucs_container_of(uct_req,
                                           ucp_request_t,
                                           send.uct);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
     const ucp_proto_rndv_put_priv_t *rpriv = req->send.proto_config->priv;
     ucs_status_t status;
 
     ucs_assert(!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED));
 
+    /* Check throttling limit before allocating fragment.
+     * Skip for ppln_frag requests since ppln already handles throttling */
+    if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+        context->config.ext.rndv_ppln_worker_fc_enable &&
+        (worker->rndv_ppln_fc.active_frags >=
+         context->config.ext.rndv_ppln_worker_max_frags)) {
+        ucs_trace_req("put_mtype_progress: worker throttle limit reached "
+                      "active_frags=%zu max=%zu, queuing request",
+                      worker->rndv_ppln_fc.active_frags,
+                      context->config.ext.rndv_ppln_worker_max_frags);
+
+        /* Queue to ppln pending_q and schedule for later retry */
+        ucs_queue_push(&worker->rndv_ppln_fc.pending_q,
+                       &req->send.rndv.ppln.queue_elem);
+        return UCS_OK;
+    }
+
     status = ucp_proto_rndv_mtype_request_init(req, rpriv->bulk.frag_mem_type,
                                                rpriv->bulk.frag_sys_dev);
     if (status != UCS_OK) {
         ucp_proto_request_abort(req, status);
         return UCS_OK;
     }
 
+    /* Increment throttle counter after successful allocation */
+    if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+        context->config.ext.rndv_ppln_worker_fc_enable) {
+        worker->rndv_ppln_fc.active_frags++;
+    }
+
     ucp_proto_rndv_put_common_request_init(req);
     req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED;
     ucp_proto_rndv_mdesc_mtype_copy(req, uct_ep_get_zcopy,
@@ -560,9 +585,18 @@ static void ucp_proto_rndv_put_mtype_completion(uct_completion_t *uct_comp)
 {
     ucp_request_t *req = ucs_container_of(uct_comp, ucp_request_t,
                                            send.state.uct_comp);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
 
     ucp_trace_req(req, "rndv_put_mtype_completion");
     ucs_mpool_put(req->send.rndv.mdesc);
+
+    /* Decrement throttle counter for mtype allocations */
+    if (context->config.ext.rndv_ppln_worker_fc_enable) {
+        ucs_assert(worker->rndv_ppln_fc.active_frags > 0);
+        worker->rndv_ppln_fc.active_frags--;
+    }
+
     ucp_proto_rndv_put_common_complete(req);
 }
 
```
