
Commit 2be3040

UCP/RNDV: Throttle rndv fragment requests (both pipeline and standalone mtype requests)
Parent: e8ab489

9 files changed: +208 -6 lines changed

src/ucp/core/ucp_context.c (10 additions, 0 deletions)

```diff
@@ -383,6 +383,16 @@ static ucs_config_field_t ucp_context_config_table[] = {
    "even if invalidation workflow isn't supported",
    ucs_offsetof(ucp_context_config_t, rndv_errh_ppln_enable), UCS_CONFIG_TYPE_BOOL},
 
+  {"RNDV_PIPELINE_WORKER_FC_ENABLE", "n",
+   "Enable worker-level flow control to limit total concurrent pipeline fragments\n"
+   "across all requests, preventing memory exhaustion",
+   ucs_offsetof(ucp_context_config_t, rndv_ppln_worker_fc_enable), UCS_CONFIG_TYPE_BOOL},
+
+  {"RNDV_PIPELINE_WORKER_MAX_FRAGS", "5000",
+   "Maximum number of concurrent pipeline fragments per worker\n"
+   "(only applies when RNDV_PIPELINE_WORKER_FC_ENABLE=y)",
+   ucs_offsetof(ucp_context_config_t, rndv_ppln_worker_max_frags), UCS_CONFIG_TYPE_ULUNITS},
+
   {"FLUSH_WORKER_EPS", "y",
    "Enable flushing the worker by flushing its endpoints. Allows completing\n"
    "the flush operation in a bounded time even if there are new requests on\n"
```

src/ucp/core/ucp_context.h (4 additions, 0 deletions)

```diff
@@ -98,6 +98,10 @@ typedef struct ucp_context_config {
     int rndv_shm_ppln_enable;
     /** Enable error handling for rndv pipeline protocol */
     int rndv_errh_ppln_enable;
+    /** Enable flow control for rndv pipeline fragments at worker level */
+    int rndv_ppln_worker_fc_enable;
+    /** Maximum number of concurrent pipeline fragments per worker */
+    size_t rndv_ppln_worker_max_frags;
     /** Threshold for using tag matching offload capabilities. Smaller buffers
      * will not be posted to the transport. */
     size_t tm_thresh;
```

src/ucp/core/ucp_request.h (4 additions, 1 deletion)

```diff
@@ -318,7 +318,10 @@ struct ucp_request {
             /* Used by rndv/send/ppln and rndv/recv/ppln */
             struct {
                 /* Size to send in ack message */
-                ssize_t ack_data_size;
+                ssize_t          ack_data_size;
+                /* Element in worker-level pending queue
+                 * for throttled ppln requests */
+                ucs_queue_elem_t queue_elem;
             } ppln;
 
             /* Used by rndv/rkey_ptr */
```

src/ucp/core/ucp_worker.c (4 additions, 0 deletions)

```diff
@@ -2519,6 +2519,10 @@ ucs_status_t ucp_worker_create(ucp_context_h context,
     worker->counters.ep_closures = 0;
     worker->counters.ep_failures = 0;
 
+    /* Initialize RNDV pipeline flow control */
+    worker->rndv_ppln_fc.active_frags = 0;
+    ucs_queue_head_init(&worker->rndv_ppln_fc.pending_q);
+
     /* Copy user flags, and mask-out unsupported flags for compatibility */
     worker->flags = UCP_PARAM_VALUE(WORKER, params, flags, FLAGS, 0) &
                     UCS_MASK(UCP_WORKER_INTERNAL_FLAGS_SHIFT);
```

src/ucp/core/ucp_worker.h (6 additions, 0 deletions)

```diff
@@ -393,6 +393,12 @@ typedef struct ucp_worker {
         uint64_t ep_failures;
     } counters;
 
+    struct {
+        /* Worker-level ppln fragment flow control */
+        size_t           active_frags; /* Current active fragments */
+        ucs_queue_head_t pending_q;    /* Queue of throttled ppln requests */
+    } rndv_ppln_fc;
+
     struct {
         /* Usage tracker handle */
         ucs_usage_tracker_h handle;
```
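Taken together, `active_frags` and `pending_q` implement a simple worker-wide credit scheme: each staged fragment consumes a credit, requests that cannot get a credit are parked in FIFO order, and each completed fragment returns a credit and wakes at most one parked request. Below is a stripped-down, self-contained sketch of that pattern, not the UCX code itself: plain C with a hand-rolled list standing in for `ucs_queue`, and direct resumption standing in for the progress callbackq used by the real implementation.

```c
/* Illustration only: per-worker fragment counter plus a FIFO of throttled
 * requests, resumed one at a time as fragments complete. */
#include <stdio.h>
#include <stddef.h>

#define MAX_FRAGS 2 /* stand-in for rndv_ppln_worker_max_frags */

typedef struct request {
    int             id;
    struct request *next; /* stand-in for ucs_queue_elem_t */
} request_t;

typedef struct worker {
    size_t     active_frags;          /* fragments currently in flight */
    request_t *pend_head, *pend_tail; /* FIFO of throttled requests */
} worker_t;

/* Try to start a fragment; if the worker limit is reached, park the request */
static void request_progress(worker_t *w, request_t *req)
{
    if (w->active_frags >= MAX_FRAGS) {
        req->next = NULL;
        if (w->pend_tail != NULL) {
            w->pend_tail->next = req;
        } else {
            w->pend_head = req;
        }
        w->pend_tail = req;
        printf("req %d throttled (active=%zu)\n", req->id, w->active_frags);
        return;
    }

    w->active_frags++;
    printf("req %d started a fragment (active=%zu)\n", req->id, w->active_frags);
}

/* A fragment finished: release the credit and resume one parked request */
static void fragment_complete(worker_t *w)
{
    request_t *req;

    w->active_frags--;
    if (w->pend_head == NULL) {
        return;
    }

    req          = w->pend_head;
    w->pend_head = req->next;
    if (w->pend_head == NULL) {
        w->pend_tail = NULL;
    }

    /* The real code defers this resumption to the worker progress queue */
    request_progress(w, req);
}

int main(void)
{
    worker_t  w  = {0, NULL, NULL};
    request_t r1 = {1, NULL}, r2 = {2, NULL}, r3 = {3, NULL};

    request_progress(&w, &r1);
    request_progress(&w, &r2);
    request_progress(&w, &r3); /* throttled: only MAX_FRAGS in flight */
    fragment_complete(&w);     /* r3 is resumed and starts its fragment */
    return 0;
}
```

The actual commit defers the resumed request to the worker progress queue via `ucs_callbackq_add_oneshot()`, as shown in the rndv_ppln.c diff below, rather than re-entering the progress function directly.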

src/ucp/rndv/rndv_get.c (42 additions, 1 deletion)

```diff
@@ -260,11 +260,18 @@ ucp_proto_rndv_get_mtype_unpack_completion(uct_completion_t *uct_comp)
 {
     ucp_request_t *req = ucs_container_of(uct_comp, ucp_request_t,
                                            send.state.uct_comp);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
 
     ucs_mpool_put_inline(req->send.rndv.mdesc);
     if (ucp_proto_rndv_request_is_ppln_frag(req)) {
         ucp_proto_rndv_ppln_recv_frag_complete(req, 1, 0);
     } else {
+        /* Decrement worker-level throttle counter for standalone mtype */
+        if (context->config.ext.rndv_ppln_worker_fc_enable) {
+            ucs_assert(worker->rndv_ppln_fc.active_frags > 0);
+            worker->rndv_ppln_fc.active_frags--;
+        }
         ucp_proto_rndv_recv_complete_with_ats(req,
                                               UCP_PROTO_RNDV_GET_STAGE_ATS);
     }
@@ -284,21 +291,45 @@ ucp_proto_rndv_get_mtype_fetch_completion(uct_completion_t *uct_comp)
 static ucs_status_t
 ucp_proto_rndv_get_mtype_fetch_progress(uct_pending_req_t *uct_req)
 {
-    ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, send.uct);
+    ucp_request_t *req    = ucs_container_of(uct_req, ucp_request_t, send.uct);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
     const ucp_proto_rndv_bulk_priv_t *rpriv;
     ucs_status_t status;
 
     /* coverity[tainted_data_downcast] */
     rpriv = req->send.proto_config->priv;
 
     if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) {
+        /* Check worker-level throttling limit before allocating fragment.
+         * Skip for ppln_frag requests since ppln already handles throttling */
+        if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+            context->config.ext.rndv_ppln_worker_fc_enable &&
+            (worker->rndv_ppln_fc.active_frags >=
+             context->config.ext.rndv_ppln_worker_max_frags)) {
+            ucs_trace_req("get_mtype_progress: worker throttle limit reached "
+                          "active_frags=%zu max=%zu, queuing request",
+                          worker->rndv_ppln_fc.active_frags,
+                          context->config.ext.rndv_ppln_worker_max_frags);
+            /* Queue to ppln pending_q and schedule for later retry */
+            ucs_queue_push(&worker->rndv_ppln_fc.pending_q,
+                           &req->send.rndv.ppln.queue_elem);
+            return UCS_OK;
+        }
+
         status = ucp_proto_rndv_mtype_request_init(req, rpriv->frag_mem_type,
                                                    rpriv->frag_sys_dev);
         if (status != UCS_OK) {
             ucp_proto_request_abort(req, status);
             return UCS_OK;
         }
 
+        /* Increment throttle counter after successful allocation */
+        if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+            context->config.ext.rndv_ppln_worker_fc_enable) {
+            worker->rndv_ppln_fc.active_frags++;
+        }
+
         ucp_proto_rndv_get_common_request_init(req);
         ucp_proto_completion_init(&req->send.state.uct_comp,
                                   ucp_proto_rndv_get_mtype_fetch_completion);
@@ -356,6 +387,9 @@ ucp_proto_rndv_get_mtype_query(const ucp_proto_query_params_t *params,
 
 static ucs_status_t ucp_proto_rndv_get_mtype_reset(ucp_request_t *req)
 {
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
+
     if (!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED)) {
         return UCS_OK;
     }
@@ -364,6 +398,13 @@ static ucs_status_t ucp_proto_rndv_get_mtype_reset(ucp_request_t *req)
     req->send.rndv.mdesc = NULL;
     req->flags &= ~UCP_REQUEST_FLAG_PROTO_INITIALIZED;
 
+    /* Decrement worker-level throttle counter */
+    if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+        context->config.ext.rndv_ppln_worker_fc_enable) {
+        ucs_assert(worker->rndv_ppln_fc.active_frags > 0);
+        worker->rndv_ppln_fc.active_frags--;
+    }
+
     if ((req->send.proto_stage != UCP_PROTO_RNDV_GET_STAGE_FETCH) &&
         (req->send.proto_stage != UCP_PROTO_RNDV_GET_STAGE_ATS)) {
         ucp_proto_fatal_invalid_stage(req, "reset");
```

src/ucp/rndv/rndv_ppln.c (58 additions, 3 deletions)

```diff
@@ -13,6 +13,7 @@
 #include <ucp/proto/proto_debug.h>
 #include <ucp/proto/proto_multi.inl>
 #include <ucp/proto/proto_init.h>
+#include <ucs/datastruct/callbackq.h>
 
 
 enum {
@@ -32,6 +33,17 @@ typedef struct {
     size_t frag_proto_min_length; /* Frag proto min length */
 } ucp_proto_rndv_ppln_priv_t;
 
+/* A callback to reschedule a throttled ppln request, that is called from the
+ * worker's progress queue */
+static unsigned ucp_proto_rndv_ppln_reschedule_progress(void *arg)
+{
+    ucp_request_t *req = arg;
+    ucs_trace_req("ppln reschedule progress for request %p", req);
+    ucp_request_send(req);
+
+    return 1;
+}
+
 static ucs_status_t
 ucp_proto_rndv_ppln_add_overhead(ucp_proto_perf_t *ppln_perf, size_t frag_size)
 {
@@ -209,12 +221,37 @@ ucp_proto_rndv_ppln_frag_complete(ucp_request_t *freq, int send_ack, int abort,
                                   ucp_proto_complete_cb_t complete_func,
                                   const char *title)
 {
-    ucp_request_t *req = ucp_request_get_super(freq);
+    ucp_request_t *req    = ucp_request_get_super(freq);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
+    int fc_enabled        = context->config.ext.rndv_ppln_worker_fc_enable;
 
     if (send_ack) {
         req->send.rndv.ppln.ack_data_size += freq->send.state.dt_iter.length;
     }
 
+    if (fc_enabled) {
+        ucs_assert(worker->rndv_ppln_fc.active_frags > 0);
+        worker->rndv_ppln_fc.active_frags--;
+
+        ucs_trace_req("%s frag complete, worker active_frags=%zu",
+                      title, worker->rndv_ppln_fc.active_frags);
+
+        /* Reschedule throttled requests to the progress queue, if any are
+         * pending. */
+        if (!ucs_queue_is_empty(&worker->rndv_ppln_fc.pending_q)) {
+            ucp_request_t *pending_req;
+            ucs_queue_elem_t *elem;
+
+            elem        = ucs_queue_pull(&worker->rndv_ppln_fc.pending_q);
+            pending_req = ucs_container_of(elem, ucp_request_t,
+                                           send.rndv.ppln.queue_elem);
+            ucs_callbackq_add_oneshot(&worker->uct->progress_q, pending_req,
+                                      ucp_proto_rndv_ppln_reschedule_progress,
+                                      pending_req);
+        }
+    }
+
     /* In case of abort we don't destroy super request until all fragments are
      * completed */
     if (!ucp_proto_rndv_frag_complete(req, freq, title)) {
@@ -252,8 +289,10 @@ void ucp_proto_rndv_ppln_recv_frag_complete(ucp_request_t *freq, int send_ack,
 
 static ucs_status_t ucp_proto_rndv_ppln_progress(uct_pending_req_t *uct_req)
 {
-    ucp_request_t *req = ucs_container_of(uct_req, ucp_request_t, send.uct);
-    ucp_worker_h worker = req->send.ep->worker;
+    ucp_request_t *req    = ucs_container_of(uct_req, ucp_request_t, send.uct);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
+    int fc_enabled        = context->config.ext.rndv_ppln_worker_fc_enable;
     const ucp_proto_rndv_ppln_priv_t *rpriv;
     ucp_datatype_iter_t next_iter;
     ucs_status_t status;
@@ -271,6 +310,22 @@ static ucs_status_t ucp_proto_rndv_ppln_progress(uct_pending_req_t *uct_req)
     rpriv = req->send.proto_config->priv;
 
     while (!ucp_datatype_iter_is_end(&req->send.state.dt_iter)) {
+        /* Check throttling limit */
+        if (fc_enabled &&
+            (worker->rndv_ppln_fc.active_frags >=
+             context->config.ext.rndv_ppln_worker_max_frags)) {
+
+            /* Add request to the pending queue. It will be rescheduled
+             * when other fragments complete. */
+            ucs_queue_push(&worker->rndv_ppln_fc.pending_q,
+                           &req->send.rndv.ppln.queue_elem);
+            return UCS_OK;
+        }
+
+        if (fc_enabled) {
+            worker->rndv_ppln_fc.active_frags++;
+        }
+
         status = ucp_proto_rndv_frag_request_alloc(worker, req, &freq);
         if (status != UCS_OK) {
             ucp_proto_request_abort(req, status);
```

src/ucp/rndv/rndv_put.c (34 additions, 0 deletions)

```diff
@@ -520,18 +520,43 @@ ucp_proto_rndv_put_mtype_copy_progress(uct_pending_req_t *uct_req)
     ucp_request_t *req = ucs_container_of(uct_req,
                                           ucp_request_t,
                                           send.uct);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
     const ucp_proto_rndv_put_priv_t *rpriv = req->send.proto_config->priv;
     ucs_status_t status;
 
     ucs_assert(!(req->flags & UCP_REQUEST_FLAG_PROTO_INITIALIZED));
 
+    /* Check throttling limit before allocating fragment.
+     * Skip for ppln_frag requests since ppln already handles throttling */
+    if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+        context->config.ext.rndv_ppln_worker_fc_enable &&
+        (worker->rndv_ppln_fc.active_frags >=
+         context->config.ext.rndv_ppln_worker_max_frags)) {
+        ucs_trace_req("put_mtype_progress: worker throttle limit reached "
+                      "active_frags=%zu max=%zu, queuing request",
+                      worker->rndv_ppln_fc.active_frags,
+                      context->config.ext.rndv_ppln_worker_max_frags);
+
+        /* Queue to ppln pending_q and schedule for later retry */
+        ucs_queue_push(&worker->rndv_ppln_fc.pending_q,
+                       &req->send.rndv.ppln.queue_elem);
+        return UCS_OK;
+    }
+
     status = ucp_proto_rndv_mtype_request_init(req, rpriv->bulk.frag_mem_type,
                                                rpriv->bulk.frag_sys_dev);
     if (status != UCS_OK) {
         ucp_proto_request_abort(req, status);
         return UCS_OK;
     }
 
+    /* Increment throttle counter after successful allocation */
+    if (!ucp_proto_rndv_request_is_ppln_frag(req) &&
+        context->config.ext.rndv_ppln_worker_fc_enable) {
+        worker->rndv_ppln_fc.active_frags++;
+    }
+
     ucp_proto_rndv_put_common_request_init(req);
     req->flags |= UCP_REQUEST_FLAG_PROTO_INITIALIZED;
     ucp_proto_rndv_mdesc_mtype_copy(req, uct_ep_get_zcopy,
@@ -560,9 +585,18 @@ static void ucp_proto_rndv_put_mtype_completion(uct_completion_t *uct_comp)
 {
     ucp_request_t *req = ucs_container_of(uct_comp, ucp_request_t,
                                            send.state.uct_comp);
+    ucp_worker_h worker   = req->send.ep->worker;
+    ucp_context_h context = worker->context;
 
     ucp_trace_req(req, "rndv_put_mtype_completion");
     ucs_mpool_put(req->send.rndv.mdesc);
+
+    /* Decrement throttle counter for mtype allocations */
+    if (context->config.ext.rndv_ppln_worker_fc_enable) {
+        ucs_assert(worker->rndv_ppln_fc.active_frags > 0);
+        worker->rndv_ppln_fc.active_frags--;
+    }
+
     ucp_proto_rndv_put_common_complete(req);
 }
 
```
