[Inference] Make internode_ll_two_stage support async and hook modes. (#74405)

carryyu · web-flow · commit 698721eee0ac · 2025-08-06T20:22:16.000+08:00
diff --git a/paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp b/paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
@@ -1983,7 +1983,10 @@ Buffer::low_latency_dispatch_two_stage(
   auto next_buffer = layout.buffers[low_latency_buffer_idx ^= 1];
 
   // Wait previous tasks to be finished
-  auto launch_stream = calc_ctx->stream();
+  auto compute_stream = calc_ctx->stream();
+  auto launch_stream = async ? comm_stream : compute_stream;
+  EP_HOST_ASSERT(!(async && return_recv_hook));
+
   auto return_x_dtype = phi::DataType::BFLOAT16;
   if (use_fp8) {
     return_x_dtype = phi::DataType::FLOAT8_E4M3FN;
@@ -2084,11 +2087,16 @@ Buffer::low_latency_dispatch_two_stage(
         phases,
         low_latency_buffer_idx);
   };
-  // TODO(Zhenyu Li): supports async/return_recv_hook
-  launcher((LOW_LATENCY_SEND_PHASE | LOW_LATENCY_RECV_PHASE));
-  // Wait streams
+  launcher(return_recv_hook
+               ? LOW_LATENCY_SEND_PHASE
+               : (LOW_LATENCY_SEND_PHASE | LOW_LATENCY_RECV_PHASE));
+  // Async event
   std::optional<EventHandle> event;
+  if (async) {
+    event = EventHandle(launch_stream);
+  }
   std::optional<std::function<void()>> recv_hook = std::nullopt;
+  if (return_recv_hook) recv_hook = [=]() { launcher(LOW_LATENCY_RECV_PHASE); };
   return {packed_recv_x,
           packed_recv_x_scales,
           packed_rdma_recv_x,
@@ -2158,7 +2166,9 @@ Buffer::low_latency_combine_two_stage(
   auto buffer = layout.buffers[low_latency_buffer_idx];
   auto next_buffer = layout.buffers[low_latency_buffer_idx ^= 1];
 
-  auto launch_stream = calc_ctx->stream();
+  auto compute_stream = calc_ctx->stream();
+  auto launch_stream = async ? comm_stream : compute_stream;
+  EP_HOST_ASSERT(!(async && return_recv_hook));
 
   // Allocate output tensor
   deep_ep::detail::Tensor combined_x;
@@ -2204,12 +2214,17 @@ Buffer::low_latency_combine_two_stage(
                                     dispatch_use_fp8,
                                     low_latency_buffer_idx);
   };
-  // TODO(Zhenyu Li): supports async/return_recv_hook
-  launcher((LOW_LATENCY_SEND_PHASE | LOW_LATENCY_RECV_PHASE));
-  // Wait streams
+  launcher(return_recv_hook
+               ? LOW_LATENCY_SEND_PHASE
+               : (LOW_LATENCY_SEND_PHASE | LOW_LATENCY_RECV_PHASE));
+  // Async event
   std::optional<EventHandle> event;
+  if (async) {
+    event = EventHandle(launch_stream);
+  }
   // Receiver callback
   std::optional<std::function<void()>> recv_hook = std::nullopt;
+  if (return_recv_hook) recv_hook = [=]() { launcher(LOW_LATENCY_RECV_PHASE); };
   // Return values
   return {combined_x, event, recv_hook};
 }
diff --git a/paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll_two_stage.cu b/paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll_two_stage.cu
@@ -148,6 +148,9 @@ __global__ __launch_bounds__(
       num_bytes_per_msg_rdma_revecier_and_nvl_sender % sizeof(int4) == 0);
   EP_DEVICE_ASSERT(num_bytes_per_msg_rdma_to_nvl % sizeof(int4) == 0);
 
+  // Sending phase
+  if ((phases & LOW_LATENCY_SEND_PHASE) == 0) goto LOW_LATENCY_DISPATCH_RECV;
+
   /* RDMA Sender */
   {
     constexpr int kNumElemsPerRead = sizeof(int4) / sizeof(nv_bfloat16);
@@ -363,6 +366,10 @@ __global__ __launch_bounds__(
     }
   }
 
+  // Receiving phase
+LOW_LATENCY_DISPATCH_RECV:
+  if ((phases & LOW_LATENCY_RECV_PHASE) == 0) return;
+
   /* RDMA Receiver and NVL Sender */
   {
     const int sms_per_rdma = num_sms / kNumRdmaRanks;
@@ -828,6 +835,9 @@ __global__ __launch_bounds__(
   const size_t NVL_BUFFER_OFFSET =
       nvl_buffer_id * NVL_BUFFER_X_BYTES_PER_BUFFER;
 
+  // Sending phase
+  if ((phases & LOW_LATENCY_SEND_PHASE) == 0) goto LOW_LATENCY_COMBINE_RECV;
+
   // Clean up next buffer
   if (sm_id == 0) {
 #pragma unroll
@@ -1068,6 +1078,10 @@ __global__ __launch_bounds__(
     }
   }
 
+  // Receiving phase
+LOW_LATENCY_COMBINE_RECV:
+  if ((phases & LOW_LATENCY_RECV_PHASE) == 0) return;
+
   /* RDMA Receiver / RDMA Reducer */
   // Wait all rdma ranks to arrive
   if (sm_id < kNumRdmaRanks) {