algorithm1832
diff --git a/paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp b/paddle/fluid/distributed/collective/deep_ep/deep_ep.cpp
Lines changed: 10 additions & 5 deletions
diff --git a/paddle/fluid/distributed/collective/deep_ep/deep_ep.hpp b/paddle/fluid/distributed/collective/deep_ep/deep_ep.hpp
Lines changed: 4 additions & 2 deletions
diff --git a/paddle/fluid/distributed/collective/deep_ep/kernels/api.cuh b/paddle/fluid/distributed/collective/deep_ep/kernels/api.cuh
Lines changed: 2 additions & 1 deletion
@@ -1752,7 +1752,8 @@ Buffer::low_latency_dispatch(
     int num_experts,
     bool use_fp8,
     bool async,
-    bool return_recv_hook) {
+    bool return_recv_hook,
+    int num_per_channel) {
   EP_HOST_ASSERT(low_latency_mode);
 
   // Tensor checks
@@ -1768,7 +1769,8 @@ Buffer::low_latency_dispatch(
 
   auto num_tokens = static_cast<int>(x.size(0)),
        hidden = static_cast<int>(x.size(1));
-  auto num_scales = hidden / 128, num_topk = static_cast<int>(topk_idx.size(1));
+  auto num_scales = num_per_channel == -1 ? 1 : hidden / 128,
+       num_topk = static_cast<int>(topk_idx.size(1));
   int num_local_experts = num_experts / num_ranks;
 
   // Buffer control
@@ -1872,7 +1874,8 @@ Buffer::low_latency_dispatch(
                            use_fp8,
                            workspace,
                            launch_stream,
-                           phases);
+                           phases,
+                           num_per_channel);
   };
   launcher(return_recv_hook
                ? LOW_LATENCY_SEND_PHASE
@@ -2976,7 +2979,8 @@ Buffer::low_latency_dispatch_api(
     int num_experts,
     bool use_fp8,
     bool async,
-    bool return_recv_hook) {
+    bool return_recv_hook,
+    int num_per_channel) {
 #ifdef PADDLE_WITH_NVSHMEM
   const auto& x_ = ConvertPaddleTensorToDetailTensor(x);
   const auto& topk_idx_ = ConvertPaddleTensorToDetailTensor(topk_idx);
@@ -2994,7 +2998,8 @@ Buffer::low_latency_dispatch_api(
                                   num_experts,
                                   use_fp8,
                                   async,
-                                  return_recv_hook);
+                                  return_recv_hook,
+                                  num_per_channel);
 
   auto packed_recv_x_ = ConvertDetailTensorToPaddleTensor(std::get<0>(res));
 
 
@@ -279,7 +279,8 @@ struct Buffer {
       int num_experts,
       bool use_fp8,
       bool async,
-      bool return_recv_hook);
+      bool return_recv_hook,
+      int num_per_channel);
 
   std::tuple<deep_ep::detail::Tensor,
              std::optional<EventHandle>,
@@ -452,7 +453,8 @@ struct Buffer {
       int num_experts,
       bool use_fp8,
       bool async,
-      bool return_recv_hook);
+      bool return_recv_hook,
+      int num_per_channel);
 
   std::tuple<paddle::Tensor,
              std::optional<EventHandle>,
 
@@ -318,7 +318,8 @@ void dispatch(void* packed_recv_x,
               bool use_fp8,
               void* workspace,
               cudaStream_t stream,
-              int phases);
+              int phases,
+              int num_per_channel);
 
 void combine(void* combined_x,
              void* rdma_recv_x,