cherry-pick fleety's customized moe_permute optimization (#74979)

A-nnonymous · web-flow · commit 91767ebcfe1a · 2025-08-30T02:01:44.000+08:00
* cherry-pick fleety

* fix miscellaneous issues

* recover fp16

* fix miscellaneous issues
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
@@ -6131,7 +6131,8 @@ void MoePermuteInferMeta(const MetaTensor& X,
                          const MetaTensor& expert_prob_topk,
                          const int num_experts,
                          const std::vector<int>& tokens_per_expert,
-                         const int padding_multiplex,
+                         const int padding_alignment,
+                         const bool do_gather,
                          MetaTensor* X_unzipped,
                          MetaTensor* zipped_expertwise_rowmap,
                          MetaTensor* token_prob_unzipped,
@@ -6154,7 +6155,7 @@ void MoePermuteInferMeta(const MetaTensor& X,
                     true,
                     common::errors::InvalidArgument(
                         "Input expert_prob_topk's dtype should be FLOAT32"));
-  if (XScale) {
+  if (XScale && do_gather) {
     PADDLE_ENFORCE_EQ(XScale.dtype(),
                       phi::DataType::FLOAT32,
                       common::errors::InvalidArgument(
@@ -6168,8 +6169,16 @@ void MoePermuteInferMeta(const MetaTensor& X,
   }
   const int rows = X.dims()[0];
   const int cols = X.dims()[1];
-  X_unzipped->set_dims({-1, cols});
-  X_unzipped->set_dtype(X.dtype());
+
+  if (do_gather) {
+    X_unzipped->set_dims({-1, cols});
+    X_unzipped->set_dtype(X.dtype());
+  } else {
+    // Meta only: no gather is performed, so X_unzipped has zero rows.
+    X_unzipped->set_dims({0, cols});
+    X_unzipped->set_dtype(X.dtype());
+  }
+
   zipped_expertwise_rowmap->set_dims({rows, num_experts});
   zipped_expertwise_rowmap->set_dtype(phi::DataType::INT32);
   token_prob_unzipped->set_dims({-1});
@@ -6356,7 +6365,8 @@ void MaskedMultiheadAttentionInferMeta(const MetaTensor& x,
       num_head % k_num_head,
       0,
       errors::InvalidArgument(
-          "The num_head of query must be divisible by the num_head of key, but "
+          "The num_head of query must be divisible by the num_head of key, "
+          "but "
           "received num_head of query is %d, and the num_head of key is %d",
           num_head,
           k_num_head));
@@ -6798,6 +6808,5 @@ void MoeGateDispatchAutoInferMeta(const MetaTensor& x,
   expert_id->set_dims(common::make_ddim({num_rows, k}));
   expert_id->set_dtype(phi::DataType::INT32);
 }
-
 }  // namespace phi
 PD_REGISTER_INFER_META_FN(batch_norm_infer, phi::BatchNormInferInferMeta);
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
@@ -560,7 +560,8 @@ void MoePermuteInferMeta(const MetaTensor& X,
                          const MetaTensor& expert_prob_topk,
                          const int num_experts,
                          const std::vector<int>& tokens_per_expert,
-                         const int padding_multiplex,
+                         const int padding_alignment,
+                         const bool do_gather,
                          MetaTensor* X_unzipped,
                          MetaTensor* zipped_expertwise_rowmap,
                          MetaTensor* token_prob_unzipped,
@@ -858,6 +859,28 @@ void MomentumInferMeta(const MetaTensor& param,
                        MetaTensor* param_out,
                        MetaTensor* velocity_out,
                        MetaTensor* master_param_out);
+void MoePermuteInferMeta(const MetaTensor& X,
+                         const MetaTensor& XScale,
+                         const MetaTensor& expert_routemap_topk,
+                         const MetaTensor& expert_prob_topk,
+                         const int num_experts,
+                         const std::vector<int>& tokens_per_expert,
+                         const int padding_alignment,
+                         const bool do_gather,
+                         MetaTensor* X_unzipped,
+                         MetaTensor* zipped_expertwise_rowmap,
+                         MetaTensor* token_prob_unzipped,
+                         MetaTensor* XScale_unzipped);
+
+void MoeUnpermuteInferMeta(const MetaTensor& unzipped_tokens,
+                           const MetaTensor& zipped_expertwise_rowmap,
+                           const MetaTensor& expert_routemap_topk,
+                           const MetaTensor& unzipped_token_probs,
+                           const int total_zipped_tokens_num,
+                           const int num_experts,
+                           const bool MP,
+                           MetaTensor* zipped_tokens,
+                           MetaTensor* zipped_probs_topk);
 
 void MultiDotInferMeta(const std::vector<const MetaTensor*>& x,
                        MetaTensor* out);
diff --git a/paddle/phi/kernels/gpu/moe_permute_kernel.cu b/paddle/phi/kernels/gpu/moe_permute_kernel.cu
@@ -43,7 +43,11 @@ struct expert_infos {
   }
 };
 
-template <typename X_T, typename routemap_T, typename probs_T, bool has_scale>
+template <typename X_T,
+          typename routemap_T,
+          typename probs_T,
+          bool has_scale,
+          bool do_gather>
 __global__ __launch_bounds__(512) void tokens_unzip_stable_kernel(
     const X_T *__restrict__ X,
     const routemap_T *__restrict__ routemap_topk,
@@ -130,17 +134,19 @@ __global__ __launch_bounds__(512) void tokens_unzip_stable_kernel(
       if (proposed_row_idx == -1) continue;  // no memcpy
       if (threadIdx.x == 0)
         probs_unzipped[proposed_row_idx] = this_expert_token_info.expert_probs;
-      // vec copy
-      if constexpr (has_scale) {
+      if constexpr (do_gather) {
+        // vec copy
+        if constexpr (has_scale) {
+          vectorized_memcpy(&XScale[(int64_t)row * (int64_t)scale_length],
+                            &XScale_unzipped[(int64_t)proposed_row_idx *
+                                             (int64_t)scale_length],
+                            scale_length);
+        }
         vectorized_memcpy(
-            &XScale[(int64_t)row * (int64_t)scale_length],
-            &XScale_unzipped[(int64_t)proposed_row_idx * (int64_t)scale_length],
-            scale_length);
+            &X[(int64_t)row * (int64_t)token_length],
+            &X_unzipped[(int64_t)proposed_row_idx * (int64_t)token_length],
+            token_length);
       }
-      vectorized_memcpy(
-          &X[(int64_t)row * (int64_t)token_length],
-          &X_unzipped[(int64_t)proposed_row_idx * (int64_t)token_length],
-          token_length);
     }
   }
 }
@@ -160,7 +166,8 @@ void dispatch_tokens_unzip_stable(const Context &dev_ctx,
                                   const int token_length,
                                   const int topk,  // deprecated
                                   const int num_experts,
-                                  const int scale_length) {
+                                  const int scale_length,
+                                  const bool do_gather) {
   dim3 grid, block;
   grid.x =
       (total_zipped_tokens_num + CUMSUM_BLOCK_SIZE - 1) / CUMSUM_BLOCK_SIZE;
@@ -169,33 +176,41 @@ void dispatch_tokens_unzip_stable(const Context &dev_ctx,
 #define DTYPE_CASE(dtype, type) dtype == phi::DataType::type
 #define GET_DATA(tensor, type) tensor.data<type>()
 #define GET_PTR_DATA(tensor, type) tensor->data<type>()
-#define DISPATCH_CASE(TOKEN_T, PROB_T, INT_T, HAS_SCALE)                       \
-  auto kernel = tokens_unzip_stable_kernel<TOKEN_T, INT_T, PROB_T, HAS_SCALE>; \
-  kernel<<<grid, block, 0, dev_ctx.stream()>>>(                                \
-      GET_DATA(X, TOKEN_T),                                                    \
-      GET_DATA(expert_routemap_topk, INT_T),                                   \
-      GET_DATA(expert_prob_topk, PROB_T),                                      \
-      XScale ? XScale.get_ptr()->data<float>() : nullptr,                      \
-      GET_DATA(expert_offsets, int),                                           \
-      GET_PTR_DATA(X_unzipped, TOKEN_T),                                       \
-      GET_PTR_DATA(zipped_expertwise_rowmap, INT_T),                           \
-      GET_PTR_DATA(token_prob_unzipped, PROB_T),                               \
-      XScale_unzipped->data<float>(),                                          \
-      global_expertwise_block_cumsum->data<int>(),                             \
-      total_zipped_tokens_num,                                                 \
-      token_length,                                                            \
-      scale_length,                                                            \
-      num_experts,                                                             \
+#define DISPATCH_CASE(TOKEN_T, PROB_T, INT_T, HAS_SCALE, DO_GATHER) \
+  auto kernel = tokens_unzip_stable_kernel<TOKEN_T,                 \
+                                           INT_T,                   \
+                                           PROB_T,                  \
+                                           HAS_SCALE,               \
+                                           DO_GATHER>;              \
+  kernel<<<grid, block, 0, dev_ctx.stream()>>>(                     \
+      GET_DATA(X, TOKEN_T),                                         \
+      GET_DATA(expert_routemap_topk, INT_T),                        \
+      GET_DATA(expert_prob_topk, PROB_T),                           \
+      XScale ? XScale.get_ptr()->data<float>() : nullptr,           \
+      GET_DATA(expert_offsets, int),                                \
+      GET_PTR_DATA(X_unzipped, TOKEN_T),                            \
+      GET_PTR_DATA(zipped_expertwise_rowmap, INT_T),                \
+      GET_PTR_DATA(token_prob_unzipped, PROB_T),                    \
+      XScale_unzipped->data<float>(),                               \
+      global_expertwise_block_cumsum->data<int>(),                  \
+      total_zipped_tokens_num,                                      \
+      token_length,                                                 \
+      scale_length,                                                 \
+      num_experts,                                                  \
       topk);
 
-#define HANDLE_EXPERT_CASE(TOKEN_T, PROB_T, INT_T, HAS_SCALE) \
-  DISPATCH_CASE(TOKEN_T, PROB_T, INT_T, HAS_SCALE)
+#define HANDLE_GATHER_CASE(TOKEN_T, PROB_T, INT_T, HAS_SCALE) \
+  if (do_gather) {                                            \
+    DISPATCH_CASE(TOKEN_T, PROB_T, INT_T, HAS_SCALE, true)    \
+  } else {                                                    \
+    DISPATCH_CASE(TOKEN_T, PROB_T, INT_T, HAS_SCALE, false)   \
+  }
 
 #define HANDLE_TOKEN_TYPE(PROB_T, INT_T)                        \
   if (DTYPE_CASE(X.dtype(), BFLOAT16)) {                        \
-    HANDLE_EXPERT_CASE(phi::bfloat16, PROB_T, INT_T, false)     \
+    HANDLE_GATHER_CASE(phi::bfloat16, PROB_T, INT_T, false)     \
   } else if (DTYPE_CASE(X.dtype(), FLOAT8_E4M3FN)) {            \
-    HANDLE_EXPERT_CASE(phi::float8_e4m3fn, PROB_T, INT_T, true) \
+    HANDLE_GATHER_CASE(phi::float8_e4m3fn, PROB_T, INT_T, true) \
   }
 
 #define HANDLE_PROB_TYPE(INT_T)                               \
@@ -226,6 +241,7 @@ void MoePermuteKernel(const Context &dev_ctx,
                       const int num_experts,
                       const std::vector<int> &tokens_per_expert,
                       const int padding_multiplex,
+                      const bool do_gather,
                       DenseTensor *X_unzipped,
                       DenseTensor *zipped_expertwise_rowmap,
                       DenseTensor *token_prob_unzipped,
@@ -341,7 +357,8 @@ void MoePermuteKernel(const Context &dev_ctx,
                                            cols,
                                            topk_calculated,
                                            num_experts,
-                                           quanted_cols);
+                                           quanted_cols,
+                                           do_gather);
 }
 #undef CUMSUM_BLOCK_SIZE
 #undef CUMSUM_INVALID_TAG
diff --git a/paddle/phi/ops/yaml/ops.yaml b/paddle/phi/ops/yaml/ops.yaml
@@ -3886,7 +3886,7 @@
   backward : moe_gate_dispatch_permute_grad
 
 - op : moe_permute
-  args : (Tensor hidden_states, Tensor scale, Tensor expert_routemap_topk, Tensor expert_prob_topk, int num_experts, int[] tokens_per_expert, int padding_alignment)
+  args : (Tensor hidden_states, Tensor scale, Tensor expert_routemap_topk, Tensor expert_prob_topk, int num_experts, int[] tokens_per_expert, int padding_alignment, bool do_gather)
   output : Tensor(hidden_states_unzipped), Tensor(zipped_expertwise_rowmap), Tensor(token_prob_unzipped), Tensor(scale_unzipped)
   infer_meta:
     func : MoePermuteInferMeta
diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py
@@ -242,6 +242,8 @@
     'max_unpool1d',
     'max_unpool2d',
     'max_unpool3d',
+    'moe_permute',
+    'moe_unpermute',
     'adaptive_avg_pool1d',
     'adaptive_avg_pool2d',
     'adaptive_avg_pool3d',
@@ -304,6 +306,4 @@
     "flash_attention_v3_varlen",
     'flash_attn_varlen_qkvpacked',
     'group_norm',
-    'moe_permute',
-    'moe_unpermute',
 ]
diff --git a/python/paddle/nn/functional/moe_permute.py b/python/paddle/nn/functional/moe_permute.py
@@ -31,6 +31,7 @@ def moe_permute(
     num_experts: int,
     tokens_per_expert: list,
     padding_alignment: int,
+    do_gather: bool = True,
     name: str | None = None,
 ) -> tuple[Tensor, Tensor, Tensor, Tensor]:
     r"""
@@ -67,6 +68,7 @@ def moe_permute(
             assigned to the corresponding expert.
         padding_alignment (int): Tokens alignment requirement for expert buffers (in bytes).
             Must be a power of 2. Typical values are 16, 32 or 64 for optimal memory access.
+        do_gather (bool, optional): Whether to perform the actual token gather operation. Default: True.
         name (str|None, optional): Name prefix for the operation (optional).
             Default: None
 
@@ -133,6 +135,7 @@ def moe_permute(
             num_experts,
             tokens_per_expert,
             padding_alignment,
+            do_gather,
         )
         return (
             hidden_states_unzipped,
diff --git a/test/legacy_test/test_moe_permute_unpermute.py b/test/legacy_test/test_moe_permute_unpermute.py
@@ -139,6 +139,22 @@ def test_permute_unpermute_consistency(self):
                     tokens_per_expert=tokens_per_expert,
                     padding_alignment=128,
                 )
+                # do_gather = False
+                (
+                    _,
+                    zipped_expertwise_rowmap_no_gather,
+                    unzipped_probs_no_gather,
+                    _,
+                ) = moe_permute(
+                    hidden_states,
+                    scale,
+                    expert_routemap_topk,
+                    expert_prob_topk,
+                    num_experts=expert_num,
+                    tokens_per_expert=tokens_per_expert,
+                    padding_alignment=128,
+                    do_gather=False,
+                )
 
                 unpermute_input = (
                     unzipped_tokens.astype("float32")
@@ -174,6 +190,17 @@ def test_permute_unpermute_consistency(self):
                     err_msg="moe_permute_unpermute probs do not match",
                 )
 
+                np.testing.assert_equal(
+                    zipped_expertwise_rowmap_no_gather._md5sum(),
+                    zipped_expertwise_rowmap._md5sum(),
+                    err_msg="no_gather's zipped_expertwise_rowmap do not match",
+                )
+                np.testing.assert_equal(
+                    unzipped_probs_no_gather._md5sum(),
+                    unzipped_probs._md5sum(),
+                    err_msg="no_gather's unzipped_probs do not match",
+                )
+
 
 if __name__ == "__main__":
     unittest.main()