
Commit cdcd76a

From00 and lshpku authored

【cherry-pick】Add moe_combine_no_weight OP (#73592) (#73607)

* Add moe_combine_no_weight OP (#73592)
* Add moe_combine_no_weight op (#73531)
* Add moe_combine_no_weight op
* Remove seqlen and k from parameters
* Set no_need_buffer for x
* Add moe_combine_no_weight Op
* Update

---------

Co-authored-by: Shuhao Liang <[email protected]>

* Empty-Commit

---------

Co-authored-by: Shuhao Liang <[email protected]>

1 parent 0ffabff commit cdcd76a

File tree

11 files changed, +527 −0 lines changed


paddle/phi/infermeta/ternary.cc

Lines changed: 39 additions & 0 deletions
@@ -1630,6 +1630,45 @@ void MoeCombineInferMeta(const MetaTensor& x,
   y->set_dtype(x.dtype());
 }
 
+void MoeCombineNoWeightInferMeta(const MetaTensor& x,
+                                 const MetaTensor& combine_weights,
+                                 const MetaTensor& scatter_index,
+                                 float epsilon,
+                                 MetaTensor* y) {
+  auto x_dim = x.dims();
+  auto scatter_index_dim = scatter_index.dims();
+  PADDLE_ENFORCE_EQ(x_dim.size(),
+                    2,
+                    common::errors::InvalidArgument(
+                        "The dimensions of Input(x) must be 2, but "
+                        "received dimensions of Input(x) is [%d]",
+                        x_dim.size()));
+  PADDLE_ENFORCE_EQ(scatter_index_dim.size(),
+                    2,
+                    common::errors::InvalidArgument(
+                        "The dimensions of Input(scatter_index) must be 2, but "
+                        "received dimensions of Input(scatter_index) is [%d]",
+                        scatter_index_dim.size()));
+  PADDLE_ENFORCE_EQ(scatter_index.dtype(),
+                    phi::DataType::INT32,
+                    common::errors::InvalidArgument(
+                        "The input scatter_index type should be int32, "
+                        "but received scatter_index type = %s",
+                        scatter_index.dtype()));
+  int64_t seqlen = scatter_index_dim[0];
+  int64_t k = scatter_index_dim[1];
+  int64_t hidden_size = x_dim[1];
+  PADDLE_ENFORCE_EQ(x_dim[0],
+                    seqlen * k,
+                    common::errors::InvalidArgument(
+                        "The upper dim of Input(x) [%d] must equal to "
+                        "the total size of Input(scatter_index) [%d].",
+                        x_dim[0],
+                        seqlen * k));
+  y->set_dims(phi::make_ddim({seqlen, hidden_size}));
+  y->set_dtype(x.dtype());
+}
+
 void MoeGateDispatchPartialNoSoftmaxTopKInferMeta(
     const MetaTensor& x,
     const MetaTensor& combine_weights,
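
For orientation only (not part of the diff), the shape contract this InferMeta encodes can be sketched in plain Python; the sizes below are illustrative, not taken from the commit:

    import numpy as np

    seqlen, k, hidden_size = 4, 2, 8                       # illustrative sizes only
    x = np.zeros((seqlen * k, hidden_size), dtype=np.float32)
    scatter_index = np.zeros((seqlen, k), dtype=np.int32)  # must be 2-D int32
    assert x.ndim == 2 and scatter_index.ndim == 2
    assert x.shape[0] == seqlen * k                        # x_dim[0] == seqlen * k, as enforced above
    y_shape = (seqlen, hidden_size)                        # inferred output shape, with x's dtype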

paddle/phi/infermeta/ternary.h

Lines changed: 6 additions & 0 deletions
@@ -274,6 +274,12 @@ void MoeCombineInferMeta(const MetaTensor& x,
                          const MetaTensor& scatter_index,
                          MetaTensor* y);
 
+void MoeCombineNoWeightInferMeta(const MetaTensor& x,
+                                 const MetaTensor& combine_weights,
+                                 const MetaTensor& scatter_index,
+                                 float epsilon,
+                                 MetaTensor* y);
+
 void MoeGateDispatchPartialNoSoftmaxTopKInferMeta(
     const MetaTensor& x,
     const MetaTensor& combine_weights,

paddle/phi/kernels/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,8 @@ if(((WITH_GPU) AND (CUDA_VERSION VERSION_LESS 12.0))
       "legacy/gpu/expand_modality_expert_id_kernel.cu"
       "legacy/gpu/moe_combine_kernel.cu"
       "legacy/gpu/moe_combine_grad_kernel.cu"
+      "legacy/gpu/moe_combine_no_weight_kernel.cu"
+      "legacy/gpu/moe_combine_no_weight_grad_kernel.cu"
       "legacy/gpu/cal_aux_loss_kernel.cu"
       "legacy/gpu/cal_aux_loss_grad_kernel.cu"
       "legacy/gpu/ext_build_src_rank_and_local_expert_id_kernel.cu"
paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_grad_kernel.cu

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"

namespace phi {

template <typename T, typename MTP, int VecSize>
__global__ void combine_no_weight_bwd_kernel(const T* combine_weights,
                                             const int* scatter_index,
                                             const T* grad_y,
                                             T* grad_x,
                                             const int64_t k,
                                             const int64_t seqlen,
                                             const int64_t hidden_size,
                                             const float epsilon) {
  using LoadT = phi::AlignedVector<T, VecSize>;
  LoadT grad_y_vec;
  int i = blockIdx.x;   // token index within the sequence
  int ki = blockIdx.y;  // slot index within the top-k

  if (i < seqlen && ki < k) {
    int idx = scatter_index[i * k + ki];  // row of x this slot was gathered from
    if (fabsf(combine_weights[i * k + ki]) <=
        epsilon) {  // no grad for padding tokens
      return;
    }
    // Loop over the hidden dimension in strides of the block
    for (int h_i = threadIdx.x * VecSize; h_i < hidden_size;
         h_i += blockDim.x * VecSize) {
      phi::Load<T, VecSize>(&(grad_y[i * hidden_size + h_i]), &grad_y_vec);
      phi::Store<T, VecSize>(grad_y_vec, &grad_x[idx * hidden_size + h_i]);
    }
  }
}

template <typename T>
void moe_combine_no_weight_bwd(const T* combine_weights,
                               const int* scatter_index,
                               const T* grad_y,
                               T* grad_x,
                               const int64_t k,
                               const int64_t seqlen,
                               const int64_t hidden_size,
                               const float epsilon,
                               cudaStream_t stream) {
  int block_size = 512;
  int grid_size_i = seqlen;
  int grid_size_k = k;
  dim3 blockDim(block_size);
  dim3 gridDim(grid_size_i, grid_size_k);

  constexpr int max_pack_size = 16 / sizeof(T);
  if (hidden_size % max_pack_size == 0) {
    combine_no_weight_bwd_kernel<T, float, max_pack_size>
        <<<gridDim, blockDim, 0, stream>>>(combine_weights,
                                           scatter_index,
                                           grad_y,
                                           grad_x,
                                           k,
                                           seqlen,
                                           hidden_size,
                                           epsilon);
  } else {
    combine_no_weight_bwd_kernel<T, float, 1>
        <<<gridDim, blockDim, 0, stream>>>(combine_weights,
                                           scatter_index,
                                           grad_y,
                                           grad_x,
                                           k,
                                           seqlen,
                                           hidden_size,
                                           epsilon);
  }
}

template <typename T, typename Context>
void MoeCombineNoWeightGradKernel(const Context& dev_ctx,
                                  const DenseTensor& x,
                                  const DenseTensor& combine_weights,
                                  const DenseTensor& scatter_index,
                                  const DenseTensor& grad_y,
                                  const float epsilon,
                                  DenseTensor* grad_x) {
  const auto x_shape = x.dims();
  const int64_t hidden_size = x_shape[1];

  const auto scatter_index_shape = scatter_index.dims();
  const int64_t seqlen = scatter_index_shape[0];
  const int64_t k = scatter_index_shape[1];

  dev_ctx.template Alloc<T>(grad_x);
  phi::Full<T, Context>(
      dev_ctx, phi::IntArray(common::vectorize(grad_x->dims())), 0, grad_x);

  moe_combine_no_weight_bwd<T>(combine_weights.data<T>(),
                               scatter_index.data<int>(),
                               grad_y.data<T>(),
                               grad_x->data<T>(),
                               k,
                               seqlen,
                               hidden_size,
                               epsilon,
                               dev_ctx.stream());
}

}  // namespace phi

PD_REGISTER_KERNEL(moe_combine_no_weight_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::MoeCombineNoWeightGradKernel,
                   float,
                   double,
                   phi::dtype::bfloat16,
                   phi::dtype::float16) {}
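
As a reading aid only (not part of the commit), the gradient this backward kernel produces can be written as a NumPy reference (the helper name below is made up here): each row of y_grad is copied, unscaled, into the row of x_grad selected by scatter_index, and slots whose combine weight is within epsilon of zero (padding) receive no gradient.

    import numpy as np

    def moe_combine_no_weight_grad_ref(combine_weights, scatter_index, grad_y, epsilon=1e-15):
        # combine_weights, scatter_index: [seqlen, k]; grad_y: [seqlen, hidden_size]
        seqlen, k = scatter_index.shape
        grad_x = np.zeros((seqlen * k, grad_y.shape[1]), dtype=grad_y.dtype)
        for i in range(seqlen):
            for ki in range(k):
                if abs(combine_weights[i, ki]) <= epsilon:  # padded slot: no gradient
                    continue
                grad_x[scatter_index[i, ki]] = grad_y[i]    # straight copy, no weight scaling
        return grad_x
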
paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_kernel.cu

Lines changed: 141 additions & 0 deletions
@@ -0,0 +1,141 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {

template <typename T, typename MTP, int k>
__global__ void combine_no_weight_kernel(const T* __restrict__ x,
                                         const T* __restrict__ combine_weights,
                                         const int* __restrict__ scatter_index,
                                         T* __restrict__ y,
                                         const int64_t hidden_size,
                                         const int64_t seqlen,
                                         const float epsilon) {
  // Shared-memory layout: k int64 indices first, then k weights, so the two
  // arrays stay aligned and do not overlap.
  extern __shared__ char shared_mem[];
  int64_t* shared_indices = reinterpret_cast<int64_t*>(shared_mem);
  MTP* shared_weights = reinterpret_cast<MTP*>(shared_mem + k * sizeof(int64_t));

  int64_t seq_i = blockIdx.x;
  for (int ki = threadIdx.x; ki < k; ki += blockDim.x) {
    shared_weights[ki] = static_cast<MTP>(combine_weights[seq_i * k + ki]);
    shared_indices[ki] = scatter_index[seq_i * k + ki];
  }
  __syncthreads();
  for (int h_i = threadIdx.x; h_i < hidden_size; h_i += blockDim.x) {
    MTP sum = static_cast<MTP>(0);
#pragma unroll
    for (int ki = 0; ki < k; ++ki) {
      if (fabsf(shared_weights[ki]) <= epsilon) {  // skip padded slots
        continue;
      }
      int64_t scatter_idx = shared_indices[ki];
      T x_val = x[scatter_idx * hidden_size + h_i];
      sum += static_cast<MTP>(x_val);
    }
    y[seq_i * hidden_size + h_i] = static_cast<T>(sum);
  }
}

template <typename T>
void moe_combine_no_weight_fwd(const T* x,
                               const T* combine_weights,
                               const int* scatter_index,
                               T* y,
                               const int64_t k,
                               const int64_t seqlen,
                               const int64_t hidden_size,
                               const float epsilon,
                               cudaStream_t stream) {
  int threads_per_block = 1024;
  dim3 blockDim(threads_per_block);
  dim3 gridDim(seqlen);
  // Room for k int64 indices plus k float (MTP) weights per block.
  size_t sharedMemSize = k * (sizeof(int64_t) + sizeof(float));

#define CALL_KERNEL(K)                                                  \
  case K:                                                               \
    combine_no_weight_kernel<T, float, K>                               \
        <<<gridDim, blockDim, sharedMemSize, stream>>>(x,               \
                                                       combine_weights, \
                                                       scatter_index,   \
                                                       y,               \
                                                       hidden_size,     \
                                                       seqlen,          \
                                                       epsilon);        \
    break;

  switch (k) {
    CALL_KERNEL(1);
    CALL_KERNEL(2);
    CALL_KERNEL(3);
    CALL_KERNEL(4);
    CALL_KERNEL(5);
    CALL_KERNEL(6);
    CALL_KERNEL(7);
    CALL_KERNEL(8);
    CALL_KERNEL(9);
    CALL_KERNEL(10);
    CALL_KERNEL(11);
    CALL_KERNEL(12);
    CALL_KERNEL(13);
    CALL_KERNEL(14);
    CALL_KERNEL(15);
    CALL_KERNEL(16);
    default:
      PADDLE_THROW(phi::errors::InvalidArgument("Invalid k value."));
      break;
  }
#undef CALL_KERNEL
}

template <typename T, typename Context>
void MoeCombineNoWeightKernel(const Context& dev_ctx,
                              const DenseTensor& x,
                              const DenseTensor& combine_weights,
                              const DenseTensor& scatter_index,
                              const float epsilon,
                              DenseTensor* y) {
  const auto x_shape = x.dims();
  const int64_t hidden_size = x_shape[1];

  const auto scatter_index_shape = scatter_index.dims();
  const int64_t seqlen = scatter_index_shape[0];
  const int64_t k = scatter_index_shape[1];

  dev_ctx.template Alloc<T>(y);

  moe_combine_no_weight_fwd<T>(x.data<T>(),
                               combine_weights.data<T>(),
                               scatter_index.data<int>(),
                               y->data<T>(),
                               k,
                               seqlen,
                               hidden_size,
                               epsilon,
                               dev_ctx.stream());
}

}  // namespace phi

PD_REGISTER_KERNEL(moe_combine_no_weight,
                   GPU,
                   ALL_LAYOUT,
                   phi::MoeCombineNoWeightKernel,
                   float,
                   double,
                   phi::dtype::bfloat16,
                   phi::dtype::float16) {}
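
Likewise as a reading aid (not part of the commit), the forward computation in NumPy terms (the helper name is made up here): for each token i, y[i] is the unweighted sum of the rows x[scatter_index[i, ki]] over the top-k slots whose combine weight exceeds epsilon in magnitude.

    import numpy as np

    def moe_combine_no_weight_ref(x, combine_weights, scatter_index, epsilon=1e-15):
        # x: [seqlen * k, hidden_size]; combine_weights, scatter_index: [seqlen, k]
        seqlen, k = scatter_index.shape
        y = np.zeros((seqlen, x.shape[1]), dtype=x.dtype)
        for i in range(seqlen):
            for ki in range(k):
                if abs(combine_weights[i, ki]) <= epsilon:  # skip padded slots
                    continue
                y[i] += x[scatter_index[i, ki]]             # sum without weight scaling
        return y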

paddle/phi/ops/yaml/backward.yaml

Lines changed: 11 additions & 0 deletions
@@ -2320,6 +2320,17 @@
   kernel :
     func : moe_combine_grad
 
+- backward_op : moe_combine_no_weight_grad
+  forward : moe_combine_no_weight (Tensor x, Tensor combine_weight, Tensor scatter_index, float epsilon = 1.0e-15) -> Tensor(y)
+  args : (Tensor x, Tensor combine_weight, Tensor scatter_index, Tensor y_grad, float epsilon = 1.0e-15)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : moe_combine_no_weight_grad
+  no_need_buffer : x
+
 - backward_op : moe_gate_dispatch_grad
   forward : moe_gate_dispatch (Tensor x, Tensor gate_logits, Tensor corr_bias, int64_t k, int64_t capacity, bool use_pad) -> Tensor(y), Tensor(combine_weights), Tensor(scatter_index), Tensor(expert_offset), Tensor(expert_id)
   args : (Tensor combine_weights, Tensor scatter_index, Tensor expert_id, Tensor y_grad, Tensor combine_weights_grad, int64_t k, int64_t capacity, bool use_pad)

paddle/phi/ops/yaml/ops.yaml

Lines changed: 10 additions & 0 deletions
@@ -3678,6 +3678,16 @@
     data_type : x
   backward : moe_combine_grad
 
+- op : moe_combine_no_weight
+  args : (Tensor x, Tensor combine_weight, Tensor scatter_index, float epsilon = 1.0e-15)
+  output : Tensor(y)
+  infer_meta :
+    func : MoeCombineNoWeightInferMeta
+  kernel :
+    func : moe_combine_no_weight
+    data_type : x
+  backward : moe_combine_no_weight_grad
+
 - op : moe_gate_dispatch
   args : (Tensor x, Tensor gate_logits, Tensor corr_bias, int64_t k, int64_t capacity, bool use_pad)
   output : Tensor(y), Tensor(combine_weights), Tensor(scatter_index), Tensor(expert_offset), Tensor(expert_id)

python/paddle/incubate/nn/functional/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@
 from .int_bincount import int_bincount
 from .masked_multihead_attention import masked_multihead_attention
 from .moe_combine import moe_combine
+from .moe_combine_no_weight import moe_combine_no_weight
 from .moe_gate_dispatch import moe_gate_dispatch
 from .moe_gate_dispatch_partial_nosoftmaxtopk import (
     moe_gate_dispatch_partial_nosoftmaxtopk,
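
The Python wrapper module itself (python/paddle/incubate/nn/functional/moe_combine_no_weight.py) is not shown in this excerpt, so the call below is only a sketch that assumes the wrapper mirrors the op signature declared in ops.yaml (x, combine_weight, scatter_index, epsilon):

    import paddle
    from paddle.incubate.nn.functional import moe_combine_no_weight

    seqlen, k, hidden_size = 4, 2, 8                                        # illustrative sizes
    x = paddle.randn([seqlen * k, hidden_size])
    combine_weight = paddle.rand([seqlen, k])
    scatter_index = paddle.arange(seqlen * k, dtype='int32').reshape([seqlen, k])
    # Assumed signature, mirroring ops.yaml; expected output shape: [seqlen, hidden_size].
    y = moe_combine_no_weight(x, combine_weight, scatter_index, epsilon=1e-15)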
