
Commit 53919d3

Add moe_combine_no_weight op (#73531)
* Add moe_combine_no_weight op
* Remove seqlen and k from parameters
* Set no_need_buffer for x
1 parent 51d2cf4 commit 53919d3

File tree: 11 files changed, +474 -0 lines changed


paddle/phi/infermeta/ternary.cc

Lines changed: 37 additions & 0 deletions
@@ -1630,6 +1630,43 @@ void MoeCombineInferMeta(const MetaTensor& x,
   y->set_dtype(x.dtype());
 }
 
+void MoeCombineNoWeightInferMeta(const MetaTensor& x,
+                                 const MetaTensor& scatter_index,
+                                 MetaTensor* y) {
+  auto x_dim = x.dims();
+  auto scatter_index_dim = scatter_index.dims();
+  PADDLE_ENFORCE_EQ(x_dim.size(),
+                    2,
+                    common::errors::InvalidArgument(
+                        "The dimensions of Input(x) must be 2, but "
+                        "received dimensions of Input(x) is [%d]",
+                        x_dim.size()));
+  PADDLE_ENFORCE_EQ(scatter_index_dim.size(),
+                    2,
+                    common::errors::InvalidArgument(
+                        "The dimensions of Input(scatter_index) must be 2, "
+                        "but received dimensions of Input(scatter_index) is "
+                        "[%d]",
+                        scatter_index_dim.size()));
+  PADDLE_ENFORCE_EQ(scatter_index.dtype(),
+                    phi::DataType::INT32,
+                    common::errors::InvalidArgument(
+                        "The input scatter_index type should be int32, "
+                        "but received scatter_index type = %s",
+                        scatter_index.dtype()));
+  int64_t seqlen = scatter_index_dim[0];
+  int64_t k = scatter_index_dim[1];
+  int64_t hidden_size = x_dim[1];
+  PADDLE_ENFORCE_EQ(x_dim[0],
+                    seqlen * k,
+                    common::errors::InvalidArgument(
+                        "The first dimension of Input(x) [%d] must be equal "
+                        "to the total size of Input(scatter_index) [%d].",
+                        x_dim[0],
+                        seqlen * k));
+  y->set_dims(phi::make_ddim({seqlen, hidden_size}));
+  y->set_dtype(x.dtype());
+}
+
 void MoeGateDispatchPartialNoSoftmaxTopKInferMeta(
     const MetaTensor& x,
     const MetaTensor& combine_weights,
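
For reference, the shape contract enforced above can be sketched in plain Python (a minimal sketch; the function name and asserts are illustrative stand-ins for the PADDLE_ENFORCE_EQ checks, not Paddle API):

import numpy as np

def infer_moe_combine_no_weight_shape(x_shape, scatter_index_shape, scatter_index_dtype):
    # x: [seqlen * k, hidden_size]; scatter_index: [seqlen, k], int32.
    assert len(x_shape) == 2, "x must be 2-D"
    assert len(scatter_index_shape) == 2, "scatter_index must be 2-D"
    assert scatter_index_dtype == np.int32, "scatter_index must be int32"
    seqlen, k = scatter_index_shape
    hidden_size = x_shape[1]
    assert x_shape[0] == seqlen * k, "rows of x must equal seqlen * k"
    return (seqlen, hidden_size)  # output dtype follows x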

paddle/phi/infermeta/ternary.h

Lines changed: 4 additions & 0 deletions
@@ -274,6 +274,10 @@ void MoeCombineInferMeta(const MetaTensor& x,
                         const MetaTensor& scatter_index,
                         MetaTensor* y);
 
+void MoeCombineNoWeightInferMeta(const MetaTensor& x,
+                                 const MetaTensor& scatter_index,
+                                 MetaTensor* y);
+
 void MoeGateDispatchPartialNoSoftmaxTopKInferMeta(
     const MetaTensor& x,
     const MetaTensor& combine_weights,

paddle/phi/kernels/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -64,6 +64,8 @@ if(((WITH_GPU) AND (CUDA_VERSION VERSION_LESS 12.0))
     "legacy/gpu/expand_modality_expert_id_kernel.cu"
     "legacy/gpu/moe_combine_kernel.cu"
     "legacy/gpu/moe_combine_grad_kernel.cu"
+    "legacy/gpu/moe_combine_no_weight_kernel.cu"
+    "legacy/gpu/moe_combine_no_weight_grad_kernel.cu"
     "legacy/gpu/cal_aux_loss_kernel.cu"
     "legacy/gpu/cal_aux_loss_grad_kernel.cu"
     "legacy/gpu/ext_build_src_rank_and_local_expert_id_kernel.cu"
paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_grad_kernel.cu

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"

namespace phi {

template <typename T, typename MTP, int VecSize>
__global__ void combine_no_weight_bwd_kernel(const int* scatter_index,
                                             const T* grad_y,
                                             T* grad_x,
                                             const int64_t k,
                                             const int64_t seqlen,
                                             const int64_t hidden_size) {
  using LoadT = phi::AlignedVector<T, VecSize>;
  LoadT grad_y_vec;
  int i = blockIdx.x;   // Sequence (token) index
  int ki = blockIdx.y;  // Top-k slot index

  if (i < seqlen && ki < k) {
    int idx = scatter_index[i * k + ki];  // Row of x this slot was taken from

    // Loop over the hidden dimension in vectorized strides of the block
    for (int h_i = threadIdx.x * VecSize; h_i < hidden_size;
         h_i += blockDim.x * VecSize) {
      phi::Load<T, VecSize>(&(grad_y[i * hidden_size + h_i]), &grad_y_vec);
      phi::Store<T, VecSize>(grad_y_vec, &grad_x[idx * hidden_size + h_i]);
    }
  }
}

template <typename T>
void moe_combine_no_weight_bwd(const int* scatter_index,
                               const T* grad_y,
                               T* grad_x,
                               const int64_t k,
                               const int64_t seqlen,
                               const int64_t hidden_size,
                               cudaStream_t stream) {
  int block_size = 512;
  int grid_size_i = seqlen;
  int grid_size_k = k;
  dim3 blockDim(block_size);
  dim3 gridDim(grid_size_i, grid_size_k);

  // Use the widest aligned vector load that divides hidden_size evenly.
  constexpr int max_pack_size = 16 / sizeof(T);
  if (hidden_size % max_pack_size == 0) {
    combine_no_weight_bwd_kernel<T, float, max_pack_size>
        <<<gridDim, blockDim, 0, stream>>>(
            scatter_index, grad_y, grad_x, k, seqlen, hidden_size);
  } else {
    combine_no_weight_bwd_kernel<T, float, 1><<<gridDim, blockDim, 0, stream>>>(
        scatter_index, grad_y, grad_x, k, seqlen, hidden_size);
  }
}

template <typename T, typename Context>
void MoeCombineNoWeightGradKernel(const Context& dev_ctx,
                                  const DenseTensor& x,
                                  const DenseTensor& scatter_index,
                                  const DenseTensor& grad_y,
                                  DenseTensor* grad_x) {
  const auto x_shape = x.dims();
  const int64_t hidden_size = x_shape[1];

  const auto scatter_index_shape = scatter_index.dims();
  const int64_t seqlen = scatter_index_shape[0];
  const int64_t k = scatter_index_shape[1];

  dev_ctx.template Alloc<T>(grad_x);
  // Zero-initialize grad_x: rows of x never referenced by scatter_index
  // receive no gradient.
  phi::Full<T, Context>(
      dev_ctx, phi::IntArray(common::vectorize(grad_x->dims())), 0, grad_x);

  moe_combine_no_weight_bwd<T>(scatter_index.data<int>(),
                               grad_y.data<T>(),
                               grad_x->data<T>(),
                               k,
                               seqlen,
                               hidden_size,
                               dev_ctx.stream());
}

}  // namespace phi

PD_REGISTER_KERNEL(moe_combine_no_weight_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::MoeCombineNoWeightGradKernel,
                   float,
                   double,
                   phi::dtype::bfloat16,
                   phi::dtype::float16) {}
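
The backward pass is a pure scatter: row scatter_index[i][ki] of grad_x receives grad_y[i] unchanged (there are no combine weights), and rows never referenced by scatter_index keep the zeros written by phi::Full. A minimal numpy reference (illustrative only, not Paddle code):

import numpy as np

def moe_combine_no_weight_grad_ref(scatter_index, grad_y):
    # scatter_index: [seqlen, k] int32; grad_y: [seqlen, hidden_size].
    seqlen, k = scatter_index.shape
    grad_x = np.zeros((seqlen * k, grad_y.shape[1]), dtype=grad_y.dtype)
    for i in range(seqlen):
        for ki in range(k):
            # Each selected row of x gets the full output gradient of token i.
            grad_x[scatter_index[i, ki]] = grad_y[i]
    return grad_x

Note that, like the CUDA kernel (phi::Store rather than an atomic add), this assigns instead of accumulating, so it assumes each row of x is referenced by at most one (i, ki) pair.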
paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_kernel.cu

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {

template <typename T, typename MTP, int k>
__global__ void combine_no_weight_kernel(const T* __restrict__ x,
                                         const int* __restrict__ scatter_index,
                                         T* __restrict__ y,
                                         const int64_t hidden_size,
                                         const int64_t seqlen) {
  extern __shared__ char shared_mem[];
  int64_t* shared_indices = reinterpret_cast<int64_t*>(shared_mem);

  // One block per token: stage that token's k scatter indices in shared
  // memory first.
  int64_t seq_i = blockIdx.x;
  for (int ki = threadIdx.x; ki < k; ki += blockDim.x) {
    shared_indices[ki] = scatter_index[seq_i * k + ki];
  }
  __syncthreads();
  // Each thread sums the k selected rows over its slice of the hidden
  // dimension, accumulating in MTP (float) for low-precision T.
  for (int h_i = threadIdx.x; h_i < hidden_size; h_i += blockDim.x) {
    MTP sum = static_cast<MTP>(0);
#pragma unroll
    for (int ki = 0; ki < k; ++ki) {
      int64_t scatter_idx = shared_indices[ki];
      T x_val = x[scatter_idx * hidden_size + h_i];
      sum += static_cast<MTP>(x_val);
    }
    y[seq_i * hidden_size + h_i] = static_cast<T>(sum);
  }
}

template <typename T>
void moe_combine_no_weight_fwd(const T* x,
                               const int* scatter_index,
                               T* y,
                               const int64_t k,
                               const int64_t seqlen,
                               const int64_t hidden_size,
                               cudaStream_t stream) {
  int threads_per_block = 1024;
  dim3 blockDim(threads_per_block);
  dim3 gridDim(seqlen);
  size_t sharedMemSize = k * sizeof(int64_t);

// k is dispatched to a compile-time template argument so the inner loop
// can be fully unrolled; the launch runs on the caller's stream.
#define CALL_KERNEL(K)                                  \
  case K:                                               \
    combine_no_weight_kernel<T, float, K>               \
        <<<gridDim, blockDim, sharedMemSize, stream>>>( \
            x, scatter_index, y, hidden_size, seqlen);  \
    break;

  switch (k) {
    CALL_KERNEL(1);
    CALL_KERNEL(2);
    CALL_KERNEL(3);
    CALL_KERNEL(4);
    CALL_KERNEL(5);
    CALL_KERNEL(6);
    CALL_KERNEL(7);
    CALL_KERNEL(8);
    CALL_KERNEL(9);
    CALL_KERNEL(10);
    CALL_KERNEL(11);
    CALL_KERNEL(12);
    CALL_KERNEL(13);
    CALL_KERNEL(14);
    CALL_KERNEL(15);
    CALL_KERNEL(16);
    default:
      PADDLE_THROW(phi::errors::InvalidArgument(
          "Invalid k value [%d]; only 1 <= k <= 16 is supported.", k));
      break;
  }
#undef CALL_KERNEL
}

template <typename T, typename Context>
void MoeCombineNoWeightKernel(const Context& dev_ctx,
                              const DenseTensor& x,
                              const DenseTensor& scatter_index,
                              DenseTensor* y) {
  const auto x_shape = x.dims();
  const int64_t hidden_size = x_shape[1];

  const auto scatter_index_shape = scatter_index.dims();
  const int64_t seqlen = scatter_index_shape[0];
  const int64_t k = scatter_index_shape[1];

  dev_ctx.template Alloc<T>(y);

  moe_combine_no_weight_fwd<T>(x.data<T>(),
                               scatter_index.data<int>(),
                               y->data<T>(),
                               k,
                               seqlen,
                               hidden_size,
                               dev_ctx.stream());
}

}  // namespace phi

PD_REGISTER_KERNEL(moe_combine_no_weight,
                   GPU,
                   ALL_LAYOUT,
                   phi::MoeCombineNoWeightKernel,
                   float,
                   double,
                   phi::dtype::bfloat16,
                   phi::dtype::float16) {}
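
Semantically the forward op is an unweighted gather-and-sum: token i's output is the sum of the k rows of x that scatter_index[i] selects. A minimal numpy reference (illustrative, not the Paddle implementation):

import numpy as np

def moe_combine_no_weight_ref(x, scatter_index):
    # x: [seqlen * k, hidden_size]; scatter_index: [seqlen, k] int32.
    # Fancy indexing gathers to [seqlen, k, hidden_size]; summing over
    # axis 1 combines the k expert outputs per token with equal weight.
    return x[scatter_index].sum(axis=1)  # -> [seqlen, hidden_size]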

paddle/phi/ops/yaml/backward.yaml

Lines changed: 11 additions & 0 deletions
@@ -2320,6 +2320,17 @@
   kernel :
     func : moe_combine_grad
 
+- backward_op : moe_combine_no_weight_grad
+  forward : moe_combine_no_weight (Tensor x, Tensor scatter_index) -> Tensor(y)
+  args : (Tensor x, Tensor scatter_index, Tensor y_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : moe_combine_no_weight_grad
+  no_need_buffer : x
+
 - backward_op : moe_gate_dispatch_grad
   forward : moe_gate_dispatch (Tensor x, Tensor gate_logits, Tensor corr_bias, int64_t k, int64_t capacity, bool use_pad) -> Tensor(y), Tensor(combine_weights), Tensor(scatter_index), Tensor(expert_offset), Tensor(expert_id)
   args : (Tensor combine_weights, Tensor scatter_index, Tensor expert_id, Tensor y_grad, Tensor combine_weights_grad, int64_t k, int64_t capacity, bool use_pad)
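
The no_need_buffer : x entry records that the backward kernel only reads x's metadata (its shape, to size x_grad), so x's storage can be freed after the forward pass. A hedged dygraph sketch exercising the registered backward (GPU build assumed, since the kernels above are GPU-only; shapes are illustrative):

import paddle
import paddle.incubate.nn.functional as F

seqlen, k, hidden = 4, 2, 8
x = paddle.randn([seqlen * k, hidden])
x.stop_gradient = False
# A permutation of [0, seqlen * k) so each row of x is used exactly once.
scatter_index = paddle.randperm(seqlen * k).reshape([seqlen, k]).astype('int32')

y = F.moe_combine_no_weight(x, scatter_index)  # [seqlen, hidden]
(x_grad,) = paddle.grad([y], [x], grad_outputs=[paddle.ones_like(y)])
print(x_grad.shape)  # [seqlen * k, hidden]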

paddle/phi/ops/yaml/ops.yaml

Lines changed: 10 additions & 0 deletions
@@ -3678,6 +3678,16 @@
     data_type : x
   backward : moe_combine_grad
 
+- op : moe_combine_no_weight
+  args : (Tensor x, Tensor scatter_index)
+  output : Tensor(y)
+  infer_meta :
+    func : MoeCombineNoWeightInferMeta
+  kernel :
+    func : moe_combine_no_weight
+    data_type : x
+  backward : moe_combine_no_weight_grad
+
 - op : moe_gate_dispatch
   args : (Tensor x, Tensor gate_logits, Tensor corr_bias, int64_t k, int64_t capacity, bool use_pad)
   output : Tensor(y), Tensor(combine_weights), Tensor(scatter_index), Tensor(expert_offset), Tensor(expert_id)

python/paddle/incubate/nn/functional/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@
 from .int_bincount import int_bincount
 from .masked_multihead_attention import masked_multihead_attention
 from .moe_combine import moe_combine
+from .moe_combine_no_weight import moe_combine_no_weight
 from .moe_gate_dispatch import moe_gate_dispatch
 from .moe_gate_dispatch_partial_nosoftmaxtopk import (
     moe_gate_dispatch_partial_nosoftmaxtopk,
python/paddle/incubate/nn/functional/moe_combine_no_weight.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from typing import TYPE_CHECKING

from paddle import _C_ops
from paddle.base.framework import in_dynamic_or_pir_mode
from paddle.base.layer_helper import LayerHelper

if TYPE_CHECKING:
    from paddle import Tensor


def moe_combine_no_weight(
    x: Tensor,
    scatter_index: Tensor,
    name: str | None = None,
) -> Tensor:
    """
    Combine expert outputs without combine weights: for each token, sum the
    k rows of x selected by scatter_index.

    Args:
        x: Input tensor of shape [num_tokens, hidden_size], where
            num_tokens == seq_len * k.
        scatter_index: Scatter indices of shape [seq_len, k], dtype=int32.
        name: Name of the operation (optional, default is None).

    Returns:
        Combined output tensor of shape [seq_len, hidden_size].
    """
    if in_dynamic_or_pir_mode():
        return _C_ops.moe_combine_no_weight(x, scatter_index)
    helper = LayerHelper('moe_combine_no_weight', **locals())
    y = helper.create_variable_for_type_inference(dtype=x.dtype)
    inputs = {
        'x': x,
        'scatter_index': scatter_index,
    }
    helper.append_op(
        type='moe_combine_no_weight', inputs=inputs, outputs={'y': y}
    )
    return y
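
A minimal usage sketch (GPU build assumed; values are illustrative):

import paddle
import paddle.incubate.nn.functional as F

seqlen, k, hidden = 3, 2, 4
x = paddle.arange(seqlen * k * hidden, dtype='float32').reshape([seqlen * k, hidden])
# Token i combines rows scatter_index[i] of x.
scatter_index = paddle.to_tensor([[0, 3], [1, 4], [2, 5]], dtype='int32')
y = F.moe_combine_no_weight(x, scatter_index)  # shape [3, 4]
# y[0] == x[0] + x[3], y[1] == x[1] + x[4], y[2] == x[2] + x[5]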

test/legacy_test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -513,6 +513,7 @@ if(NOT WITH_GPU
     test_incubate_fused_rmsnorm_ext
     test_incubate_int_bincount
     test_incubate_moe_combine
+    test_incubate_moe_combine_no_weight
     test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk
     test_incubate_moe_gate_dispatch_w_permute_bwd
     test_incubate_moe_gate_dispatch_w_permute
