Commit c42f0a7 (parent: 94f0691)

[XPU] support moe_gate_dispatch_partial_nosoftmaxtopk, expand_modality_expert_id and build_src_rank_and_local_expert_id (#73234)

Commit message:

* [XPU] support moe_gate_dispatch_partial_nosoftmaxtopk, expand_modality_expert_id and build_src_rank_and_local_expert_id
* fix (repeated across nine follow-up fixup commits)

12 files changed: +729 −46 lines

paddle/phi/backends/xpu/xpu3_op_list.cc

Lines changed: 12 additions & 0 deletions

@@ -1851,6 +1851,18 @@ XPUOpMap& get_kl3_ops() {
                    phi::DataType::FLOAT16,
                    phi::DataType::FLOAT32,
                    phi::DataType::INT32})},
+    {"expand_modality_expert_id",
+     XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})},
+    {"build_src_rank_and_local_expert_id",
+     XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})},
+    {"moe_gate_dispatch_partial_nosoftmaxtopk",
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
+    {"moe_gate_dispatch_partial_nosoftmaxtopk_grad",
+     XPUKernelSet({phi::DataType::FLOAT32,
+                   phi::DataType::FLOAT16,
+                   phi::DataType::BFLOAT16})},
     {"blha_get_max_len",
      XPUKernelSet({phi::DataType::INT32, phi::DataType::INT64})},
     {"full_with_tensor",
(new file: XPU kernel for expand_modality_expert_id)

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ExpandModalityExpertIDKernel(const Context& dev_ctx,
+                                  const DenseTensor& expert_id,
+                                  int64_t num_expert_per_modality,
+                                  int64_t group_size,
+                                  int64_t modality_offset,
+                                  bool is_group_expert,
+                                  DenseTensor* expert_id_out) {
+  dev_ctx.template Alloc<T>(expert_id_out);
+  auto expert_id_shape = expert_id.dims();
+  int64_t seqlen = expert_id_shape[0];
+  int64_t k = expert_id_shape[1];
+
+  int r = xpu::expand_modality_expert_id(dev_ctx.x_context(),
+                                         expert_id.data<T>(),
+                                         expert_id_out->data<T>(),
+                                         seqlen,
+                                         k,
+                                         num_expert_per_modality,
+                                         group_size,
+                                         modality_offset,
+                                         is_group_expert);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "expand_modality_expert_id");
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(expand_modality_expert_id,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ExpandModalityExpertIDKernel,
+                   int,
+                   int64_t) {}
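
The wrapper is thin: it allocates the output, reads the [seqlen, k] expert-id shape, and forwards everything to the XDNN primitive xpu::expand_modality_expert_id. The primitive's semantics are not visible in this diff; purely as a reading aid, the CPU sketch below shows one plausible behavior (expanding modality-local expert ids to global ids, with optional grouped experts). It is an assumption for illustration, not the verified implementation.

    // SPECULATIVE reference for expand_modality_expert_id. Assumes: with
    // is_group_expert, top-k slot j selects within group j of size
    // group_size; the result is then shifted into this modality's slice of
    // the global expert table. None of this is confirmed by the diff; it
    // only gives the shape of computation such a primitive typically does.
    #include <cstdint>
    #include <vector>

    std::vector<int64_t> expand_modality_expert_id_ref(
        const std::vector<int64_t>& expert_id,  // flattened [seqlen, k]
        int64_t seqlen, int64_t k,
        int64_t num_expert_per_modality,
        int64_t group_size,
        int64_t modality_offset,
        bool is_group_expert) {
      std::vector<int64_t> out(expert_id.size());
      for (int64_t i = 0; i < seqlen; ++i) {
        for (int64_t j = 0; j < k; ++j) {
          int64_t id = expert_id[i * k + j];
          if (is_group_expert) id += j * group_size;  // assumed group base
          out[i * k + j] = id + modality_offset * num_expert_per_modality;
        }
      }
      return out;
    }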
(new file: XPU kernel for build_src_rank_and_local_expert_id)

Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/backends/xpu/xpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void BuildSrcRankAndLocalExpertIdKernel(
+    const Context& dev_ctx,
+    const DenseTensor& expert_num_global_tensor,
+    const std::vector<int64_t>& expert_num_global,
+    int64_t num_local_experts,
+    DenseTensor* src_rank,
+    DenseTensor* local_expert_id) {
+  int64_t token_num =
+      std::accumulate(expert_num_global.begin(), expert_num_global.end(), 0);
+
+  const int64_t* expert_num_global_tensor_data =
+      expert_num_global_tensor.data<int64_t>();
+
+  // Hard coded as ernie-core did.
+  int* src_rank_data = dev_ctx.template Alloc<int>(src_rank);
+  int* local_expert_id_data = dev_ctx.template Alloc<int>(local_expert_id);
+
+  int r = xpu::build_srcrank_and_local_expert_id(
+      dev_ctx.x_context(),
+      src_rank_data,
+      local_expert_id_data,
+      expert_num_global_tensor_data,
+      expert_num_global,
+      token_num,
+      static_cast<int64_t>(expert_num_global.size()),
+      num_local_experts);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "build_srcrank_and_local_expert_id");
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(build_src_rank_and_local_expert_id,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::BuildSrcRankAndLocalExpertIdKernel,
+                   int,
+                   int64_t) {}
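
Here the wrapper sums expert_num_global on the host to get the total token count, then lets the XDNN primitive fill one src_rank and one local_expert_id entry per token. The sketch below gives a CPU reference for the semantics the names suggest, assuming expert_num_global[e] is the token count routed to global expert e and experts are laid out rank-major (num_local_experts per rank); this is an assumption for illustration.

    // Speculative reference: expands per-expert token counts into per-token
    // (source rank, local expert id) pairs, assuming global expert e lives
    // on rank e / num_local_experts as its (e % num_local_experts)-th local
    // expert. Illustrative only; not the verified XDNN behavior.
    #include <cstdint>
    #include <vector>

    void build_src_rank_and_local_expert_id_ref(
        const std::vector<int64_t>& expert_num_global,
        int64_t num_local_experts,
        std::vector<int>* src_rank,
        std::vector<int>* local_expert_id) {
      src_rank->clear();
      local_expert_id->clear();
      int64_t num_experts = static_cast<int64_t>(expert_num_global.size());
      for (int64_t e = 0; e < num_experts; ++e) {
        for (int64_t t = 0; t < expert_num_global[e]; ++t) {
          src_rank->push_back(static_cast<int>(e / num_local_experts));
          local_expert_id->push_back(static_cast<int>(e % num_local_experts));
        }
      }
    }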

paddle/phi/kernels/xpu/moe_gate_dispatch_grad_kernel.cc

Lines changed: 3 additions & 9 deletions

@@ -1,4 +1,3 @@
-// NOLINT
 // Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -73,13 +72,8 @@ void moe_dispatch_grad(
   int64_t num_rows = scatter_index.dims()[1];

   const std::vector<int32_t> axis = {1, 0};
-  DenseTensor t_scatter_index_tmp;
-  phi::Transpose<int, Context>(
-      dev_ctx, scatter_index, axis, &t_scatter_index_tmp);
-  DenseTensor t_scatter_index_;
-  phi::ContiguousKernel<int, Context>(
-      dev_ctx, t_scatter_index_tmp, &t_scatter_index_);
-  const DenseTensor t_scatter_index = t_scatter_index_;
+  DenseTensor t_scatter_index;
+  phi::Transpose<int, Context>(dev_ctx, scatter_index, axis, &t_scatter_index);

   // output
   DenseTensor x_grad_tmp =
@@ -92,7 +86,7 @@ void moe_dispatch_grad(
   auto combine_weights_data =
       reinterpret_cast<const float*>(combine_weights.data<float>());
   auto t_scatter_index_data =
-      reinterpret_cast<const int*>(t_scatter_index_tmp.data<int>());
+      reinterpret_cast<const int*>(t_scatter_index.data<int>());
   auto combine_weights_grad_data =
       reinterpret_cast<const float*>(combine_weights_grad.data<float>());
   auto expert_id_data = reinterpret_cast<const int*>(expert_id.data<int>());
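
This hunk does two things: it drops the separate ContiguousKernel pass (the simplification appears to assume that phi::Transpose already materializes a contiguous result), and it fixes the later pointer read, which previously dereferenced the intermediate t_scatter_index_tmp, to use the single tensor that survives the refactor. For reference, the layout change the kernel relies on is a plain [k, num_rows] to [num_rows, k] transpose; a minimal standalone sketch:

    // Standalone sketch of the transpose applied to scatter_index
    // (axis = {1, 0}). Writing row-major into `out` is what makes the
    // result contiguous, the property the removed ContiguousKernel pass
    // used to guarantee explicitly. Illustrative only; not commit code.
    #include <cstdint>
    #include <vector>

    std::vector<int> transpose_scatter_index(const std::vector<int>& in,
                                             int64_t k, int64_t num_rows) {
      std::vector<int> out(in.size());
      for (int64_t i = 0; i < k; ++i)
        for (int64_t j = 0; j < num_rows; ++j)
          out[j * k + i] = in[i * num_rows + j];
      return out;
    }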

paddle/phi/kernels/xpu/moe_gate_dispatch_kernel.cc

Lines changed: 21 additions & 20 deletions

@@ -1,4 +1,3 @@
-// NOLINT
 // Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -34,30 +33,32 @@ void moe_dispatch_fwd(const Context &dev_ctx,
                       DenseTensor *expert_offset,
                       DenseTensor *expert_id,
                       bool use_pad) {
-  if (!(x.dtype() == paddle::DataType::FLOAT32 ||
-        x.dtype() == paddle::DataType::FLOAT16 ||
-        x.dtype() == paddle::DataType::BFLOAT16)) {
-    PD_THROW(
-        "Unsupported dtype for x, "
-        "currently float32, float16 and bfloat16 are supported.");
-  }
-
-  if (gate_logits.dtype() != paddle::DataType::FLOAT32) {
-    PD_THROW(
-        "Unsupported dtype for gate_logits, "
-        "currently only float32 is supported.");
-  }
+  PADDLE_ENFORCE_EQ(gate_logits.dtype(),
+                    paddle::DataType::FLOAT32,
+                    ::common::errors::InvalidArgument(
+                        "Unsupported dtype for gate_logits, "
+                        "currently only float32 is supported."));

   int64_t s = x.dims()[0];
   int64_t d = x.dims()[1];
   int64_t e = gate_logits.dims()[1];

-  if (k <= 0) PD_THROW("the k of topk must more than 0.");
-  if (capacity <= 0) PD_THROW("the capacity of each expert must more than 0.");
-  if (e < k) PD_THROW("the amount of experts must greater than k.");
-  if (k > 512) PD_THROW("currently, the k of topk must lesser than 512.");
-  if (e > 512 * 64 * 12)
-    PD_THROW("currently, he amount of experts must lesser than 393216.");
+  PADDLE_ENFORCE_GT(
+      k,
+      0,
+      ::common::errors::InvalidArgument("the k of topk must more than 0."));
+  PADDLE_ENFORCE_GT(capacity,
+                    0,
+                    ::common::errors::InvalidArgument(
+                        "the capacity of each expert must more than 0."));
+  PADDLE_ENFORCE_GE(e,
+                    k,
+                    ::common::errors::InvalidArgument(
+                        "the amount of experts must greater than k."));
+  PADDLE_ENFORCE_EQ(
+      corr_bias.is_initialized(),
+      false,
+      ::common::errors::InvalidArgument("corr_bias is not supported yet"));

   using XPUType = typename XPUTypeTrait<T>::Type;
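
The forward kernel's argument checks move from PD_THROW to the PADDLE_ENFORCE_* comparison macros, which raise a typed ::common::errors::InvalidArgument instead of an untyped exception. The hunk also adds a guard rejecting a supplied corr_bias, and drops both the explicit x dtype check and the old upper bounds on k (512) and on the expert count (393216). A minimal standalone illustration of the macro family as used here; check_dispatch_args is a hypothetical helper, not code from the commit:

    // Hypothetical helper showing the PADDLE_ENFORCE_* pattern this hunk
    // adopts. The macros and ::common::errors::InvalidArgument come in via
    // Paddle's enforce headers (enforce_xpu.h in these kernels).
    #include "paddle/phi/backends/xpu/enforce_xpu.h"

    void check_dispatch_args(int64_t k, int64_t capacity, int64_t num_experts) {
      PADDLE_ENFORCE_GT(
          k, 0, ::common::errors::InvalidArgument("k of topk must be > 0."));
      PADDLE_ENFORCE_GT(capacity,
                        0,
                        ::common::errors::InvalidArgument(
                            "capacity of each expert must be > 0."));
      PADDLE_ENFORCE_GE(num_experts,
                        k,
                        ::common::errors::InvalidArgument(
                            "number of experts must be >= k."));
    }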

(new file: XPU kernel for moe_gate_dispatch_partial_nosoftmaxtopk_grad)

Lines changed: 82 additions & 0 deletions

@@ -0,0 +1,82 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/contiguous_kernel.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MoeGateDispatchPartialNoSoftMaxTopkGradKernel(
+    const Context& dev_ctx,
+    const DenseTensor& combine_weights_out,
+    const DenseTensor& scatter_index,
+    const DenseTensor& scatter_index_rev,
+    const DenseTensor& expert_offset,
+    const DenseTensor& expert_offset_local,
+    const DenseTensor& y_grad,
+    const DenseTensor& combine_weights_out_grad,
+    int64_t k,
+    int64_t capacity,
+    bool use_pad,
+    int64_t expert_start_index,
+    int64_t expert_end_index,
+    DenseTensor* x_grad,
+    DenseTensor* combine_weights_grad) {
+  dev_ctx.template Alloc<T>(x_grad);
+  dev_ctx.template Alloc<float>(combine_weights_grad);
+  phi::Full<float, Context>(
+      dev_ctx,
+      phi::IntArray(common::vectorize(combine_weights_grad->dims())),
+      0,
+      combine_weights_grad);
+  DenseTensor t_scatter_index;
+  phi::Transpose<int, Context>(
+      dev_ctx, scatter_index, {1, 0}, &t_scatter_index);
+
+  int64_t num_rows = combine_weights_out.dims()[0];
+  int64_t hidden_size = y_grad.dims()[1];
+  int64_t num_experts = expert_offset.dims()[0];
+  int64_t num_active = y_grad.dims()[0];
+
+  using XPUDataType = typename XPUTypeTrait<T>::Type;
+  int r = xpu::moe_gate_dispatch_partial_nosoftmaxtopk_grad(
+      dev_ctx.x_context(),
+      reinterpret_cast<const XPUDataType*>(y_grad.data<T>()),
+      combine_weights_out.data<float>(),
+      t_scatter_index.data<int>(),
+      combine_weights_out_grad.data<float>(),
+      combine_weights_grad->data<float>(),
+      reinterpret_cast<XPUDataType*>(x_grad->data<T>()),
+      num_rows,
+      k,
+      hidden_size,
+      num_experts,
+      num_active);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r,
+                              "moe_gate_dispatch_partial_nosoftmaxtopk_grad");
+}
+}  // namespace phi
+
+PD_REGISTER_KERNEL(moe_gate_dispatch_partial_nosoftmaxtopk_grad,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::MoeGateDispatchPartialNoSoftMaxTopkGradKernel,
+                   float,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
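
As with the other new files, the heavy lifting happens in the XDNN primitive; the phi wrapper zero-fills combine_weights_grad, transposes scatter_index into [num_rows, k] layout, and derives the problem sizes from the tensor shapes. The primitive's semantics are not part of this diff; purely as a reading aid, here is a speculative CPU sketch of the x_grad accumulation one would expect from a top-k combine backward. It is an assumption for illustration, not the verified XDNN behavior.

    // SPECULATIVE sketch of the x_grad gather in a dispatch backward:
    // each token i accumulates, over its k expert slots, the dispatched
    // row's upstream gradient scaled by that slot's combine weight. A zero
    // weight or negative index marks a slot that was not dispatched (for
    // example, an expert outside [expert_start_index, expert_end_index))
    // and contributes nothing.
    #include <cstdint>
    #include <vector>

    void x_grad_ref(const std::vector<float>& y_grad,       // [num_active, h]
                    const std::vector<float>& combine_w,    // [num_rows, k]
                    const std::vector<int>& scatter_index,  // [num_rows, k]
                    int64_t num_rows, int64_t k, int64_t h,
                    std::vector<float>* x_grad) {           // [num_rows, h]
      x_grad->assign(num_rows * h, 0.f);
      for (int64_t i = 0; i < num_rows; ++i) {
        for (int64_t j = 0; j < k; ++j) {
          float w = combine_w[i * k + j];
          int row = scatter_index[i * k + j];
          if (w == 0.f || row < 0) continue;  // slot not dispatched
          for (int64_t c = 0; c < h; ++c)
            (*x_grad)[i * h + c] += w * y_grad[row * h + c];
        }
      }
    }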
