From 69f3721a36d20e83f9282cc7ff8f9d8154a3a59c Mon Sep 17 00:00:00 2001
From: chezhang <1376507468@qq.com>
Date: Thu, 4 Sep 2025 14:55:53 +0800
Subject: [PATCH 01/58] [fix] fix failing tests when backend is maca

---
 .../batch_norm_kernel_register.cc             |   10 +-
 .../conv_transpose_grad_kernel_register.cu    |   40 -
 .../conv_transpose_grad_kernel_register.cu    | 1114 +++++++++++++++++
 .../impl/spectral_norm_grad_kernel_impl.h     |  130 --
 .../kernels/impl/spectral_norm_kernel_impl.h  |  182 ---
 backends/metax_gpu/kernels/metax_context.cc   |    1 +
 backends/metax_gpu/kernels/metax_context.h    |    1 +
 .../instance_norm_grad_kerne_registerl.cu     |  650 ++++++++++
 .../instance_norm_kernel_register.cu          |  253 ++++
 .../spectral_norm_grad_kernel_register.cu     |   22 +
 .../spectral_norm_kernel_register.cu          |   22 +
 backends/metax_gpu/patch/paddle.patch         |  462 +++++++
 12 files changed, 2534 insertions(+), 353 deletions(-)
 delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu
 create mode 100644 backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
 delete mode 100644 backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h
 delete mode 100644 backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc b/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc
index b12f208bec0..ac3d8b95062 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc
+++ b/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc
@@ -20,4 +20,12 @@ PD_CUSTOM_KERNEL_REGISTER(batch_norm_infer,
                           ALL_LAYOUT,
                           phi::BatchNormInferKernel,
                           float,
-                          phi::dtype::float16) {}
+                          double,
+                          phi::dtype::bfloat16,
+                          phi::dtype::float16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16 ||
+      kernel_key.dtype() == phi::DataType::BFLOAT16) {
+    kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
+  }
+}
diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu
deleted file mode 100644
index dacced51df4..00000000000
--- a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu"  // NOLINT
-PD_CUSTOM_KERNEL_REGISTER(conv2d_transpose_grad,
-                          metax_gpu,
-                          ALL_LAYOUT,
-                          phi::Conv2dTransposeGradKernel,
-                          float,
-                          double) {}
-PD_CUSTOM_KERNEL_REGISTER(conv2d_transpose_double_grad,
-                          metax_gpu,
-                          ALL_LAYOUT,
-                          phi::Conv2dTransposeDoubleGradKernel,
-                          float,
-                          double) {}
-PD_CUSTOM_KERNEL_REGISTER(conv3d_transpose_grad,
-                          metax_gpu,
-                          ALL_LAYOUT,
-                          phi::Conv3dTransposeGradKernel,
-                          float,
-                          double) {}
-PD_CUSTOM_KERNEL_REGISTER(depthwise_conv2d_transpose_grad,
-                          metax_gpu,
-                          ALL_LAYOUT,
-                          phi::DepthwiseConv2dTransposeGradKernel,
-                          float,
-                          double) {}
diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
new file mode 100644
index 00000000000..0067818d165
--- /dev/null
+++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
@@ -0,0 +1,1114 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+
+#include "kernels/gpudnn/conv_cudnn_v7.h"
+#include "kernels/metax_context.h"
+#include "paddle/common/ddim.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/backends/dynload/cudnn.h"
+#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
+#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/conv_transpose_grad_kernel.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/padding.h"
+#include "paddle/phi/kernels/funcs/slice.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+
+using GPUDNNDataLayout = phi::backends::gpu::DataLayout;
+
+template <typename T, typename Context>
+void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx,
+                                      const DenseTensor& x,
+                                      const DenseTensor& filter,
+                                      const DenseTensor& dout,
+                                      const std::vector<int>& strides,
+                                      const std::vector<int>& paddings,
+                                      const std::string& padding_algorithm,
+                                      int groups,
+                                      const std::vector<int>& dilations,
+                                      const std::string& data_format,
+                                      DenseTensor* dx,
+                                      DenseTensor* dfilter) {
+  // 0-size
+  if (x.numel() == 0) {
+    if (dx) dev_ctx.template Alloc<T>(dx);
+    if (dfilter) {
+      phi::Full<T, Context>(dev_ctx,
+                            phi::IntArray(common::vectorize(dfilter->dims())),
+                            0,
+                            dfilter);
+    }
+    return;
+  }
+  if (filter.numel() == 0) {
+    if (dfilter) dev_ctx.template Alloc<T>(dfilter);
+    if (dx) {
+      phi::Full<T, Context>(
+          dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx);
+    }
+    return;
+  }
+
+  const T* filter_data = filter.data<T>();
+  std::vector<int> paddings_ = paddings;
+  std::vector<int> dilations_ =
+      dilations;  // cudnn v5 does not support dilations
+  const GPUDNNDataLayout data_layout =
+      (data_format != "NHWC"
? GPUDNNDataLayout::kNCHW + : GPUDNNDataLayout::kNHWC); + + // if channel_last, transpose to channel_first + DenseTensor x_transpose; + DenseTensor dout_transpose; + std::vector x_vec = common::vectorize(x.dims()); + std::vector out_vec = common::vectorize(dout.dims()); + if (data_layout == GPUDNNDataLayout::kNHWC) { + if (strides.size() == 2U) { + std::vector axis = {0, 3, 1, 2}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(dev_ctx, x, axis); + dout_transpose = Transpose(dev_ctx, dout, axis); + } else if (strides.size() == 3U) { + std::vector axis = {0, 4, 1, 2, 3}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(dev_ctx, x, axis); + dout_transpose = Transpose(dev_ctx, dout, axis); + } + } else { + x_transpose = x; + dout_transpose = dout; + } + + // update padding and dilation + auto x_dims = x_transpose.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims; + x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + + std::vector x_pad(x_dims.size() * 2, 0); + DenseTensor transformed_dout; + std::vector padding_common(data_dim, 0); + if (!is_sys_pad) { + std::vector padding_diff(data_dim); + std::vector new_dout_shape_vec(data_dim + 2); + new_dout_shape_vec[0] = dout_transpose.dims()[0]; + new_dout_shape_vec[1] = dout_transpose.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_dout_shape_vec[i + 2] = + dout_transpose.dims()[i + 2] + padding_diff[i]; + x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + + transformed_dout.Resize(common::make_ddim(new_dout_shape_vec)); + dev_ctx.template Alloc(&transformed_dout); + + const int rank = x_transpose.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + dev_ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + case 5: { + funcs::PadFunction( + dev_ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor.")); + } + } else { + transformed_dout = dout_transpose; + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + const T* x_data = x_transpose.data(); + const T* dout_data = transformed_dout.data(); + out_vec = common::vectorize(transformed_dout.dims()); + + // ------------------- cudnn descriptors --------------------- +#ifndef PADDLE_WITH_HIP + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_dout); + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(filter); + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(x_transpose); +#endif + + GPUDNNDataLayout layout; + + if (strides.size() == 2U) { + layout = GPUDNNDataLayout::kNCHW; + } else { + layout = 
GPUDNNDataLayout::kNCDHW; + } + + int iwo_groups = groups; + int c_groups = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + auto dtype = phi::backends::gpu::CudnnDataType::type; + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + ConvArgs args1{handle, + &transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + ConvArgs args2{handle, + &transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + +#ifdef PADDLE_WITH_HIP + SearchResult fwd_result; + SearchResult filter_result; +#else + SearchResult fwd_result; + SearchResult filter_result; +#endif + + auto layout_tensor = phi::backends::gpu::GetCudnnTensorFormat(layout); + size_t workspace_size = 0; + bool deterministic = FLAGS_cudnn_deterministic; + T* dx_data = nullptr; + T* dfilter_data = nullptr; + + if (dx) { + dx_data = dev_ctx.template Alloc(dx); + + args1.idesc.set(transformed_dout, iwo_groups); + args1.wdesc.set(filter, layout_tensor, iwo_groups); + args1.odesc.set(x_transpose, iwo_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); + fwd_result.algo = + search1::Find(args1, false, deterministic, workspace_size, dev_ctx); +#else + using search1 = SearchAlgorithm; + fwd_result = search1::Find(dev_ctx, args1, false, deterministic, false); + workspace_size = std::max( + workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); +#endif + } + + if (dfilter) { + dfilter_data = dev_ctx.template Alloc(dfilter); + + args2.idesc.set(transformed_dout, iwo_groups); + args2.wdesc.set(*dfilter, layout_tensor, iwo_groups); + args2.odesc.set(x_transpose, iwo_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_result.algo = + search2::Find(args2, false, deterministic, workspace_size, dev_ctx); +#else + using search2 = SearchAlgorithm; + filter_result = + search2::Find(dev_ctx, args2, false, deterministic, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + // FIxME(typhoonzero): template type T may not be the same as cudnn call. + int x_offset = x.numel() / x.dims()[0] / groups; + int dout_offset = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int filter_offset = filter.numel() / groups; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + // auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = GetDnnWorkspace( + const_cast(&(dev_ctx.GetAllocator())), dev_ctx.stream()); + if (dx) { +#ifdef PADDLE_WITH_HIP + // Because beta is zero, it is unnecessary to reset dx. 
+ for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionForward(handle, + &alpha, + args1.idesc.desc(), + dout_data + dout_offset * g, + args1.wdesc.desc(), + filter_data + filter_offset * g, + args1.cdesc.desc(), + fwd_result.algo, + &beta, + args1.odesc.desc(), + dx_data + x_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args1, + fwd_result, + dout_data, + filter_data, + dx_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + + if (data_layout == GPUDNNDataLayout::kNHWC) { + DenseTensor dx_transpose; + DenseTensor dx_nchw; + dx_nchw.ShareDataWith(*dx); + dx_nchw.Resize(common::make_ddim(x_vec)); + if (strides.size() == 2U) { + std::vector axis = {0, 2, 3, 1}; + dx_transpose = Transpose(dev_ctx, dx_nchw, axis); + *dx = dx_transpose; + } else if (strides.size() == 3U) { + std::vector axis = {0, 2, 3, 4, 1}; + dx_transpose = Transpose(dev_ctx, dx_nchw, axis); + *dx = dx_transpose; + } + } + } + + // ------------------- cudnn conv backward filter --------------------- + if (dfilter) { + // Because beta is zero, it is unnecessary to reset dfilter. + // Gradient with respect to the filter +#ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + x_data + x_offset * g, + args2.idesc.desc(), + dout_data + dout_offset * g, + args2.cdesc.desc(), + filter_result.algo, + &beta, + args2.wdesc.desc(), + dfilter_data + filter_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args2, + filter_result, + x_data, + dout_data, + dfilter_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + } +} + +template +void Conv2dTransposeGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(dev_ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +/* + * Inputs: I, filter, dout, ddI, ddfilter + * Outputs: ddout, dfilter, dI + * ddo = conv_bp_data(filter, ddI) + conv_bp_data(ddfilter, I) + * dfilter = conv_bp_filter(dout, ddI) + * dI = conv(dout, ddfilter) + */ +template +void Conv2dTransposeDoubleGradGPUDNNKernel( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter, + DenseTensor* ddout) { + 
if (dx) { + dev_ctx.template Alloc(dx); + } + if (dfilter) { + dev_ctx.template Alloc(dfilter); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + funcs::SetConstant set_zero; + set_zero(dev_ctx, ddout, static_cast(0)); + } + + const T* filter_ = filter.data(); + const T* dout_ = dout.data(); + const T* ddx_ = nullptr; + const T* ddfilter_ = nullptr; + T* dx_ = nullptr; + T* dfilter_ = nullptr; + T* ddout_ = nullptr; + T* transformed_dx_ = nullptr; + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + bool deterministic = FLAGS_cudnn_deterministic; + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform DenseTensors to channel first----------- + DenseTensor transformed_x_channel(x.type()); + DenseTensor transformed_dout_channel(dout.type()); + DenseTensor transformed_ddx_channel(x.type()); + + DenseTensor transformed_dx_channel(x.type()); + DenseTensor transformed_ddout_channel(dout.type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &x, &transformed_x_channel); + TransToChannelFirst(dev_ctx, &x, &transformed_x_channel); + + ResizeToChannelFirst(dev_ctx, &dout, &transformed_dout_channel); + TransToChannelFirst(dev_ctx, &dout, &transformed_dout_channel); + + ResizeToChannelFirst(dev_ctx, &ddx, &transformed_ddx_channel); + TransToChannelFirst(dev_ctx, &ddx, &transformed_ddx_channel); + + if (dx) { + ResizeToChannelFirst(dev_ctx, dx, &transformed_dx_channel); + dev_ctx.template Alloc(&transformed_dx_channel); + } + if (ddout) { + ResizeToChannelFirst( + dev_ctx, ddout, &transformed_ddout_channel); + } + } else { + transformed_x_channel = x; + transformed_dout_channel = dout; + transformed_ddx_channel = ddx; + + if (dx) { + transformed_dx_channel = *dx; + } + } + std::vector out_vec = + common::vectorize(transformed_dout_channel.dims()); + + auto x_dims = transformed_x_channel.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + DenseTensor transformed_x(x.type()); + DenseTensor transformed_ddx(x.type()); + + DenseTensor transformed_dout(dout.type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(x.dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + std::vector new_output_grad_shape_vec(data_dim + 2); + + new_input_shape_vec[0] = transformed_x_channel.dims()[0]; + new_input_shape_vec[1] = transformed_x_channel.dims()[1]; + + new_output_grad_shape_vec[0] = transformed_dout_channel.dims()[0]; + new_output_grad_shape_vec[1] = transformed_dout_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_x_channel.dims()[i + 2] + padding_diff[i]; + + new_output_grad_shape_vec[i + 2] = + transformed_dout_channel.dims()[i + 2] + padding_diff[i]; + + input_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + DDim 
new_input_shape(common::make_ddim(new_input_shape_vec)); + transformed_x.Resize(new_input_shape); + transformed_ddx.Resize(new_input_shape); + transformed_dout.Resize(common::make_ddim(new_output_grad_shape_vec)); + + dev_ctx.template Alloc(&transformed_x); + dev_ctx.template Alloc(&transformed_ddx); + dev_ctx.template Alloc(&transformed_dout); + + // pad for input + const int rank = x.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(dev_ctx, + input_pad, + transformed_x_channel, + pad_value, + &transformed_x); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_dout_channel, + pad_value, + &transformed_dout); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + case 5: { + funcs::PadFunction(dev_ctx, + input_pad, + transformed_x_channel, + pad_value, + &transformed_x); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + } else { + transformed_x = transformed_x_channel; + transformed_dout = transformed_dout_channel; + transformed_ddx = transformed_ddx_channel; + + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + std::vector starts(data_dim, 0); + std::vector ends(data_dim, 0); + std::vector axes(data_dim, 0); + for (size_t i = 0; i < data_dim; ++i) { + starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); + ends[i] = starts[i] + out_vec[i + 2]; + axes[i] = i + 2; + } + + std::vector transformed_out_vec = out_vec; + for (size_t i = 0; i < data_dim; ++i) { + transformed_out_vec[i + 2] = + out_vec[i + 2] + + (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - + 2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1]; + } + + if (!is_sys_pad) { + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); + dev_ctx.template Alloc(&transformed_ddout_channel); + } else { + dev_ctx.template Alloc(ddout); + transformed_ddout_channel = *ddout; + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); + } + + const T* x_ = transformed_x.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = phi::backends::gpu::CudnnDataType::type; + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + auto layout = + phi::backends::gpu::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); + + ConvArgs args1{handle, + &transformed_ddout_channel, + &filter, + &transformed_ddx, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args2{handle, + &transformed_ddout_channel, + &ddfilter, + &transformed_x, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + + ConvArgs args3{handle, + &transformed_dout, + dfilter, + &transformed_ddx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args4{handle, + &transformed_dout, + &ddfilter, + &transformed_dx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; +#ifdef PADDLE_WITH_HIP + SearchResult 
bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; +#else + SearchResult bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; +#endif + + // ddo = conv(ddI, filter) + conv(I, ddfilter) + size_t workspace_size = 0; + + T* transformed_ddout_channel_ = nullptr; + + if (ddout) { + ddout_ = ddout->data(); + transformed_ddout_channel_ = transformed_ddout_channel.data(); + + args1.idesc.set(transformed_ddout_channel, iwo_group); + args1.wdesc.set(filter, layout, iwo_group); + args1.odesc.set(transformed_ddx, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + bwd_result1.algo = + search1::Find(args1, false, deterministic, workspace_size, dev_ctx); +#else + using search1 = SearchAlgorithm; + bwd_result1 = search1::Find(dev_ctx, args1, false, deterministic, false); + workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); +#endif + + ddfilter_ = ddfilter.data(); + args2.handle = handle; + args2.idesc.set(transformed_ddout_channel, iwo_group); + args2.wdesc.set(ddfilter, layout, iwo_group); + args2.odesc.set(transformed_x, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + bwd_result2.algo = + search2::Find(args2, false, deterministic, workspace_size, dev_ctx); +#else + using search2 = SearchAlgorithm; + bwd_result2 = search2::Find(dev_ctx, args2, false, deterministic, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); +#endif + } + + if (dfilter) { + dfilter_ = dfilter->data(); + + args3.idesc.set(transformed_dout, iwo_group); + args3.wdesc.set(*dfilter, layout, iwo_group); + args3.odesc.set(transformed_ddx_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search3 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_result.algo = + search3::Find(args3, false, deterministic, workspace_size, dev_ctx); +#else + using search3 = SearchAlgorithm; + filter_result = + search3::Find(dev_ctx, args3, false, deterministic, false); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); +#endif + } + + if (dx) { + transformed_dx_ = transformed_dx_channel.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dout, iwo_group); + args4.wdesc.set(ddfilter, layout, iwo_group); + args4.odesc.set(transformed_dx_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search4 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + fwd_result.algo = + search4::Find(args4, false, deterministic, workspace_size, dev_ctx); +#else + using search4 = SearchAlgorithm; + fwd_result = search4::Find(dev_ctx, args4, false, deterministic, false); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW(transformed_x.dims(), + 
GPUDNNDataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dout.dims(), + GPUDNNDataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = + transformed_x.numel() / transformed_x.dims()[0] / groups; + int group_offset_out = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int group_offset_filter = filter.numel() / groups; + + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + + // auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = GetDnnWorkspace( + const_cast(&(dev_ctx.GetAllocator())), dev_ctx.stream()); + if (ddout) { + ddx_ = transformed_ddx.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + ddx_ + i * group_offset_in, + args1.wdesc.desc(), + filter_ + i * group_offset_filter, + args1.cdesc.desc(), + bwd_result1.algo, + &beta, + args1.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args1, + bwd_result1, + ddx_, + filter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + // MIOPEN ONLY support beta to be 0.0f + DenseTensor conv_x_ddfilter(dout.type()); + conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); + T* conv_x_ddfilter_data = dev_ctx.template Alloc(&conv_x_ddfilter); + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args2.odesc.desc(), + x_ + i * group_offset_in, + args2.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args2.cdesc.desc(), + bwd_result2.algo, + &beta, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + &alpha, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + &beta, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out)); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args2, + bwd_result2, + x_, + ddfilter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + true); +#endif // PADDLE_WITH_HIP + + if ((!is_sys_pad) && (!channel_last)) { + if (strides.size() == 2U) { + funcs::Slice( + dev_ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice( + dev_ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } + } else if ((!is_sys_pad) && (channel_last)) { + if (strides.size() == 2U) { + funcs::Slice(dev_ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice(dev_ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } + + TransToChannelLast( + dev_ctx, &transformed_ddout_channel, ddout); 
+ } + } + + T* transformed_dout_channel_ = transformed_dout.data(); + if (dfilter) { + ddx_ = transformed_ddx_channel.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + ddx_ + i * group_offset_in, + args3.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args3.cdesc.desc(), + filter_result.algo, + &beta, + args3.wdesc.desc(), + dfilter_ + i * group_offset_filter, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args3, + filter_result, + ddx_, + transformed_dout_channel_, + dfilter_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + } + + if (dx) { + ddfilter_ = ddfilter.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( + handle, + &alpha, + args4.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args4.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args4.cdesc.desc(), + fwd_result.algo, + &beta, + args4.odesc.desc(), + transformed_dx_ + i * group_offset_in, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args4, + fwd_result, + transformed_dout_channel_, + ddfilter_, + transformed_dx_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_dx_channel, dx); + } + } +} + +template +void Conv3dTransposeGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(dev_ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +} // namespace phi + +using float16 = phi::dtype::float16; + +PD_REGISTER_PLUGIN_KERNEL(conv2d_transpose_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv2dTransposeGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(conv2d_transpose_double_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv2dTransposeDoubleGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(conv3d_transpose_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv3dTransposeGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} diff --git a/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h deleted file mode 100644 index 03651be95c3..00000000000 --- a/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/impl/spectral_norm_kernel_impl.h" - -namespace phi { - -template -void SpectralNormGradKernel(const Context& dev_ctx, - const DenseTensor& weight, - const DenseTensor& u, - const DenseTensor& v, - const DenseTensor& out_grad, - int dim, - int power_iters, - float eps, - DenseTensor* weight_grad) { - auto& place = *dev_ctx.eigen_device(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - const int h = u.dims()[0]; - const int w = v.dims()[0]; - - DenseTensor weight_mat, out_grad_mat; - auto dims = weight.dims(); - const int rank = dims.size(); - std::vector real_dims; - if (dim != 0) { - std::vector perm; - perm.push_back(dim); - real_dims.push_back(dims[dim]); - for (int i = 0; i < rank; i++) { - if (i != dim) { - perm.push_back(i); - real_dims.push_back(dims[i]); - } - } - weight_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&weight_mat); - out_grad_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&out_grad_mat); - TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat); - TransCompute2DTo5D( - dev_ctx, out_grad, rank, perm, &out_grad_mat); - } else { - for (int i = 0; i < rank; i++) { - real_dims.push_back(i); - } - phi::Copy(dev_ctx, weight, dev_ctx.GetPlace(), true, &weight_mat); - phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), true, &out_grad_mat); - } - weight_mat = weight_mat.Resize({h, w}); - out_grad_mat = out_grad_mat.Resize({h, w}); - - DenseTensor sigma; - sigma.Resize(weight_mat.dims()); - dev_ctx.template Alloc(&sigma); - DenseTensor uu, vv; - phi::Copy(dev_ctx, u, dev_ctx.GetPlace(), true, &uu); - phi::Copy(dev_ctx, v, dev_ctx.GetPlace(), true, &vv); - CalcMatrixSigmaAndNormWeight(dev_ctx, - &weight_mat, - &(uu.Resize({h, 1})), - &(vv.Resize({w, 1})), - &sigma, - power_iters, - eps); - - DenseTensor uv; - uv.Resize({h, w}); - dev_ctx.template Alloc(&uv); - blas.MatMul( - uu.Resize({h, 1}), false, vv.Resize({w, 1}), false, T(1), &uv, T(0)); - - DenseTensor weight_grad_mat; - weight_grad_mat.Resize({h, w}); - dev_ctx.template Alloc(&weight_grad_mat); - auto weight_grad_mat_t = EigenTensor::From(weight_grad_mat); - auto weight_mat_t = EigenTensor::From(weight_mat); - auto out_grad_mat_t = EigenTensor::From(out_grad_mat); - auto sigma_t = EigenTensor::From(sigma); - auto uv_t = EigenTensor::From(uv); - weight_mat_t.device(place) = - weight_mat_t.sum().eval().reshape(Array2(1, 1)).broadcast(Array2(h, w)); - weight_grad_mat_t.device(place) = - out_grad_mat_t * (out_grad_mat_t.constant(1.0) - uv_t * weight_mat_t) / - sigma_t; - - if (dim != 0) { - std::vector perm; - for (int i = 0; i < rank; i++) { - if (i < dim) { - perm.push_back(i + 1); - } else if (i == dim) { - perm.push_back(0); - } else { - perm.push_back(i); - } - } - weight_grad->Resize(dims); - dev_ctx.template Alloc(weight_grad); - TransCompute2DTo5D( - dev_ctx, - weight_grad_mat.Resize(common::make_ddim(real_dims)), - rank, - perm, - weight_grad); - } else { - phi::Copy(dev_ctx, - 
weight_grad_mat.Resize(dims), - dev_ctx.GetPlace(), - true, - weight_grad); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h b/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h deleted file mode 100644 index 8c9fc548259..00000000000 --- a/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/funcs/blas/blas.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; -using IndexPair = Eigen::IndexPair; - -template -static inline void TransCompute2DTo5D(const Context& dev_ctx, - const DenseTensor& in, - const int rank, - const std::vector& perm, - DenseTensor* out) { - if (rank <= 1 || rank > 5) { - PADDLE_THROW(common::errors::Fatal( - "Weight rank of SpectralNorm should be in range [2, 5], but got %d.", - rank)); - } - - switch (rank) { - case 2: - phi::funcs::Transpose trans2; - trans2(dev_ctx, in, out, perm); - break; - case 3: - phi::funcs::Transpose trans3; - trans3(dev_ctx, in, out, perm); - break; - case 4: - phi::funcs::Transpose trans4; - trans4(dev_ctx, in, out, perm); - break; - case 5: - phi::funcs::Transpose trans5; - trans5(dev_ctx, in, out, perm); - break; - default: - break; - } -} - -template -static inline void CalcMatrixSigmaAndNormWeight(const Context& dev_ctx, - DenseTensor* weight, - DenseTensor* u, - DenseTensor* v, - DenseTensor* sigma, - const int power_iters, - const float eps) { - auto& place = *dev_ctx.eigen_device(); - auto blas = funcs::GetBlas(dev_ctx); - auto sigma_t = EigenTensor::From(*sigma); - auto weight_t = EigenTensor::From(*weight); - auto u_t = EigenTensor::From(*u); - auto v_t = EigenTensor::From(*v); - - const int h = weight->dims()[0]; - const int w = weight->dims()[1]; - - for (int i = 0; i < power_iters; i++) { - // V = W^T * U / ||W^T * U||_2 - blas.MatMul(*weight, true, *u, false, T(1), v, T(0)); - auto v_t_norm = - v_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( - Array1(w)); - v_t.device(place) = v_t / (v_t_norm + v_t_norm.constant(eps)); - // U = W^T * V / ||W^T * V||_2 - blas.MatMul(*weight, false, *v, false, T(1), u, T(0)); - auto u_t_norm = - u_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( - Array1(h)); - u_t.device(place) = u_t / (u_t_norm + u_t_norm.constant(eps)); - } - DenseTensor weight_v; - weight_v.Resize({h, 1}); - dev_ctx.template Alloc(&weight_v); - blas.MatMul(*weight, false, *v, false, T(1), &weight_v, T(0)); - auto weight_v_t = EigenTensor::From(weight_v); - sigma_t.device(place) = (u_t * weight_v_t) - .sum() - .eval() - .reshape(Array2(1, 1)) - .broadcast(Array2(h, w)); - weight_t.device(place) = weight_t / sigma_t; -} - -template -void SpectralNormKernel(const 
Context& dev_ctx, - const DenseTensor& weight, - const DenseTensor& u, - const DenseTensor& v, - int dim, - int power_iters, - float eps, - DenseTensor* out) { - const int h = u.dims()[0]; - const int w = v.dims()[0]; - - DenseTensor weight_mat; - auto dims = weight.dims(); - const int rank = dims.size(); - std::vector real_dims; - if (dim != 0) { - std::vector perm; - perm.push_back(dim); - real_dims.push_back(dims[dim]); - for (int i = 0; i < rank; i++) { - if (i != dim) { - perm.push_back(i); - real_dims.push_back(dims[i]); - } - } - weight_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&weight_mat); - TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat); - } else { - for (int i = 0; i < rank; i++) { - real_dims.push_back(i); - } - phi::Copy(dev_ctx, weight, dev_ctx.GetPlace(), true, &weight_mat); - } - weight_mat = weight_mat.Resize({h, w}); - - DenseTensor sigma; - sigma.Resize(weight_mat.dims()); - dev_ctx.template Alloc(&sigma); - DenseTensor uu, vv; - phi::Copy(dev_ctx, u, dev_ctx.GetPlace(), true, &uu); - phi::Copy(dev_ctx, v, dev_ctx.GetPlace(), true, &vv); - CalcMatrixSigmaAndNormWeight(dev_ctx, - &weight_mat, - &(uu.Resize({h, 1})), - &(vv.Resize({w, 1})), - &sigma, - power_iters, - eps); - - if (dim != 0) { - std::vector perm; - for (int i = 0; i < rank; i++) { - if (i < dim) { - perm.push_back(i + 1); - } else if (i == dim) { - perm.push_back(0); - } else { - perm.push_back(i); - } - } - out->Resize(dims); - dev_ctx.template Alloc(out); - TransCompute2DTo5D( - dev_ctx, - weight_mat.Resize(common::make_ddim(real_dims)), - rank, - perm, - out); - } else { - phi::Copy(dev_ctx, weight_mat.Resize(dims), dev_ctx.GetPlace(), true, out); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_context.cc index 9bd26a170c5..4df4d88b0b4 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_context.cc @@ -15,6 +15,7 @@ #include "kernels/metax_context.h" namespace phi { +bool AllowTF32Cudnn() { return false; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_context.h index 21e9084a977..5974aadcc41 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_context.h @@ -128,6 +128,7 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } +bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu new file mode 100644 index 00000000000..d7540d949a9 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -0,0 +1,650 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "glog/logging.h"
+#include "kernels/metax_context.h"
+#include "paddle/common/layout.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/norm_utils.h"
+#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
+#include "paddle/phi/kernels/instance_norm_grad_kernel.h"
+
+namespace phi {
+template <typename T, int BlockDim>
+static __global__ void GradComputeDX(const T *dy,
+                                     const BatchNormParamType<T> *scale,
+                                     const BatchNormParamType<T> *mean,
+                                     const T *x,
+                                     const BatchNormParamType<T> *variance,
+                                     const int C,
+                                     const int sample_size,
+                                     T *dx) {
+  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
+  int end_idx = (blockIdx.x + 1) * sample_size;
+  int ncid = blockIdx.x;
+  int c = ncid % C;
+  BatchNormParamType<T> mean_val = mean[ncid];
+  BatchNormParamType<T> inv_var_val = variance[ncid];
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage dy_storage;
+  __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage;
+  __shared__ BatchNormParamType<T> dy_sum_val;
+  __shared__ BatchNormParamType<T> dy_x_sub_mean_sum_val;
+  BatchNormParamType<T> dy_sum = static_cast<BatchNormParamType<T>>(0);
+  BatchNormParamType<T> dy_x_sub_mean_sum =
+      static_cast<BatchNormParamType<T>>(0);
+
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    BatchNormParamType<T> dy_i = static_cast<BatchNormParamType<T>>(dy[i]);
+    dy_sum += dy_i;
+    dy_x_sub_mean_sum +=
+        dy_i * (static_cast<BatchNormParamType<T>>(x[i]) - mean_val);
+  }
+  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+  dy_x_sub_mean_sum =
+      BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum());
+  if (threadIdx.x == 0) {
+    dy_sum_val = dy_sum;
+    dy_x_sub_mean_sum_val = dy_x_sub_mean_sum;
+  }
+  __syncthreads();
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    dx[i] = static_cast<T>(
+        (static_cast<BatchNormParamType<T>>(dy[i]) -
+         dy_sum_val / static_cast<BatchNormParamType<T>>(sample_size) -
+         (static_cast<BatchNormParamType<T>>(x[i]) - mean_val) *
+             dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) *
+        scale[c] * inv_var_val);
+  }
+}
+
+static __device__ __forceinline__ float real_sqrt(float x) {
+  return 1. / sqrtf(x);
+}
+static __device__ __forceinline__ double real_sqrt(double x) {
+  return 1. / sqrt(x);
+}
+
+template <typename T, typename AccT, int BlockDim>
+__global__ void DoubleGradComputeDX(const T *x,
+                                    const AccT *mean,
+                                    const AccT *variance,
+                                    const T *ddx,
+                                    const T *dy,
+                                    const AccT *scale,
+                                    const AccT *ddscale,
+                                    int C,
+                                    int sample_size,
+                                    const double epsilon,
+                                    T *dx) {
+  int beg_idx = blockIdx.x * sample_size + threadIdx.x;
+  int end_idx = (blockIdx.x + 1) * sample_size;
+  int ncid = blockIdx.x;
+  int c = ncid % C;
+
+  AccT mean_val = mean[ncid];
+  AccT var_val = variance[ncid];
+
+  typedef cub::BlockReduce<AccT, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage dy_storage;
+  __shared__ typename BlockReduce::TempStorage ddx_storage;
+  __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage;
+  __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage;
+  __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage;
+  __shared__ AccT dy_sum_val;
+  __shared__ AccT ddx_sum_val;
+  __shared__ AccT dy_mul_ddx_sum_val;
+  __shared__ AccT dy_mul_x_sub_mean_sum_val;
+  __shared__ AccT ddx_mul_x_sub_mean_sum_val;
+
+  AccT dy_sum = 0;
+  AccT ddx_sum = 0;
+  AccT dy_mul_ddx_sum = 0;
+  AccT dy_mul_x_sub_mean_sum = 0;
+  AccT ddx_mul_x_sub_mean_sum = 0;
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    AccT ddx_i = static_cast<AccT>(ddx[i]);
+    AccT dy_i = static_cast<AccT>(dy[i]);
+    AccT tmp = static_cast<AccT>(x[i]) - mean_val;
+
+    dy_sum += dy_i;
+    ddx_sum += ddx_i;
+    dy_mul_ddx_sum += (ddx_i * dy_i);
+
+    dy_mul_x_sub_mean_sum += (dy_i * tmp);
+    ddx_mul_x_sub_mean_sum += (ddx_i * tmp);
+  }
+
+  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+  ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum());
+  dy_mul_ddx_sum =
+      BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum());
+  dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
+                              .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
+  ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage)
+                               .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum());
+
+  if (threadIdx.x == 0) {
+    dy_sum_val = dy_sum;
+    ddx_sum_val = ddx_sum;
+    dy_mul_ddx_sum_val = dy_mul_ddx_sum;
+    dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
+    ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum;
+  }
+  __syncthreads();
+
+  if (ddx != nullptr) {
+    for (int i = beg_idx; i < end_idx; i += BlockDim) {
+      AccT tmp = static_cast<AccT>(dx[i]);
+      tmp +=
+          ((static_cast<AccT>(x[i]) - mean_val) * var_val * var_val * var_val /
+               sample_size *
+               (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val +
+                3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - static_cast(dy[i])) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * + (ddx_sum_val / sample_size - static_cast(ddx[i]))) * + scale[c]; + dx[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(dx[i]); + tmp += (static_cast(dy[i]) * var_val - + dy_sum_val / sample_size * var_val - + (static_cast(x[i]) - mean_val) * var_val * + dy_mul_x_sub_mean_sum_val * var_val / sample_size) * + ddscale[c]; + dx[i] = static_cast(tmp); + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, + const AccT *mean, + const AccT *variance, + const AccT *ddscale, + const AccT *ddbias, + const T *ddx, + const AccT *scale, + int C, + int sample_size, + const double epsilon, + T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ AccT ddx_sum_val; + __shared__ AccT ddx_mul_x_sub_mean_sum_val; + + AccT ddx_sum = 0; + AccT ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT ddx_i = static_cast(ddx[i]); + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (static_cast(x[i]) - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(ddy[i]); + tmp += scale[c] * var_val * + (static_cast(ddx[i]) - ddx_sum_val / sample_size - + (static_cast(x[i]) - mean_val) * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size); + ddy[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(ddy[i]); + tmp += (static_cast(x[i]) - mean_val) * var_val * ddscale[c]; + ddy[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] = static_cast(static_cast(ddy[i]) + ddbias[c]); + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, + const AccT *mean, + const AccT *variance, + const T *ddx, + const T *dy, + int C, + int sample_size, + const double epsilon, + AccT *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ AccT dy_sum_val; + __shared__ AccT dy_mul_x_sub_mean_sum_val; + + AccT dy_sum = 0; + AccT 
dy_mul_x_sub_mean_sum = 0;
+  for (int i = beg_idx; i < end_idx; i += BlockDim) {
+    AccT dy_i = static_cast<AccT>(dy[i]);
+    dy_sum += dy_i;
+    dy_mul_x_sub_mean_sum += (dy_i * (static_cast<AccT>(x[i]) - mean_val));
+  }
+  dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum());
+  dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage)
+                              .Reduce(dy_mul_x_sub_mean_sum, cub::Sum());
+
+  if (threadIdx.x == 0) {
+    dy_sum_val = dy_sum;
+    dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum;
+  }
+  __syncthreads();
+  if (ddx != nullptr) {
+    AccT dscale_tmp = 0;
+    for (int i = beg_idx; i < end_idx; i += BlockDim) {
+      dscale_tmp +=
+          static_cast<AccT>(ddx[i]) * var_val *
+          (static_cast<AccT>(dy[i]) - dy_sum_val / sample_size -
+           dy_mul_x_sub_mean_sum_val * (static_cast<AccT>(x[i]) - mean_val) *
+               var_val * var_val / sample_size);
+    }
+    dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum());
+    if (threadIdx.x == 0) {
+      dscale[ncid] += dscale_tmp;
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, typename Context>
+void InstanceNormGradKernel(const Context &dev_ctx,
+                            const DenseTensor &x,
+                            const paddle::optional<DenseTensor> &scale,
+                            const paddle::optional<DenseTensor> &bias UNUSED,
+                            const DenseTensor &saved_mean,
+                            const DenseTensor &saved_variance,
+                            const DenseTensor &d_y,
+                            float epsilon_f,
+                            DenseTensor *d_x,
+                            DenseTensor *d_scale,
+                            DenseTensor *d_bias) {
+  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  double epsilon = static_cast<double>(epsilon_f);
+  const auto *scale_ptr = scale.get_ptr();
+
+  const auto &x_dims = x.dims();
+
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  int NxC = N * C;
+
+  DenseTensor x_tmp, d_y_tmp;
+  x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
+  d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D});
+
+  phi::funcs::SetConstant<Context, AccT> set_constant;
+
+  dev_ctx.template Alloc<T>(d_x);
+  if (x.numel() == 0) {
+    if (d_scale) {
+      dev_ctx.template Alloc<AccT>(d_scale);
+      set_constant(dev_ctx, d_scale, static_cast<AccT>(0));
+    }
+    if (d_bias) {
+      dev_ctx.template Alloc<AccT>(d_bias);
+      set_constant(dev_ctx, d_bias, static_cast<AccT>(0));
+    }
+    return;
+  }
+  if (d_scale && d_bias) {
+    dev_ctx.template Alloc<AccT>(d_scale);
+    dev_ctx.template Alloc<AccT>(d_bias);
+  }
+
+  if (scale_ptr) {
+    PADDLE_ENFORCE_EQ(
+        scale_ptr->dims().size(),
+        1UL,
+        common::errors::InvalidArgument(
+            "The `shape` in InstanceNormOp is invalid: "
+            "the size of scale's dimensions must be equal to 1. But "
+            "received: the size of scale's dimensions "
+            "is [%d]",
+            scale_ptr->dims().size()));
+    PADDLE_ENFORCE_EQ(scale_ptr->dims()[0],
+                      C,
+                      common::errors::InvalidArgument(
+                          "The `shape` in InstanceNormOp is invalid: "
+                          "the first dimension of scale must be equal to "
+                          "Channels([%d]). But received: "
+                          "the first dimension of scale is [%d], "
+                          "the dimensions of scale are [%s].",
+                          C,
+                          scale_ptr->dims()[0],
+                          scale_ptr->dims()));
+  }
+
+  const int n = x.numel();
+  const int block = 512;
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  const int grid = std::min(NxC, max_blocks);
+  const int grid1 = (C + block - 1) / block;
+
+  DenseTensor scale_tmp;
+  scale_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&scale_tmp);
+
+  DenseTensor d_scale_tmp;
+  d_scale_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&d_scale_tmp);
+
+  DenseTensor d_bias_tmp;
+  d_bias_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&d_bias_tmp);
+  if (scale_ptr) {
+    repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
+        scale_ptr->data<AccT>(), scale_tmp.data<AccT>(), N, C);
+  } else {
+    set_constant(dev_ctx, &scale_tmp, static_cast<AccT>(1));
+  }
+  std::vector<int> dims;
+  std::vector<int> strides;
+  dims = {1, NxC, H, W, D};
+  strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
+
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t data_desc_;
+  miopenTensorDescriptor_t in_param_desc_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
+#else
+  cudnnTensorDescriptor_t data_desc_;
+  cudnnTensorDescriptor_t in_param_desc_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
+#endif
+
+  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+    LOG(ERROR) << "Provided epsilon is smaller than "
+               << "CUDNN_BN_MIN_EPSILON. Setting it to "
+               << "CUDNN_BN_MIN_EPSILON instead.";
+  }
+  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      const_cast<int *>(dims.data()),
+      const_cast<int *>(strides.data())));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
+      in_param_desc_, data_desc_, miopenBNSpatial));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      dims.data(),
+      strides.data()));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+      in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
+#endif
+  const auto *saved_mean_data =
+      saved_mean.template data<BatchNormParamType<T>>();
+  const auto *saved_var_data =
+      saved_variance.template data<BatchNormParamType<T>>();
+
+  if (d_scale && d_bias) {
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward(
+        GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+        miopenBNSpatial,
+        CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(),
+        CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(),
+        data_desc_,
+        x_tmp.template data<T>(),
+        data_desc_,
+        d_y_tmp.template data<T>(),
+        data_desc_,
+        d_x->template data<T>(),
+        in_param_desc_,
+        scale_tmp.template data<BatchNormParamType<T>>(),
+        d_scale_tmp.template data<BatchNormParamType<T>>(),
+        d_bias_tmp.template data<BatchNormParamType<T>>(),
+        epsilon,
+        saved_mean_data,
+        saved_var_data));
+#else
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward(
+        GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+        CUDNN_BATCHNORM_SPATIAL,
+        CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(),
+        CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(),
+        data_desc_,
+        x_tmp.template data<T>(),
+        data_desc_,
+        d_y_tmp.template data<T>(),
+        data_desc_,
+        d_x->template data<T>(),
+        in_param_desc_,
+        scale_tmp.template data<BatchNormParamType<T>>(),
+        d_scale_tmp.template data<BatchNormParamType<T>>(),
+        d_bias_tmp.template data<BatchNormParamType<T>>(),
+        epsilon,
+        saved_mean_data,
+        saved_var_data));
+#endif
+  } else {
+    if (d_x) {
+      GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
+          d_y.data<T>(),
+          scale_tmp.data<BatchNormParamType<T>>(),
+          saved_mean_data,
+          x.data<T>(),
+          saved_var_data,
+          C,
+          H * W * D,
+          d_x->data<T>());
+    }
+  }
+  if (d_scale && d_bias) {
+    add_param<AccT, block><<<grid1, block, 0, dev_ctx.stream()>>>(
+        d_scale_tmp.data<AccT>(), d_scale->data<AccT>(), N, C);
+    add_param<AccT, block><<<grid1, block, 0, dev_ctx.stream()>>>(
+        d_bias_tmp.data<AccT>(), d_bias->data<AccT>(), N, C);
+  }
+
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
+#endif
+}
+
+template <typename T, typename Context>
+void InstanceNormDoubleGradKernel(const Context &dev_ctx,
+                                  const DenseTensor &x,
+                                  const paddle::optional<DenseTensor> &scale,
+                                  const DenseTensor &saved_mean,
+                                  const DenseTensor &saved_variance,
+                                  const DenseTensor &dy,
+                                  const paddle::optional<DenseTensor> &ddx,
+                                  const paddle::optional<DenseTensor> &ddscale,
+                                  const paddle::optional<DenseTensor> &ddbias,
+                                  float epsilon_f,
+                                  DenseTensor *dx,
+                                  DenseTensor *dscale,
+                                  DenseTensor *ddy) {
+  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  const auto *Scale = scale.get_ptr();
+  const auto *ddX = ddx.get_ptr();
+  const auto *ddScale = ddscale.get_ptr();
+  const auto *ddBias = ddbias.get_ptr();
+  const double epsilon = static_cast<double>(epsilon_f);
+  const T *x_data = x.data<T>();
+  const T *dy_data = dy.data<T>();
+  const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>());
+  const AccT *ddscale_data =
+      (ddScale == nullptr ? nullptr : ddScale->data<AccT>());
+  const AccT *ddbias_data =
+      (ddBias == nullptr ? nullptr : ddBias->data<AccT>());
+  const AccT *mean_data = saved_mean.data<AccT>();
+  const AccT *variance_data = saved_variance.data<AccT>();
+  phi::funcs::SetConstant<Context, T> set_zero;
+  phi::funcs::SetConstant<Context, AccT> set_zero_AccT;
+
+  auto &x_dims = x.dims();
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  int NxC = N * C;
+  const int n = x.numel();
+  int sample_size = n / N / C;
+
+  DenseTensor scale_tmp;
+  if (!Scale) {
+    scale_tmp.Resize({C});
+    dev_ctx.template Alloc<AccT>(&scale_tmp);
+    set_zero_AccT(dev_ctx, &scale_tmp, static_cast<AccT>(1));
+  }
+  const AccT *scale_data = Scale ? Scale->data<AccT>() : scale_tmp.data<AccT>();
+  const int block = 512;
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  const int grid = NxC;
+  const int grid1 = (C + block - 1) / block;
+
+  if (dx) {
+    T *dx_data = dev_ctx.template Alloc<T>(dx);
+    set_zero(dev_ctx, dx, static_cast<T>(0));
+    DoubleGradComputeDX<T, AccT, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddx_data,
+                                               dy_data,
+                                               scale_data,
+                                               ddscale_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               dx_data);
+  }
+  if (dscale) {
+    DenseTensor dscale_tmp;
+    dscale_tmp.Resize({NxC});
+    dev_ctx.template Alloc<AccT>(&dscale_tmp);
+    set_zero_AccT(dev_ctx, &dscale_tmp, static_cast<AccT>(0));
+    AccT *dscale_tmp_data = dscale_tmp.data<AccT>();
+
+    AccT *dscale_data = dev_ctx.template Alloc<AccT>(dscale);
+    set_zero_AccT(dev_ctx, dscale, static_cast<AccT>(0));
+    DoubleGradComputeDScale<T, AccT, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddx_data,
+                                               dy_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               dscale_tmp_data);
+    add_param<AccT, block><<<grid1, block, 0, dev_ctx.stream()>>>(
+        dscale_tmp.data<AccT>(), dscale->data<AccT>(), N, C);
+  }
+  if (ddy) {
+    T *ddy_data = dev_ctx.template Alloc<T>(ddy);
+    set_zero(dev_ctx, ddy, static_cast<T>(0));
+    DoubleGradComputeDDY<T, AccT, block>
+        <<<grid, block, 0, dev_ctx.stream()>>>(x_data,
+                                               mean_data,
+                                               variance_data,
+                                               ddscale_data,
+                                               ddbias_data,
+                                               ddx_data,
+                                               scale_data,
+                                               C,
+                                               sample_size,
+                                               epsilon,
+                                               ddy_data);
+  }
+}
+}  // namespace phi
+
+PD_REGISTER_PLUGIN_KERNEL(instance_norm_grad,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::InstanceNormGradKernel,
+                          float,
+                          double,
+                          phi::dtype::float16,
+                          phi::dtype::bfloat16) {}
+PD_REGISTER_PLUGIN_KERNEL(instance_norm_double_grad,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::InstanceNormDoubleGradKernel,
+                          float,
+                          double,
+                          phi::dtype::float16,
+                          phi::dtype::bfloat16) {}
diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
new file mode 100644
index 00000000000..db975d74665
--- /dev/null
+++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
@@ -0,0 +1,253 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "glog/logging.h"
+#include "kernels/metax_context.h"
+#include "paddle/common/layout.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/norm_utils.h"
+#include "paddle/phi/kernels/gpu/instance_norm_utils.h"
+#include "paddle/phi/kernels/instance_norm_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void InstanceNormKernel(const Context &dev_ctx,
+                        const DenseTensor &x,
+                        const paddle::optional<DenseTensor> &scale,
+                        const paddle::optional<DenseTensor> &bias,
+                        float epsilon_f,
+                        DenseTensor *y,
+                        DenseTensor *saved_mean,
+                        DenseTensor *saved_variance) {
+  using AccT = typename phi::dtype::MPTypeTrait<T>::Type;
+  double epsilon = static_cast<double>(epsilon_f);
+  auto &x_dims = x.dims();
+  PADDLE_ENFORCE_GE(x_dims.size(),
+                    2,
+                    common::errors::InvalidArgument(
+                        "The `shape` in InstanceNormOp is invalid: "
+                        "the size of X's dimensions must be greater than "
+                        "or equal to 2. But received: "
+                        "the size of X's dimensions is [%d]",
+                        x_dims.size()));
+  PADDLE_ENFORCE_LE(x_dims.size(),
+                    5,
+                    common::errors::InvalidArgument(
+                        "The `shape` in InstanceNormOp is invalid: "
+                        "the size of X's dimensions must be smaller than "
+                        "or equal to 5. But received: "
+                        "the size of X's dimensions is [%d]",
+                        x_dims.size()));
+  int N, C, H, W, D;
+  funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
+  int NxC = N * C;
+  DenseTensor x_tmp;
+  x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D});
+  dev_ctx.template Alloc<T>(y);
+  phi::funcs::SetConstant<Context, BatchNormParamType<T>> functor;
+  phi::funcs::SetConstant<Context, T> functor_y;
+  if (x.numel() == 0) {
+    functor_y(dev_ctx, y, static_cast<T>(0));
+    if (saved_mean) {
+      dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
+      functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+    }
+    if (saved_variance) {
+      dev_ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
+      functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
+    }
+    return;
+  }
+
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t data_desc_;
+  miopenTensorDescriptor_t in_param_desc_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_));
+#else
+  cudnnTensorDescriptor_t data_desc_;
+  cudnnTensorDescriptor_t in_param_desc_;
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_));
+#endif
+  if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+    LOG(ERROR) << "Provided epsilon is smaller than "
+               << "CUDNN_BN_MIN_EPSILON. Setting it to "
+               << "CUDNN_BN_MIN_EPSILON instead.";
+  }
+  epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+
+  VLOG(3) << "Setting descriptors.";
+  std::vector<int> dims;
+  std::vector<int> strides;
+  dims = {1, NxC, H, W, D};
+  strides = {NxC * H * W * D, H * W * D, W * D, D, 1};
+
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      const_cast<int *>(dims.data()),
+      const_cast<int *>(strides.data())));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor(
+      in_param_desc_, data_desc_, miopenBNSpatial));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor(
+      data_desc_,
+      CudnnDataType<T>::type,
+      x_dims.size() > 3 ? x_dims.size() : 4,
+      dims.data(),
+      strides.data()));
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor(
+      in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL));
+#endif
+
+  const auto scale_ptr = scale.get_ptr();
+  const auto bias_ptr = bias.get_ptr();
+
+  DenseTensor scale_tmp;
+  scale_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&scale_tmp);
+  DenseTensor bias_tmp;
+  bias_tmp.Resize({NxC});
+  dev_ctx.template Alloc<AccT>(&bias_tmp);
+
+  const int n = x.numel();
+  const int block = 512;
+  int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
+  const int max_blocks = std::max(max_threads / block, 1);
+  const int grid = std::min((NxC + block - 1) / block, max_blocks);
+
+  phi::funcs::SetConstant<Context, AccT> set_constant;
+  if (scale_ptr) {
+    repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
+        scale_ptr->data<AccT>(), scale_tmp.data<AccT>(), N, C);
+  } else {
+    set_constant(dev_ctx, &scale_tmp, static_cast<AccT>(1));
+  }
+  if (bias_ptr) {
+    repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>(
+        bias_ptr->data<AccT>(), bias_tmp.data<AccT>(), N, C);
+  } else {
+    set_constant(dev_ctx, &bias_tmp, static_cast<AccT>(0));
+  }
+
+  auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace());
+  DenseTensor saved_mean_tmp, saved_variance_tmp;
+
+  if (saved_mean) {
+    dev_ctx.template Alloc<BatchNormParamType<T>>(saved_mean);
+    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+  } else {
+    saved_mean_tmp = phi::Full<BatchNormParamType<T>>(
+        dev_ctx, {NxC}, static_cast<BatchNormParamType<T>>(0));
+  }
+  if (saved_variance) {
+    dev_ctx.template Alloc<BatchNormParamType<T>>(saved_variance);
+    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
+  } else {
+    saved_variance_tmp = phi::Full<BatchNormParamType<T>>(
+        dev_ctx, {NxC}, static_cast<BatchNormParamType<T>>(0));
+  }
+  auto *saved_mean_data = saved_mean
+                              ? saved_mean->data<BatchNormParamType<T>>()
+                              : saved_mean_tmp.data<BatchNormParamType<T>>();
+  auto *saved_variance_data =
+      saved_variance ? saved_variance->data<BatchNormParamType<T>>()
+                     : saved_variance_tmp.data<BatchNormParamType<T>>();
+
+#ifdef PADDLE_WITH_HIP
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenBatchNormalizationForwardTraining(
+          handle,
+          miopenBNSpatial,
+          const_cast<void *>(
+              static_cast<const void *>(CudnnDataType<T>::kOne())),
+          const_cast<void *>(
+              static_cast<const void *>(CudnnDataType<T>::kZero())),
+          data_desc_,
+          static_cast<const void *>(x_tmp.template data<T>()),
+          data_desc_,
+          static_cast<void *>(y->template data<T>()),
+          in_param_desc_,
+          const_cast<void *>(static_cast<const void *>(
+              scale_tmp.template data<BatchNormParamType<T>>())),
+          const_cast<void *>(static_cast<const void *>(
+              bias_tmp.template data<BatchNormParamType<T>>())),
+          0,
+          nullptr,
+          nullptr,
+          epsilon,
+          static_cast<void *>(saved_mean_data),
+          static_cast<void *>(saved_variance_data)));
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_));
+#else
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnBatchNormalizationForwardTraining(
+          handle,
+          CUDNN_BATCHNORM_SPATIAL,
+          CudnnDataType<T>::kOne(),
+          CudnnDataType<T>::kZero(),
+          data_desc_,
+          x_tmp.template data<T>(),
+          data_desc_,
+          y->template data<T>(),
+          in_param_desc_,
+          scale_tmp.template data<BatchNormParamType<T>>(),
+          bias_tmp.template data<BatchNormParamType<T>>(),
+          0,
+          nullptr,
+          nullptr,
+          epsilon,
+          saved_mean_data,
+          saved_variance_data));
+
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_));
+#endif
+}
+
+}  // namespace phi
+
+PD_REGISTER_PLUGIN_KERNEL(instance_norm,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::InstanceNormKernel,
+                          float,
+                          double,
+                          phi::dtype::float16,
+                          phi::dtype::bfloat16) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT16 ||
+      kernel_key.dtype() == phi::DataType::BFLOAT16) {
+    kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32);
+    kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32);
+  }
+}
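A minimal smoke test for the instance_norm kernels registered above — a sketch only, not part of the patch. It assumes the plugin is built and installed, that the custom device is exposed to Paddle as "metax_gpu", and that float32 tolerances of 1e-5 are appropriate; all three are assumptions.

    import numpy as np
    import paddle

    x_np = np.random.rand(2, 3, 4, 4).astype("float32")

    def run(device):
        # "metax_gpu:0" is the assumed device string for this plugin.
        paddle.set_device(device)
        x = paddle.to_tensor(x_np)
        return paddle.nn.functional.instance_norm(x, eps=1e-5).numpy()

    # The custom-device kernel should match the CPU reference kernel.
    np.testing.assert_allclose(run("cpu"), run("metax_gpu:0"),
                               rtol=1e-5, atol=1e-5)
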
diff --git a/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu new file mode 100644 index 00000000000..f99621f8ab9 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(spectral_norm_grad, + metax_gpu, + ALL_LAYOUT, + phi::SpectralNormGradKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu new file mode 100644 index 00000000000..466937f993b --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/spectral_norm_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(spectral_norm, + metax_gpu, + ALL_LAYOUT, + phi::SpectralNormKernel, + float, + double) {} diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 184599263fa..682cee35caf 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1028,6 +1028,468 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +index 4099d8b506..baef2cd643 100644 +--- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h ++++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +@@ -14,7 +14,7 @@ + + #pragma once + +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/eigen/common.h" + #include "paddle/phi/kernels/funcs/math_function.h" + +diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py +index 4a5660ea0e..ca4e456e02 100644 +--- a/test/legacy_test/test_batch_norm_op.py ++++ b/test/legacy_test/test_batch_norm_op.py +@@ -22,7 +22,9 @@ from op_test import ( + _set_use_system_allocator, + convert_float_to_uint16, + convert_uint16_to_float, +- get_places, ++ get_devices, ++ is_custom_device, ++ get_device_place, + ) + + import paddle +@@ -189,6 +191,7 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): + + + def create_or_get_tensor(scope, var_name, var, place): ++ + tensor = scope.var(var_name).get_tensor() + if var is not None: + assert isinstance(var, np.ndarray) +@@ -321,7 +324,6 @@ class TestBatchNormOpInference(unittest.TestCase): + fuse_with_relu=self.fuse_with_relu, + epsilon=epsilon, + ) +- + batch_norm_op.run(scope, place) + + # When op is called without Executor then +@@ -454,7 +456,7 @@ class TestBatchNormOpInference(unittest.TestCase): + ) + + def test_check_output(self): +- for place in get_places(): ++ for place in get_devices(): + for data_format in ["NCHW", "NHWC"]: + self.check_with_place( + place, +@@ -488,8 +490,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): + + def test_check_output(self): + places = [] +- if core.is_compiled_with_cuda(): +- place = core.CUDAPlace(0) ++ if core.is_compiled_with_cuda() or is_custom_device(): ++ place = get_device_place() + if core.is_float16_supported(place): + places.append(place) + for place in places: +@@ -510,8 +512,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA or not support the bfloat16", + ) + class TestBF16BatchNormOpInference(TestBatchNormOpInference): +@@ -522,7 +524,7 @@ class TestBF16BatchNormOpInference(TestBatchNormOpInference): + self.init_kernel_type() + + def test_check_output(self): +- places = [core.CUDAPlace(0)] ++ places = [get_device_place()] + for place in places: + # for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: +@@ -562,7 +564,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): + + class TestDygraphBatchNormTrainableStats(unittest.TestCase): + def 
test_dygraph(self): +- for p in get_places(): ++ for p in get_devices(): + shape = [4, 10, 4, 4] + + def compute(x, is_test, trainable_statistics): +@@ -581,7 +583,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): + np.testing.assert_allclose(y1, y2, rtol=1e-05) + + def test_static(self): +- for p in get_places(): ++ for p in get_devices(): + exe = base.Executor(p) + shape = [4, 10, 16, 16] + +@@ -625,7 +627,7 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): + + class TestBatchNormAPI_ZeroSize(unittest.TestCase): + def setUp(self): +- self.places = get_places() ++ self.places = get_devices() + + def test_dygraph(self): + for place in self.places: +diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py +index c9853e9073..277eb26d00 100644 +--- a/test/legacy_test/test_conv3d_transpose_op.py ++++ b/test/legacy_test/test_conv3d_transpose_op.py +@@ -19,7 +19,7 @@ import numpy as np + import paddle + + paddle.enable_static() +-from op_test import OpTest, copy_bits_from_float_to_uint16 ++from op_test import OpTest, copy_bits_from_float_to_uint16, is_custom_device, get_devices, get_device_place + + from paddle.base import core + +@@ -150,7 +150,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): + + def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()), "core is not compiled with CUDA" + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): +@@ -158,20 +158,20 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): + self.dtype = np.float16 + + def test_check_output(self): +- if core.is_compiled_with_cuda(): +- place = core.CUDAPlace(0) ++ if ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()): ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad_no_input(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set={'Input'} +@@ -184,8 +184,8 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): + + def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): +@@ -194,11 +194,11 @@ def create_test_cudnn_bf16_class(parent): + self.dtype = np.uint16 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place(place) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + {'Input', 'Filter'}, +@@ -206,7 +206,7 @@ def create_test_cudnn_bf16_class(parent): + ) + + def test_check_grad_no_filter(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + 
self.check_grad_with_place( + place, + ['Input'], +@@ -215,7 +215,7 @@ def create_test_cudnn_bf16_class(parent): + ) + + def test_check_grad_no_input(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Filter'], +@@ -306,14 +306,14 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_output(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() + + def test_check_grad(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + {'Input', 'Filter'}, +@@ -327,7 +327,7 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_grad_no_filter(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Input'], +@@ -345,7 +345,7 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_grad_no_input(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Filter'], +@@ -471,7 +471,7 @@ class Test_NHWC(TestConv3DTransposeOp): + + # ------------ test_cudnn ------------ + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNN(TestConv3DTransposeOp): + def init_op_type(self): +@@ -481,7 +481,7 @@ class TestCUDNN(TestConv3DTransposeOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + def init_test_case(self): +@@ -500,7 +500,7 @@ class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + def init_test_case(self): +@@ -519,7 +519,7 @@ class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSAMEPad(TestWithSAMEPad): + def init_test_case(self): +@@ -538,7 +538,7 @@ class TestCUDNNWithSAMEPad(TestWithSAMEPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithVALIDPad(TestWithVALIDPad): + def init_test_case(self): +@@ -557,7 +557,7 @@ class TestCUDNNWithVALIDPad(TestWithVALIDPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithStride(TestWithStride): + def init_test_case(self): +@@ -576,7 +576,7 @@ class TestCUDNNWithStride(TestWithStride): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): +@@ 
-610,7 +610,7 @@ class TestCUDNNWithGroups(TestWithGroups): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNN_NHWC(TestConv3DTransposeOp): + def init_test_case(self): +@@ -630,7 +630,7 @@ class TestCUDNN_NHWC(TestConv3DTransposeOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + def init_test_case(self): +@@ -650,7 +650,7 @@ class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + def init_test_case(self): +@@ -670,7 +670,7 @@ class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithStride_NHWC(TestWithStride): + def init_test_case(self): +@@ -690,7 +690,7 @@ class TestCUDNNWithStride_NHWC(TestWithStride): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithGroups_NHWC(TestWithGroups): + def init_test_case(self): +diff --git a/test/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py +index 74eedb6a48..e4c6ecb98a 100644 +--- a/test/legacy_test/test_cross_entropy_op.py ++++ b/test/legacy_test/test_cross_entropy_op.py +@@ -20,6 +20,8 @@ from op_test import ( + get_places, + paddle_static_guard, + randomize_probability, ++ is_custom_device, ++ get_device_place, + ) + + import paddle +@@ -385,19 +387,19 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7): + # Add Fp16 test + def create_test_class(parent, cls_name): + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCrossEntropyFP16Op(parent): + def init_dtype_type(self): + return np.float16 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-1) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X'], 'Y', max_relative_error=0.9 +diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py +index 4c9944e877..e6ed5c0f8e 100644 +--- a/test/legacy_test/test_fmin_op.py ++++ b/test/legacy_test/test_fmin_op.py +@@ -15,8 +15,7 @@ + import unittest + + import numpy as np +-from op_test import OpTest, convert_float_to_uint16 +- ++from op_test import OpTest, convert_float_to_uint16, is_custom_device, get_devices, get_device_place + import paddle + from paddle.base import core + +@@ -28,8 +27,8 @@ class ApiFMinTest(unittest.TestCase): + + def setUp(self): + """setUp""" +- if core.is_compiled_with_cuda(): +- self.place = 
core.CUDAPlace(0) ++ if core.is_compiled_with_cuda() or is_custom_device(): ++ self.place = get_device_place() + else: + self.place = core.CPUPlace() + +@@ -259,8 +258,8 @@ class TestElementwiseFmin3Op(OpTest): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and not support the bfloat16", + ) + class TestFminBF16OP(OpTest): +@@ -281,13 +280,13 @@ class TestFminBF16OP(OpTest): + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place( + place, check_pir=True, check_symbol_infer=False + ) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True + ) +@@ -304,7 +303,7 @@ class TestElementwiseFminOpZeroSize1(TestElementwiseFminOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestElementwiseFminOp_Stride(OpTest): + no_need_check_grad = True +@@ -335,7 +334,7 @@ class TestElementwiseFminOp_Stride(OpTest): + self.val_dtype = np.float64 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_strided_forward = True + self.check_output( + place, +diff --git a/test/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py +index 80e5c2ec63..f1602a8b40 100644 +--- a/test/legacy_test/test_spectral_norm_op.py ++++ b/test/legacy_test/test_spectral_norm_op.py +@@ -112,6 +112,7 @@ class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad): + + class TestSpectralNormOp(TestSpectralNormOpNoGrad): + def test_check_grad_ignore_uv(self): ++ + self.check_grad( + ['Weight'], + 'Out', diff --git a/third_party/flagcx b/third_party/flagcx index 77495cd6a8..7e6c4cc3ca 160000 --- a/third_party/flagcx From a1530d2b4a9837dc9975fff03fac774a45ea702d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:41:45 +0800 Subject: [PATCH 02/58] [metax]change_cupti_and_fix_softmax (#7) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/kernels/funcs/softmax.cu | 168 ++++++ .../cross_entropy_grad_kernel_register.cu | 10 +- backends/metax_gpu/patch/paddle.patch | 511 ++---------------- .../metax_gpu/runtime/process_cupti_data.cc | 136 +++-- 4 files changed, 309 insertions(+), 516 deletions(-) create mode 100644 backends/metax_gpu/kernels/funcs/softmax.cu diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu new file mode 100644 index 00000000000..d738a53f43a --- /dev/null +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+#include <vector>
+
+#include "kernels/metax_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_dnn.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"
+
+namespace phi {
+namespace funcs {
+
+using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor;
+using DataLayout = phi::backends::gpu::DataLayout;
+template <typename T>
+using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+
+template <typename T, typename DeviceContext>
+void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
+    const DeviceContext& dev_ctx,
+    const phi::DenseTensor* X,
+    phi::DenseTensor* Y) {
+  // ------------------- cudnn descriptors ---------------------
+  ScopedTensorDescriptor xDesc;
+  ScopedTensorDescriptor yDesc;
+  std::vector<int> cudnn_tensor_dims = common::vectorize<int>(X->dims());
+  DataLayout layout = DataLayout::kNCHW;
+  if (cudnn_tensor_dims.size() == 5) {
+    layout = DataLayout::kNCDHW;
+  }
+  // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor,
+  // fill 1 at unused dims
+  if (cudnn_tensor_dims.size() <= 2) {
+    cudnn_tensor_dims.resize(4, 1);
+  }
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t cudnn_x_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_y_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      phi::dynload::miopenSoftmaxForward_V2(dev_ctx.cudnn_handle(),
+                                            CudnnDataType<T>::kOne(),
+                                            cudnn_x_desc,
+                                            X->data<T>(),
+                                            CudnnDataType<T>::kZero(),
+                                            cudnn_y_desc,
+                                            dev_ctx.template Alloc<T>(Y),
+                                            MIOPEN_SOFTMAX_ACCURATE,
+                                            MIOPEN_SOFTMAX_MODE_INSTANCE));
+#else
+  cudnnTensorDescriptor_t cudnn_x_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_y_desc =
+      xDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward(
+      GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+      CUDNN_SOFTMAX_ACCURATE,
+      CUDNN_SOFTMAX_MODE_INSTANCE,
+      CudnnDataType<T>::kOne(),
+      cudnn_x_desc,
+      X->data<T>(),
+      CudnnDataType<T>::kZero(),
+      cudnn_y_desc,
+      dev_ctx.template Alloc<T>(Y)));
+#endif
+}
+
+template <typename T, typename DeviceContext>
+void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
+    const DeviceContext& dev_ctx,
+    const phi::DenseTensor* Y,
+    const phi::DenseTensor* YGrad,
+    phi::DenseTensor* XGrad) {
+  // ------------------- cudnn descriptors ---------------------
+  ScopedTensorDescriptor yDesc;
+  ScopedTensorDescriptor dyDesc;
+  ScopedTensorDescriptor dxDesc;
+  std::vector<int> cudnn_tensor_dims = common::vectorize<int>(Y->dims());
+  DataLayout layout = DataLayout::kNCHW;
+  if (cudnn_tensor_dims.size() == 5) {
+    layout = DataLayout::kNCDHW;
+  }
+  // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor,
+  // fill 1 at unused dims
+  if (cudnn_tensor_dims.size() <= 2) {
+    cudnn_tensor_dims.resize(4, 1);
+  }
+#ifdef PADDLE_WITH_HIP
+  miopenTensorDescriptor_t cudnn_y_desc =
+      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_xgrad_desc =
+      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  miopenTensorDescriptor_t cudnn_ygrad_desc =
+      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxBackward_V2(
+      GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+      CudnnDataType<T>::kOne(),
+      cudnn_y_desc,
+      Y->data<T>(),
+      cudnn_ygrad_desc,
+      YGrad->data<T>(),
+      CudnnDataType<T>::kZero(),
+      cudnn_xgrad_desc,
+      dev_ctx.template Alloc<T>(XGrad),
+      MIOPEN_SOFTMAX_ACCURATE,
+      MIOPEN_SOFTMAX_MODE_INSTANCE));
+#else
+  cudnnTensorDescriptor_t cudnn_y_desc =
+      yDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_xgrad_desc =
+      dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  cudnnTensorDescriptor_t cudnn_ygrad_desc =
+      dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxBackward(
+      GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()),
+      CUDNN_SOFTMAX_ACCURATE,
+      CUDNN_SOFTMAX_MODE_INSTANCE,
+      CudnnDataType<T>::kOne(),
+      cudnn_y_desc,
+      Y->data<T>(),
+      cudnn_ygrad_desc,
+      YGrad->data<T>(),
+      CudnnDataType<T>::kZero(),
+      cudnn_xgrad_desc,
+      dev_ctx.template Alloc<T>(XGrad)));
+#endif
+}
+
+template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
+template class SoftmaxCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
+#if CUDNN_VERSION_MIN(8, 1, 0)
+template class SoftmaxCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
+#endif
+
+// MIOPEN do not support double
+#ifndef PADDLE_WITH_HIP
+template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
+#endif
+
+template class SoftmaxFunctor<phi::GPUContext, phi::dtype::float16>;
+template class SoftmaxFunctor<phi::GPUContext, phi::dtype::bfloat16>;
+template class SoftmaxFunctor<phi::GPUContext, float>;
+template class SoftmaxFunctor<phi::GPUContext, double>;
+template class SoftmaxGradFunctor<phi::GPUContext, float>;
+template class SoftmaxGradFunctor<phi::GPUContext, double>;
+template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::float16>;
+template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::bfloat16>;
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu
index b5de9dd8f3c..402f69a9958 100644
--- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu
@@ -149,11 +149,11 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx,
                                           int ignore_index,
                                           int axis,
                                           DenseTensor* logits_grad) {
-  PADDLE_ENFORCE_EQ(
-      dev_ctx.GetPlace().GetType(),
-      phi::AllocationType::GPU,
-      common::errors::Unavailable("softmax_with_cross_entropy operator's "
-                                  "CUDA kernel only runs on GPU device."));
+  // PADDLE_ENFORCE_EQ(
+  //     dev_ctx.GetPlace().GetType(),
+  //     phi::AllocationType::GPU,
+  //     common::errors::Unavailable("softmax_with_cross_entropy operator's "
+  //                                 "CUDA kernel only runs on GPU device."));
 
   const T* loss_grad_data = loss_grad.data<T>();
   DenseTensor* logit_grad = logits_grad;
diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch
index 682cee35caf..1935217baa0 100755
--- a/backends/metax_gpu/patch/paddle.patch
+++ b/backends/metax_gpu/patch/paddle.patch
@@ -419,7 +419,7 @@ index d69eb67d6f..1d8b6e9375 100644
  #include "paddle/phi/kernels/funcs/math_function.h"
 
 diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
-index bdfd7313af..546bd07d5e 100644
+index cb35feee32..64f5bd24ac 100644
 --- a/paddle/phi/kernels/funcs/fc_functor.cu
 +++ b/paddle/phi/kernels/funcs/fc_functor.cu
 @@ -16,12 +16,12 @@ limitations under the License. */
@@ -438,7 +438,7 @@ index bdfd7313af..546bd07d5e 100644
  #include "paddle/phi/kernels/matmul_kernel.h"
 
 diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu
-index 1a9a9cfb85..08ebe4b8af 100644
+index e101224970..a52eb6096f 100644
 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu
 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu
 @@ -15,11 +15,13 @@ limitations under the License.
*/ @@ -470,10 +470,10 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index dc7935423c..84896c2214 100644 +index e30d440ff3..3c74792690 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h -@@ -32,11 +32,11 @@ limitations under the License. */ +@@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" @@ -487,7 +487,7 @@ index dc7935423c..84896c2214 100644 #endif #define MAX_NUM_THREADS 1024 -@@ -200,21 +200,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], +@@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { if (topk[k] < p) { @@ -549,7 +549,7 @@ index dc7935423c..84896c2214 100644 } template -@@ -243,24 +278,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], +@@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template __device__ __forceinline__ void GetTopK(Pair topk[], const T* src, @@ -581,7 +581,7 @@ index dc7935423c..84896c2214 100644 } } } -@@ -287,7 +322,9 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], +@@ -283,7 +318,9 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], } else { for (int k = 0; k < MaxLength; k++) { if (k < MaxLength - (*beam)) { @@ -592,7 +592,7 @@ index dc7935423c..84896c2214 100644 } else { if (largest) { topk[k].set(-static_cast(INFINITY), -1); -@@ -297,8 +334,10 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], +@@ -293,8 +330,10 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], } } if (!(*is_empty)) { @@ -604,7 +604,7 @@ index dc7935423c..84896c2214 100644 } } -@@ -359,6 +398,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], +@@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } __syncthreads(); @@ -613,7 +613,7 @@ index dc7935423c..84896c2214 100644 if (largest) { input_now = (tid < BlockSize / WARP_SIZE) ? shared_max[lane] -@@ -373,27 +414,32 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], +@@ -369,27 +410,32 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], if (lane == 0) shared_max[0] = input_now; } __syncthreads(); @@ -652,7 +652,7 @@ index dc7935423c..84896c2214 100644 break; } } -@@ -482,16 +528,17 @@ struct Bitfield { +@@ -478,16 +524,17 @@ struct Bitfield { int pos, int len) { unsigned int ret; @@ -674,7 +674,7 @@ index dc7935423c..84896c2214 100644 return ret; } }; -@@ -502,7 +549,9 @@ struct Bitfield { +@@ -498,7 +545,9 @@ struct Bitfield { int pos, int len) { uint64_t ret; @@ -685,7 +685,7 @@ index dc7935423c..84896c2214 100644 return ret; } -@@ -511,9 +560,9 @@ struct Bitfield { +@@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { uint64_t ret; @@ -698,7 +698,7 @@ index dc7935423c..84896c2214 100644 return ret; } }; -@@ -631,14 +680,20 @@ struct RadixTypeConfig { +@@ -627,14 +676,20 @@ struct RadixTypeConfig { /*---------------------------Helper Functions------------------*/ __device__ __forceinline__ int GetLaneId() { int lane_id; @@ -723,7 +723,7 @@ index dc7935423c..84896c2214 100644 } template -@@ -885,7 +940,8 @@ __global__ void GatherKthValue(const T* input, +@@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, // 1. 
Find the k-th value T kth_value = static_cast(0); @@ -733,13 +733,13 @@ index dc7935423c..84896c2214 100644 cur_input, k, num_cols, shared_mem, &kth_value); __shared__ int64_t block_min_idx; -@@ -1318,3 +1374,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, +@@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } } // namespace funcs } // namespace phi +// diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -index 45a29b4cff..8449e3d309 100644 +index 32db61532f..0220316bc3 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ @@ -752,7 +752,7 @@ index 45a29b4cff..8449e3d309 100644 #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h -index 7d05bcb654..c79cdadabc 100644 +index 9d4bb18d55..ea42cc10a9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h @@ -638,9 +638,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( @@ -767,11 +767,11 @@ index 7d05bcb654..c79cdadabc 100644 } } diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -index ad04265bd6..59481d0e6a 100644 +index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/common/bfloat16.h" +@@ -14,7 +14,7 @@ + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -780,11 +780,11 @@ index ad04265bd6..59481d0e6a 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -index 148d72ca9c..5da3461ebf 100644 +index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/common/bfloat16.h" +@@ -14,7 +14,7 @@ + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -793,7 +793,7 @@ index 148d72ca9c..5da3461ebf 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h -index b16553589a..90080c375d 100644 +index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -29,8 +29,8 @@ namespace cub = hipcub; @@ -833,7 +833,7 @@ index 29fa252e96..4ae72b0935 100644 } diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu -index ee71a2b452..69130ab955 100644 +index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -17,7 +17,7 @@ @@ -846,7 +846,7 @@ index ee71a2b452..69130ab955 100644 namespace phi { diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu -index 
00a2f1e210..1267cf7ec2 100644 +index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -17,7 +17,7 @@ @@ -872,7 +872,7 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -index 14b24dd3ed..e54a342c98 100644 +index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -21,7 +21,7 @@ limitations under the License. */ @@ -885,7 +885,7 @@ index 14b24dd3ed..e54a342c98 100644 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -index 06fff0dd58..973049105f 100644 +index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ @@ -1041,461 +1041,12 @@ index 4099d8b506..baef2cd643 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" -diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py -index 4a5660ea0e..ca4e456e02 100644 ---- a/test/legacy_test/test_batch_norm_op.py -+++ b/test/legacy_test/test_batch_norm_op.py -@@ -22,7 +22,9 @@ from op_test import ( - _set_use_system_allocator, - convert_float_to_uint16, - convert_uint16_to_float, -- get_places, -+ get_devices, -+ is_custom_device, -+ get_device_place, - ) - - import paddle -@@ -189,6 +191,7 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): - - - def create_or_get_tensor(scope, var_name, var, place): -+ - tensor = scope.var(var_name).get_tensor() - if var is not None: - assert isinstance(var, np.ndarray) -@@ -321,7 +324,6 @@ class TestBatchNormOpInference(unittest.TestCase): - fuse_with_relu=self.fuse_with_relu, - epsilon=epsilon, - ) -- - batch_norm_op.run(scope, place) - - # When op is called without Executor then -@@ -454,7 +456,7 @@ class TestBatchNormOpInference(unittest.TestCase): - ) - - def test_check_output(self): -- for place in get_places(): -+ for place in get_devices(): - for data_format in ["NCHW", "NHWC"]: - self.check_with_place( - place, -@@ -488,8 +490,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): - - def test_check_output(self): - places = [] -- if core.is_compiled_with_cuda(): -- place = core.CUDAPlace(0) -+ if core.is_compiled_with_cuda() or is_custom_device(): -+ place = get_device_place() - if core.is_float16_supported(place): - places.append(place) - for place in places: -@@ -510,8 +512,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA or not support the bfloat16", - ) - class TestBF16BatchNormOpInference(TestBatchNormOpInference): -@@ -522,7 +524,7 @@ class TestBF16BatchNormOpInference(TestBatchNormOpInference): - self.init_kernel_type() - - def test_check_output(self): -- places = [core.CUDAPlace(0)] -+ places = 
[get_device_place()] - for place in places: - # for data_format in ["NCHW", "NHWC"]: - for data_format in ["NCHW"]: -@@ -562,7 +564,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): - - class TestDygraphBatchNormTrainableStats(unittest.TestCase): - def test_dygraph(self): -- for p in get_places(): -+ for p in get_devices(): - shape = [4, 10, 4, 4] - - def compute(x, is_test, trainable_statistics): -@@ -581,7 +583,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): - np.testing.assert_allclose(y1, y2, rtol=1e-05) - - def test_static(self): -- for p in get_places(): -+ for p in get_devices(): - exe = base.Executor(p) - shape = [4, 10, 16, 16] - -@@ -625,7 +627,7 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): - - class TestBatchNormAPI_ZeroSize(unittest.TestCase): - def setUp(self): -- self.places = get_places() -+ self.places = get_devices() - - def test_dygraph(self): - for place in self.places: -diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py -index c9853e9073..277eb26d00 100644 ---- a/test/legacy_test/test_conv3d_transpose_op.py -+++ b/test/legacy_test/test_conv3d_transpose_op.py -@@ -19,7 +19,7 @@ import numpy as np - import paddle - - paddle.enable_static() --from op_test import OpTest, copy_bits_from_float_to_uint16 -+from op_test import OpTest, copy_bits_from_float_to_uint16, is_custom_device, get_devices, get_device_place - - from paddle.base import core - -@@ -150,7 +150,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): - - def create_test_cudnn_fp16_class(parent, grad_check=True): - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()), "core is not compiled with CUDA" - ) - class TestConv3DTransposeCUDNNFP16(parent): - def init_kernel_type(self): -@@ -158,20 +158,20 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): - self.dtype = np.float16 - - def test_check_output(self): -- if core.is_compiled_with_cuda(): -- place = core.CUDAPlace(0) -+ if ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()): -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-2) - - def test_check_grad_no_filter(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place) and grad_check: - self.check_grad_with_place( - place, ['Input'], 'Output', no_grad_set={'Filter'} - ) - - def test_check_grad_no_input(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place) and grad_check: - self.check_grad_with_place( - place, ['Filter'], 'Output', no_grad_set={'Input'} -@@ -184,8 +184,8 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): - - def create_test_cudnn_bf16_class(parent): - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA and do not support bfloat16", - ) - class TestConv3DTransposeCUDNNBF16(parent): -@@ -194,11 +194,11 @@ def create_test_cudnn_bf16_class(parent): - self.dtype = np.uint16 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place(place) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ 
place = get_device_place() - self.check_grad_with_place( - place, - {'Input', 'Filter'}, -@@ -206,7 +206,7 @@ def create_test_cudnn_bf16_class(parent): - ) - - def test_check_grad_no_filter(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Input'], -@@ -215,7 +215,7 @@ def create_test_cudnn_bf16_class(parent): - ) - - def test_check_grad_no_input(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Filter'], -@@ -306,14 +306,14 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_output(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place(place, atol=1e-5) - else: - self.check_output() - - def test_check_grad(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - {'Input', 'Filter'}, -@@ -327,7 +327,7 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_grad_no_filter(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Input'], -@@ -345,7 +345,7 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_grad_no_input(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Filter'], -@@ -471,7 +471,7 @@ class Test_NHWC(TestConv3DTransposeOp): - - # ------------ test_cudnn ------------ - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNN(TestConv3DTransposeOp): - def init_op_type(self): -@@ -481,7 +481,7 @@ class TestCUDNN(TestConv3DTransposeOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): - def init_test_case(self): -@@ -500,7 +500,7 @@ class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): - def init_test_case(self): -@@ -519,7 +519,7 @@ class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSAMEPad(TestWithSAMEPad): - def init_test_case(self): -@@ -538,7 +538,7 @@ class TestCUDNNWithSAMEPad(TestWithSAMEPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithVALIDPad(TestWithVALIDPad): - def init_test_case(self): -@@ -557,7 +557,7 @@ class TestCUDNNWithVALIDPad(TestWithVALIDPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithStride(TestWithStride): - def init_test_case(self): -@@ -576,7 +576,7 @@ class TestCUDNNWithStride(TestWithStride): - - - 
@unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithGroups(TestWithGroups): - def init_test_case(self): -@@ -610,7 +610,7 @@ class TestCUDNNWithGroups(TestWithGroups): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNN_NHWC(TestConv3DTransposeOp): - def init_test_case(self): -@@ -630,7 +630,7 @@ class TestCUDNN_NHWC(TestConv3DTransposeOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): - def init_test_case(self): -@@ -650,7 +650,7 @@ class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): - def init_test_case(self): -@@ -670,7 +670,7 @@ class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithStride_NHWC(TestWithStride): - def init_test_case(self): -@@ -690,7 +690,7 @@ class TestCUDNNWithStride_NHWC(TestWithStride): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithGroups_NHWC(TestWithGroups): - def init_test_case(self): -diff --git a/test/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py -index 74eedb6a48..e4c6ecb98a 100644 ---- a/test/legacy_test/test_cross_entropy_op.py -+++ b/test/legacy_test/test_cross_entropy_op.py -@@ -20,6 +20,8 @@ from op_test import ( - get_places, - paddle_static_guard, - randomize_probability, -+ is_custom_device, -+ get_device_place, - ) - - import paddle -@@ -385,19 +387,19 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7): - # Add Fp16 test - def create_test_class(parent, cls_name): - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCrossEntropyFP16Op(parent): - def init_dtype_type(self): - return np.float16 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-1) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_grad_with_place( - place, ['X'], 'Y', max_relative_error=0.9 -diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py -index 4c9944e877..e6ed5c0f8e 100644 ---- a/test/legacy_test/test_fmin_op.py -+++ b/test/legacy_test/test_fmin_op.py -@@ -15,8 +15,7 @@ - import unittest - - import numpy as np --from op_test import OpTest, convert_float_to_uint16 -- -+from op_test import OpTest, 
convert_float_to_uint16, is_custom_device, get_devices, get_device_place - import paddle - from paddle.base import core - -@@ -28,8 +27,8 @@ class ApiFMinTest(unittest.TestCase): - - def setUp(self): - """setUp""" -- if core.is_compiled_with_cuda(): -- self.place = core.CUDAPlace(0) -+ if core.is_compiled_with_cuda() or is_custom_device(): -+ self.place = get_device_place() - else: - self.place = core.CPUPlace() - -@@ -259,8 +258,8 @@ class TestElementwiseFmin3Op(OpTest): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA and not support the bfloat16", - ) - class TestFminBF16OP(OpTest): -@@ -281,13 +280,13 @@ class TestFminBF16OP(OpTest): - self.outputs = {'Out': convert_float_to_uint16(out)} - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place( - place, check_pir=True, check_symbol_infer=False - ) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True - ) -@@ -304,7 +303,7 @@ class TestElementwiseFminOpZeroSize1(TestElementwiseFminOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestElementwiseFminOp_Stride(OpTest): - no_need_check_grad = True -@@ -335,7 +334,7 @@ class TestElementwiseFminOp_Stride(OpTest): - self.val_dtype = np.float64 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_strided_forward = True - self.check_output( - place, -diff --git a/test/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py -index 80e5c2ec63..f1602a8b40 100644 ---- a/test/legacy_test/test_spectral_norm_op.py -+++ b/test/legacy_test/test_spectral_norm_op.py -@@ -112,6 +112,7 @@ class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad): - - class TestSpectralNormOp(TestSpectralNormOpNoGrad): - def test_check_grad_ignore_uv(self): -+ - self.check_grad( - ['Weight'], - 'Out', diff --git a/third_party/flagcx b/third_party/flagcx -index 77495cd6a8..7e6c4cc3ca 160000 +index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx +++ b/third_party/flagcx @@ -1 +1 @@ --Subproject commit 77495cd6a84b1c8f88dd8f6f99e63ef3c84c766f +-Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f +Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa diff --git a/third_party/flashattn b/third_party/flashattn index 581e48aa69..749aca3807 160000 diff --git a/backends/metax_gpu/runtime/process_cupti_data.cc b/backends/metax_gpu/runtime/process_cupti_data.cc index 65011e3f58d..94caca5d8cb 100755 --- a/backends/metax_gpu/runtime/process_cupti_data.cc +++ b/backends/metax_gpu/runtime/process_cupti_data.cc @@ -226,52 +226,126 @@ class CuptiRuntimeCbidStr { CuptiRuntimeCbidStr::CuptiRuntimeCbidStr() { #define REGISTER_RUNTIME_CBID_STR(cbid) \ cbid_str_[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid - REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); - REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020); - REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); - REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); - REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); 
REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020); - REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); - REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020); + REGISTER_RUNTIME_CBID_STR(cudaChooseDevice_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetValidDevices_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDeviceFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocPitch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020); REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020); - REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); - REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaHostGetFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemGetInfo_v3020); REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2DToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToSymbol_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyFromSymbol_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToSymbolAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyFromSymbolAsync_v3020); REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020); - REGISTER_RUNTIME_CBID_STR( - cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); - REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); - REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); - REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset2DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetSymbolAddress_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetSymbolSize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTexture2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTextureToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020); - REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); - 
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); - REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaStreamQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreate_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventElapsedTime_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc3DArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset3DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DAsync_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020); - REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); - REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); - REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaPointerGetAttributes_v4000); + REGISTER_RUNTIME_CBID_STR(cudaHostRegister_v4000); + REGISTER_RUNTIME_CBID_STR(cudaHostUnregister_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceCanAccessPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceEnablePeerAccess_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceDisablePeerAccess_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyPeerAsync_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DPeerAsync_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceReset_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetLimit_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetLimit_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaProfilerInitialize_v4000); + REGISTER_RUNTIME_CBID_STR(cudaProfilerStart_v4000); + REGISTER_RUNTIME_CBID_STR(cudaProfilerStop_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetByPCIBusId_v4010); REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcGetEventHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcOpenEventHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcGetMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcOpenMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcCloseMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaStreamAddCallback_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamGetPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamGetFlags_v5050); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); + REGISTER_RUNTIME_CBID_STR(cudaMallocManaged_v6000); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000); + REGISTER_RUNTIME_CBID_STR(cudaStreamAttachMemAsync_v6000); + 
REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050); + REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); + REGISTER_RUNTIME_CBID_STR(cudaGetDeviceFlags_v7000); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); + REGISTER_RUNTIME_CBID_STR(cudaMemRangeGetAttribute_v8000); + REGISTER_RUNTIME_CBID_STR(cudaMemRangeGetAttributes_v8000); #if CUDA_VERSION >= 9000 REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetAttribute_v9000); + REGISTER_RUNTIME_CBID_STR(cudaGraphLaunch_v10000); + REGISTER_RUNTIME_CBID_STR(cudaStreamSetAttribute_v11000); + REGISTER_RUNTIME_CBID_STR(cudaMallocAsync_v11020); + REGISTER_RUNTIME_CBID_STR(cudaFreeAsync_v11020); #endif #undef REGISTER_RUNTIME_CBID_STR } From 352f02e869be9bccd1c9d154d2c70151626a43ea Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 9 Sep 2025 16:45:38 +0800 Subject: [PATCH 03/58] [Metax] fix dgc & mklml compile product path problem (#8) --- backends/metax_gpu/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 5022e1bdde3..beb442eadad 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -26,6 +26,10 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") set(WITH_MKLML ON) +set(THIRD_PARTY_PATH + "${PADDLE_SOURCE_DIR}/build/third_party" + CACHE PATH "Third party libraries directory.") + include(paddle) include(version) include(generic) @@ -52,10 +56,6 @@ option(ON_INFER "compile with inference c++ lib" OFF) option(WITH_GPU "Compile PaddlePaddle with METAX_GPU" ON) option(WITH_CUSTOM_DEVICE "Compile PaddlePaddle with CUSTOM_DEVICE" ON) -set(THIRD_PARTY_PATH - "${PADDLE_SOURCE_DIR}/build/third_party" - CACHE PATH "Third party libraries directory.") - macro(UNSET_VAR VAR_NAME) unset(${VAR_NAME} CACHE) unset(${VAR_NAME}) From 8f13faed41890653f7f57328674c672c77dcfa4c Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:18:33 +0800 Subject: [PATCH 04/58] [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test (#9) * [Metax] fix dgc & mklml compile product path problem * [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test * [Metax] add mixed_vector fix & update change patch --- backends/metax_gpu/CMakeLists.txt | 2 +- backends/metax_gpu/build.sh | 26 +- backends/metax_gpu/build_in_metax.sh | 17 +- backends/metax_gpu/change_patch.sh | 9 +- .../cuda_kernels/accuracy_kernel_register.cu | 141 ++- backends/metax_gpu/patch/tmp/mixed_vector.cc | 111 ++ backends/metax_gpu/patch/tmp/mixed_vector.h | 413 ++++++++ .../tests/unittest/test_accuracy_op_metax.py | 206 ++++ .../tests/unittest/test_gather_op_metax.py | 983 +++++++++++++++--- 9 files changed, 1740 insertions(+), 168 deletions(-) create mode 100644 backends/metax_gpu/patch/tmp/mixed_vector.cc create mode 100644 backends/metax_gpu/patch/tmp/mixed_vector.h create mode 100644 backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index beb442eadad..4567723123c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -128,7 +128,7 @@ file( 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/arange_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/adadelta_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_check_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/allclose_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/all_gather_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/all_reduce_kernel.cu diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 0350a32521f..dd0ab3aab90 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,25 +31,7 @@ git submodule sync --recursive && git submodule update --init --recursive # apply patch - -rm -r ../../Paddle/third_party/eigen3 - - -cd patch - -unzip mcEigen_3.4.0_paddle_final.zip - -mv mcEigen_3.4.0_paddle_final eigen3 - -cd .. - -cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 - -cd ../../Paddle/ - -git apply --verbose ../backends/metax_gpu/patch/paddle.patch - -cd - +bash change_patch.sh export MACA_PATH=/opt/maca diff --git a/backends/metax_gpu/build_in_metax.sh b/backends/metax_gpu/build_in_metax.sh index b1f9d63d85c..67ec1a2c31c 100644 --- a/backends/metax_gpu/build_in_metax.sh +++ b/backends/metax_gpu/build_in_metax.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,16 +22,7 @@ git submodule sync --recursive && git submodule update --init --recursive # apply patch - -rm -r ../../Paddle/third_party/eigen3 -cd patch -unzip mcEigen_3.4.0_paddle_final.zip -mv mcEigen_3.4.0_paddle_final eigen3 -cd .. -cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 -cd ../../Paddle/ -git apply --verbose ../backends/metax_gpu/patch/paddle.patch -cd - +bash change_patch.sh export MACA_PATH=/opt/maca export CUDA_PATH=/workspace/cuda-11.7/ diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 58bda1aacd4..833ae00f6bd 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,11 +16,12 @@
 # limitations under the License.
 
 rm -r ../../Paddle/third_party/eigen3
-cd patch 
+cd patch
 unzip mcEigen_3.4.0_paddle_final.zip
 mv mcEigen_3.4.0_paddle_final eigen3
 cd ..
 cp -r patch/eigen3/ ../../Paddle/third_party/eigen3
+cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core
 cd ../../Paddle/
 git apply --verbose ../backends/metax_gpu/patch/paddle.patch
 cd -
diff --git a/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu
index 1b26e5711ac..0d61c79d0fa 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu
@@ -1,7 +1,7 @@
 // 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights
 // Reserved.
 
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,19 +14,150 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+#include <thrust/execution_policy.h>
+#include <thrust/reduce.h>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/gpu/gpu_primitives.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/accuracy_kernel.h"
 
+namespace phi {
+using phi::PADDLE_CUDA_NUM_THREADS;
+
+template <int BlockSize, typename T>
+__global__ void AccuracyCudaKernel(const int N,
+                                   const int D,
+                                   const int64_t* Xdata,
+                                   const int64_t* labeldata,
+                                   int* correct_data,
+                                   T* accuracy,
+                                   int* total_data) {
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  int count = 0;
+  __shared__ int total[BlockSize];
+
+  // support only 1 block
+  for (int i = threadIdx.x; i < (N); i += BlockSize) {
+    for (int j = 0; j < D; ++j) {
+      if (Xdata[i * D + j] == labeldata[i]) {
+        ++count;
+        break;
+      }
+    }
+  }
+  total[threadIdx.x] = count;
+  __syncthreads();
+
+  // reduce the count with init value 0, and output accuracy.
+  // #ifdef PADDLE_WITH_CUDA
+  //   int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
+  // #else
+  // HIP thrust::reduce not support __device__
+  for (int s = BlockSize / 2; s > 0; s >>= 1) {
+    if (threadIdx.x < s) {
+      total[threadIdx.x] += total[threadIdx.x + s];
+    }
+    __syncthreads();
+  }
+  int result = total[0];
+  // #endif
+  if (threadIdx.x == 0) {
+    *correct_data = result;
+    *accuracy = static_cast<T>(static_cast<MT>(result) / static_cast<MT>(N));
+    *total_data = N;
+  }
+}
+
+template <typename T, typename Context>
+void AccuracyKernel(const Context& dev_ctx,
+                    const DenseTensor& inference,
+                    const DenseTensor& indices,
+                    const DenseTensor& label,
+                    DenseTensor* accuracy,
+                    DenseTensor* correct,
+                    DenseTensor* total) {
+  // FIXME(typhoonzero): only support indices currently
+  // if add support for output values, how to detect the data type?
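+  // Shape/type expectations, per the checks below: "Indices" holds the
+  // top-k predicted class ids per sample with shape [num_samples, class_dim],
+  // "Label" holds one ground-truth id per sample, and both are int64.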
+  const int64_t* indices_data = indices.data<int64_t>();
+  const int64_t* label_data = label.data<int64_t>();
+
+  PADDLE_ENFORCE_EQ(
+      inference.dims().size(),
+      2,
+      common::errors::InvalidArgument(
+          "Rank(Input) of AccuracyOp must be 2, with shape "
+          "[sample_number, class_dim], But received rank(Input) is %d",
+          inference.dims().size()));
+
+  int* correct_data = dev_ctx.template Alloc<int>(correct);
+  int* total_data = dev_ctx.template Alloc<int>(total);
+  T* accuracy_data = dev_ctx.template Alloc<T>(accuracy);
+
+  int num_samples = static_cast<int>(inference.dims()[0]);
+  size_t infer_width = inference.dims()[1];
+  auto stream = dev_ctx.stream();
+  phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream);
+
+  PADDLE_ENFORCE_GT(label.dims().size(),
+                    0,
+                    common::errors::InvalidArgument(
+                        "Rank(Label) of AccuracyOp must greater than 0, "
+                        "But received rank(Label) is %d",
+                        label.dims().size()));
+
+  PADDLE_ENFORCE_GE(label.dims()[0],
+                    inference.dims()[0],
+                    common::errors::InvalidArgument(
+                        "num_samples(%d) of Label should less than "
+                        "or equal to num_samples(%d) of Input",
+                        label.dims()[0],
+                        num_samples));
+
+  if (num_samples == 0) {
+    return;
+  }
+
+  AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS, T>
+      <<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(num_samples,
+                                                  infer_width,
+                                                  indices_data,
+                                                  label_data,
+                                                  correct_data,
+                                                  accuracy_data,
+                                                  total_data);
+}
+}  // namespace phi
+
+// FIXME(typhoonzero): types of T is for inference data.
+// label data is always int64
+PD_REGISTER_KERNEL(accuracy,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::AccuracyKernel,
+                   phi::float16,
+                   phi::bfloat16,
+                   float,
+                   double) {
+  kernel->InputAt(1).SetDataType(phi::DataType::INT64);
+  kernel->InputAt(2).SetDataType(phi::DataType::INT64);
+  kernel->OutputAt(1).SetDataType(phi::DataType::INT32);
+  kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
+}
+
 PD_CUSTOM_KERNEL_REGISTER(accuracy,
                           metax_gpu,
                           ALL_LAYOUT,
                           phi::AccuracyKernel,
-                          phi::dtype::float16,
-                          phi::dtype::bfloat16,
+                          phi::float16,
+                          phi::bfloat16,
                           float,
                           double) {
-  kernel->InputAt(1).SetDataType(phi::DataType::INT32);
-  kernel->InputAt(2).SetDataType(phi::DataType::INT32);
+  kernel->InputAt(1).SetDataType(phi::DataType::INT64);
+  kernel->InputAt(2).SetDataType(phi::DataType::INT64);
   kernel->OutputAt(1).SetDataType(phi::DataType::INT32);
   kernel->OutputAt(2).SetDataType(phi::DataType::INT32);
 }
diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.cc b/backends/metax_gpu/patch/tmp/mixed_vector.cc
new file mode 100644
index 00000000000..a90113c7977
--- /dev/null
+++ b/backends/metax_gpu/patch/tmp/mixed_vector.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/mixed_vector.h"
+
+#include <algorithm>
+#include <initializer_list>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T>
+void CopyToCPUHelper(std::vector<T> *cpu_,
+                     phi::Allocator::AllocationPtr *gpu_,
+                     size_t *gpu_memory_size_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // COPY GPU Data To CPU
+  auto *dev_ctx = static_cast<phi::GPUContext *>(
+      phi::DeviceContextPool::Instance().Get((*gpu_)->place()));
+  auto stream = dev_ctx->stream();
+  void *src = (*gpu_)->ptr();
+  void *dst = cpu_->data();
+  auto place = dev_ctx->GetPlace();
+  if (place.GetType() == phi::AllocationType::GPU) {
+    memory_utils::Copy(phi::CPUPlace(),
+                       dst,
+                       OptionalCUDAPlace(*gpu_).get(),
+                       src,
+                       *gpu_memory_size_,
+                       stream);
+  } else {
+    memory_utils::Copy(phi::CPUPlace(),
+                       dst,
+                       OptionalCustomPlace(*gpu_).get(),
+                       src,
+                       *gpu_memory_size_,
+                       stream);
+  }
+  dev_ctx->Wait();
+#endif
+}
+
+template <typename T>
+void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
+                             phi::Allocator::AllocationPtr *gpu_,
+                             size_t *gpu_memory_size_,
+                             const phi::Place &place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void *src = cpu_->data();
+  *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
+  (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_);
+  void *dst = (*gpu_)->ptr();
+  auto *dev_ctx = static_cast<phi::GPUContext *>(
+      phi::DeviceContextPool::Instance().Get(place));
+  auto stream = dev_ctx->stream();
+  if (place.GetType() == phi::AllocationType::GPU) {
+    memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(),
+                       dst,
+                       phi::CPUPlace(),
+                       src,
+                       *gpu_memory_size_,
+                       stream);
+  } else {
+    memory_utils::Copy(OptionalCustomPlace(*gpu_).get(),
+                       dst,
+                       phi::CPUPlace(),
+                       src,
+                       *gpu_memory_size_,
+                       stream);
+  }
+  dev_ctx->Wait();
+#endif
+}
+
+#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__)                                 \
+  template <>                                                                 \
+  void MixVector<__TYPE__>::VectorData::CopyToCPU() const {                   \
+    CopyToCPUHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_);                \
+  }                                                                           \
+                                                                              \
+  template <>                                                                 \
+  void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA(                    \
+      const phi::Place &place) const {                                        \
+    CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \
+  }
+
+INSTANTIATE_VECTOR_FOR_TYPE(size_t)
+INSTANTIATE_VECTOR_FOR_TYPE(int)
+INSTANTIATE_VECTOR_FOR_TYPE(int64_t)
+
+};  // namespace phi
diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.h b/backends/metax_gpu/patch/tmp/mixed_vector.h
new file mode 100644
index 00000000000..e7cf1e626c9
--- /dev/null
+++ b/backends/metax_gpu/patch/tmp/mixed_vector.h
@@ -0,0 +1,413 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <initializer_list>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/common/errors.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/allocator.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T>
+using Vector = std::vector<T>;
+
+inline paddle::optional<phi::Place> OptionalCUDAPlace(
+    const phi::Allocator::AllocationPtr &gpu_) {
+  return gpu_ == nullptr ? paddle::none
+                         : paddle::optional<phi::Place>(gpu_->place());
+}
+
+inline paddle::optional<phi::Place> OptionalCustomPlace(
+    const phi::Allocator::AllocationPtr &gpu_) {
+  return gpu_ == nullptr ? paddle::none
+                         : paddle::optional<phi::Place>(gpu_->place());
+}
+
+// Vector implements the std::vector interface, and can get Data or
+// MutableData from any place. The data will be synced implicitly inside.
+template <typename T>
+class MixVector {
+ public:
+  using value_type = T;
+  using iterator = typename std::vector<T>::iterator;
+  using const_iterator = typename std::vector<T>::const_iterator;
+
+ private:
+  // The actual class to implement vector logic
+  class VectorData {
+   public:
+    template <typename U>
+    explicit VectorData(std::vector<U> *dat) : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
+
+    VectorData(const VectorData &o) = delete;
+
+    VectorData &operator=(const VectorData &o) = delete;
+
+    T &operator[](size_t i) {
+      MutableCPU();
+      return (*cpu_)[i];
+    }
+
+    const T &operator[](size_t i) const {
+      ImmutableCPU();
+      return (*cpu_)[i];
+    }
+
+    size_t size() const { return (*cpu_).size(); }
+
+    iterator begin() {
+      MutableCPU();
+      return (*cpu_).begin();
+    }
+
+    iterator end() {
+      MutableCPU();
+      return (*cpu_).end();
+    }
+
+    T &front() {
+      MutableCPU();
+      return (*cpu_).front();
+    }
+
+    T &back() {
+      MutableCPU();
+      return (*cpu_).back();
+    }
+
+    const_iterator begin() const {
+      ImmutableCPU();
+      return (*cpu_).begin();
+    }
+
+    const_iterator end() const {
+      ImmutableCPU();
+      return (*cpu_).end();
+    }
+
+    const T &back() const {
+      ImmutableCPU();
+      return (*cpu_).back();
+    }
+
+    T *data() { return cpu_->data(); }
+
+    const T *data() const { return cpu_->data(); }
+
+    const T &front() const {
+      ImmutableCPU();
+      return (*cpu_).front();
+    }
+
+    // assign this from iterator.
+    // NOTE: the iterator must support `end-begin`
+    template <typename Iter>
+    void assign(Iter begin, Iter end) {
+      MutableCPU();
+      (*cpu_).assign(begin, end);
+    }
+
+    // push_back. If the previous capacity is not enough, the memory will
+    // double.
+    void push_back(T elem) {
+      MutableCPU();
+      (*cpu_).push_back(elem);
+    }
+
+    // extend a vector by iterator.
+    // NOTE: the iterator must support end-begin
+    template <typename It>
+    void Extend(It begin, It end) {
+      MutableCPU();
+      auto out_it = std::back_inserter<std::vector<T>>(*(this->cpu_));
+      std::copy(begin, end, out_it);
+    }
+
+    // resize the vector
+    void resize(size_t size) {
+      MutableCPU();
+      (*cpu_).resize(size);
+    }
+
+    // get cuda ptr. immutable
+    const T *CUDAData(phi::Place place) const {
+      PADDLE_ENFORCE_EQ(
+          place.GetType() == phi::AllocationType::GPU ||
+              place.GetType() == phi::AllocationType::CUSTOM,
+          true,
+          common::errors::Unavailable(
+              "Place mismatch, CUDA Data must be on CUDA place."));
+      ImmutableCUDA(place);
+      return reinterpret_cast<T *>(gpu_->ptr());
+    }
+
+    // get cuda ptr. mutable
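+    // (The mutable variant below marks the buffer dirty-on-CUDA, so the next
+    // CPU-side access goes through MutableCPU()/ImmutableCPU(), which copy
+    // the data back first.)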
+    T *CUDAMutableData(phi::Place place) {
+      const T *ptr = CUDAData(place);
+      flag_ = kDirty | kDataInCUDA;
+      return const_cast<T *>(ptr);
+    }
+
+    // clear
+    void clear() {
+      (*cpu_).clear();
+      flag_ = kDirty | kDataInCPU;
+    }
+
+    std::vector<T> *get_vector() { return cpu_; }
+
+    size_t capacity() const { return (*cpu_).capacity(); }
+
+    // reserve data
+    void reserve(size_t size) const { (*cpu_).reserve(size); }
+
+    std::mutex &Mutex() const { return mtx_; }
+
+    paddle::optional<phi::Place> CUDAPlace() const {
+      return OptionalCUDAPlace(gpu_);
+    }
+
+    paddle::optional<phi::Place> CustomPlace() const {
+      return OptionalCustomPlace(gpu_);
+    }
+
+    void MutableCPU() {
+      if (IsInCUDA() && IsDirty()) {
+        CopyToCPU();
+      }
+      flag_ = kDirty | kDataInCPU;
+    }
+
+   private:
+    enum DataFlag {
+      kDataInCPU = 0x01,
+      kDataInCUDA = 0x02,
+      // kDirty means the data has been changed in one device.
+      kDirty = 0x10
+    };
+
+    void CopyToCPU() const;
+
+    void ImmutableCUDA(phi::Place place) const {
+      if (IsDirty()) {
+        if (IsInCPU()) {
+          CopyCPUDataToCUDA(place);
+          UnsetFlag(kDirty);
+          SetFlag(kDataInCUDA);
+        } else if (IsInCUDA() && !(place == gpu_->place())) {
+          PADDLE_THROW(
+              common::errors::Unavailable("Unexpected data place mismatch."));
+          // Still dirty
+        } else {
+          // Dirty && DataInCUDA && Device is same
+          // Do nothing
+        }
+      } else {
+        if (!IsInCUDA()) {
+          // Even data is not dirty. However, data is not in CUDA. Copy data.
+          CopyCPUDataToCUDA(place);
+          SetFlag(kDataInCUDA);
+        } else if (!(place == gpu_->place())) {
+          PADDLE_THROW(
+              common::errors::Unavailable("Unexpected data place mismatch."));
+        } else {
+          // Not Dirty && DataInCUDA && Device is same
+          // Do nothing.
+        }
+      }
+    }
+
+    void CopyCPUDataToCUDA(const phi::Place &place) const;
+
+    void ImmutableCPU() const {
+      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
+                                      // CPU has no data.
+        CopyToCPU();
+        UnsetFlag(kDirty);
+      }
+      SetFlag(kDataInCPU);
+    }
+
+    void UnsetFlag(int flag) const { flag_ &= ~flag; }
+    void SetFlag(int flag) const { flag_ |= flag; }
+
+    bool IsDirty() const { return flag_ & kDirty; }
+
+    bool IsInCUDA() const { return flag_ & kDataInCUDA; }
+
+    bool IsInCPU() const { return flag_ & kDataInCPU; }
+
+    std::vector<T> *cpu_;
+    mutable phi::Allocator::AllocationPtr gpu_;
+    mutable size_t gpu_memory_size_{0};
+    mutable int flag_;
+
+    mutable std::mutex mtx_;
+  };
+
+ public:
+  // implicit cast from std::vector.
+  template <typename U>
+  MixVector(const std::vector<U> *dat) {  // NOLINT
+    m_.reset(new VectorData(const_cast<std::vector<U> *>(dat)));
+  }
+
+  // Copy ctor
+  MixVector(const MixVector<T> &other) = delete;
+
+  // Copy operator
+  MixVector<T> &operator=(const MixVector<T> &other) = delete;
+
+  // Move ctor
+  MixVector(MixVector<T> &&other) = delete;
+
+  // CPU data access method. Mutable.
+  T &operator[](size_t i) { return (*m_)[i]; }
+
+  // CPU data access method. Immutable.
+  const T &operator[](size_t i) const { return (*m_)[i]; }
+
+  // std::vector iterator methods. Based on CPU data access method
+  size_t size() const { return m_->size(); }
+
+  iterator begin() { return m_->begin(); }
+
+  iterator end() { return m_->end(); }
+
+  T &front() { return m_->front(); }
+
+  T &back() { return m_->back(); }
+
+  const_iterator begin() const { return m_->begin(); }
+
+  const_iterator end() const { return m_->end(); }
+
+  const_iterator cbegin() const { return begin(); }
+
+  const_iterator cend() const { return end(); }
+
+  const T &back() const { return m_->back(); }
+
+  T *data() { return m_->data(); }
+
+  const T *data() const { return m_->data(); }
+
+  const T &front() const { return m_->front(); }
+  // end of std::vector iterator methods
+
+  // assign this from iterator.
+  // NOTE: the iterator must support `end-begin`
+  template <typename Iter>
+  void assign(Iter begin, Iter end) {
+    m_->assign(begin, end);
+  }
+
+  // push_back. If the previous capacity is not enough, the memory will
+  // double.
+  void push_back(T elem) { m_->push_back(elem); }
+
+  // extend a vector by iterator.
+  // NOTE: the iterator must support end-begin
+  template <typename It>
+  void Extend(It begin, It end) {
+    m_->Extend(begin, end);
+  }
+
+  // resize the vector
+  void resize(size_t size) {
+    if (m_->size() != size) {
+      m_->resize(size);
+    }
+  }
+
+  // get cuda ptr. immutable
+  const T *CUDAData(phi::Place place) const {
+    {
+      phi::GPUPlace p(place.GetDeviceId());
+      auto &mtx = m_->Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_->CUDAPlace();
+      if (cuda_place == paddle::none || cuda_place == p) {
+        return m_->CUDAData(place);
+      }
+    }
+    m_->MutableCPU();
+    m_.reset(new VectorData(m_->get_vector()));
+    return CUDAData(place);
+  }
+
+  // get cuda ptr. mutable
+  T *CUDAMutableData(phi::Place place) {
+    {
+      phi::GPUPlace p(place.GetDeviceId());
+      auto &mtx = m_->Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_->CUDAPlace();
+      if (cuda_place == paddle::none || cuda_place == p) {
+        return m_->CUDAMutableData(place);
+      }
+    }
+    m_->MutableCPU();
+    m_.reset(new VectorData(m_->get_vector()));
+    return CUDAMutableData(place);
+  }
+
+  // clear
+  void clear() { m_->clear(); }
+
+  size_t capacity() const { return m_->capacity(); }
+
+  // reserve data
+  void reserve(size_t size) { m_->reserve(size); }
+
+  // the unify method to access CPU or CUDA data. immutable.
+  const T *Data(phi::Place place) const {
+    if (place.GetType() == phi::AllocationType::GPU) {
+      return CUDAData(place);
+    } else {
+      return data();
+    }
+  }
+
+  // the unify method to access CPU or CUDA data. mutable.
+  T *MutableData(phi::Place place) {
+    if (place.GetType() == phi::AllocationType::GPU) {
+      return CUDAMutableData(place);
+    } else {
+      return data();
+    }
+  }
+
+  void CopyToCPU() { m_->MutableCPU(); }
+
+  const void *Handle() const { return m_.get(); }
+
+ private:
+  mutable std::unique_ptr<VectorData> m_;
+};
+
+};  // namespace phi
diff --git a/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py b/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py
new file mode 100644
index 00000000000..910ef5cd1a6
--- /dev/null
+++ b/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import ( + OpTest, + convert_float_to_uint16, + paddle_static_guard, + is_custom_device, + get_device_place, +) + +import paddle +from paddle import base +from paddle.base import Program, core, program_guard + + +def accuracy_wrapper(infer, indices, label): + return paddle._C_ops.accuracy(infer, indices, label) + + +class TestAccuracyOp(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.python_api = accuracy_wrapper + self.dtype = np.float32 + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype("int64") + label = np.random.randint(0, 2, (n, 1)).astype("int64") + self.inputs = {"Out": infer, "Indices": indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + "Accuracy": np.array(num_correct / float(n)).astype(self.dtype), + "Correct": np.array(num_correct).astype("int32"), + "Total": np.array(n).astype("int32"), + } + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output(check_pir=True) + + +class TestAccuracyOpFp16(TestAccuracyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3, check_pir=True) + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()) + or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestAccuracyOpBf16(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.python_api = accuracy_wrapper + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(np.float32) + indices = np.random.randint(0, 2, (n, 1)).astype("int64") + label = np.random.randint(0, 2, (n, 1)).astype("int64") + self.inputs = { + "Out": convert_float_to_uint16(infer), + "Indices": indices, + "Label": label, + } + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + "Accuracy": convert_float_to_uint16( + np.array(num_correct / float(n)).astype(np.float32) + ), + "Correct": np.array(num_correct).astype("int32"), + "Total": np.array(n).astype("int32"), + } + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + if core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-2, check_pir=True) + + +class TestAccuracyOpError(unittest.TestCase): + def test_type_errors(self): + with ( + paddle_static_guard(), + program_guard(Program(), Program()), + ): + # The input type of accuracy_op must be Variable. + x1 = base.create_lod_tensor(np.array([[-1]]), [[1]], base.CPUPlace()) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int32") + self.assertRaises(TypeError, paddle.static.accuracy, x1, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x1, label) + # The input dtype of accuracy_op must be float32 or float64. 
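+            # (so the int32 tensor below is expected to be rejected with
+            # TypeError, while the float32 tensor that follows is accepted)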
+ x2 = paddle.static.data(name="x2", shape=[-1, 4], dtype="int32") + self.assertRaises(TypeError, paddle.static.accuracy, x2, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x2, label) + + x3 = paddle.static.data(name="input", shape=[-1, 2], dtype="float32") + paddle.static.accuracy(input=x3, label=label) + paddle.metric.accuracy(input=x3, label=label) + + def test_value_errors(self): + with ( + program_guard(Program(), Program()), + # The input rank of accuracy_op must be 2. + self.assertRaises(ValueError), + ): + x3 = paddle.to_tensor([0.1], dtype="float32") + label3 = paddle.to_tensor(np.reshape([0], [1, 1]), dtype="int32") + paddle.metric.accuracy(x3, label3) + + +class TestAccuracyAPI1(unittest.TestCase): + def run_api(self, accuracy_api): + with ( + paddle_static_guard(), + paddle.static.program_guard(paddle.static.Program()), + ): + self.predictions = paddle.static.data( + shape=[2, 5], name="predictions", dtype="float32" + ) + self.label = paddle.static.data(shape=[2, 1], name="labels", dtype="int64") + self.result = accuracy_api(input=self.predictions, label=self.label, k=1) + self.input_predictions = np.array( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + self.input_labels = np.array([[2], [0]], dtype="int64") + self.expect_value = np.array([0.5], dtype="float32") + exe = paddle.static.Executor() + (result,) = exe.run( + feed={ + "predictions": self.input_predictions, + "labels": self.input_labels, + }, + fetch_list=[self.result], + ) + self.assertEqual((result == self.expect_value).all(), True) + + def test_api(self): + self.run_api(accuracy_api=paddle.static.accuracy) + self.run_api(accuracy_api=paddle.metric.accuracy) + + +class TestAccuracyAPI2(unittest.TestCase): + def test_api(self): + with base.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.static.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype="float32") + self.assertEqual((result.numpy() == expect_value).all(), True) + + +class TestAccuracyAPI(unittest.TestCase): + def test_api(self): + with base.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.metric.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype="float32") + + self.assertEqual((result.numpy() == expect_value).all(), True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unittest/test_gather_op_metax.py b/backends/metax_gpu/tests/unittest/test_gather_op_metax.py index bdf116571f7..3ce39588838 100644 --- a/backends/metax_gpu/tests/unittest/test_gather_op_metax.py +++ b/backends/metax_gpu/tests/unittest/test_gather_op_metax.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function
 import unittest
-from op_test import OpTest
 import numpy as np
-import paddle
+from op_test import (
+    OpTest,
+    convert_float_to_uint16,
+    get_devices,
+    is_custom_device,
+    get_device_place,
+)
+from utils import dygraph_guard
 
-paddle.enable_static()
+import paddle
+from paddle import base
+from paddle.base.dygraph.base import switch_to_static_graph
+from paddle.framework import core
 
 
 def gather_numpy(x, index, axis):
@@ -32,29 +40,119 @@ class TestGatherOp(OpTest):
     def setUp(self):
         self.op_type = "gather"
-        self.place = paddle.CustomPlace("metax_gpu", 0)
-        self.__class__.use_custom_device = True
         self.python_api = paddle.gather
+        self.public_python_api = paddle.gather
         self.config()
-        xnp = np.random.random(self.x_shape).astype(self.x_type)
-        self.inputs = {"X": xnp, "Index": np.array(self.index).astype(self.index_type)}
-        self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]}
+        self.prim_op_type = "prim"
+        self.init_inputs_and_outputs()
+        self.if_enable_cinn()
 
     def test_check_output(self):
-        self.check_output_with_place(self.place)
+        self.check_output(check_pir=True, check_symbol_infer=False)
 
     def test_check_grad(self):
-        self.check_grad_with_place(self.place, ["X"], "Out")
+        self.check_grad(["X"], "Out", check_pir=True, check_prim_pir=True)
 
     def config(self):
         """
         For multi-dimension input
         """
         self.x_shape = (10, 20)
-        self.x_type = "float32"
+        self.config_dtype()
         self.index = [1, 3, 5]
         self.index_type = "int32"
 
+    def config_dtype(self):
+        self.x_type = "float64"
+
+    def init_inputs_and_outputs(self):
+        xnp = np.random.random(self.x_shape).astype(self.x_type)
+        if self.x_type == "complex64" or self.x_type == "complex128":
+            xnp = (
+                np.random.randint(-10, 10, size=(10, 10))
+                + 1j * np.random.randint(-10, 10, size=(10, 10))
+            ).astype(self.x_type)
+        self.inputs = {
+            "X": xnp,
+            "Index": np.array(self.index).astype(self.index_type),
+        }
+        self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]}
+
+    def if_enable_cinn(self):
+        pass
+
+
+class TestGatherOp_ZeroDim(TestGatherOp):
+    def config(self):
+        """
+        For multi-dimension input
+        """
+        self.x_shape = 100
+        self.config_dtype()
+        self.index = 2
+        self.index_type = "int32"
+
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
+
+class TestGatherOpFP16(TestGatherOp):
+    def config_dtype(self):
+        self.x_type = "float16"
+
+
+# @unittest.skipIf(
+#     not (core.is_compiled_with_cuda() or is_custom_device())
+#     # or core.cudnn_version() < 8100
+#     # or paddle.device.cuda.get_device_capability()[0] < 8,
+#     # "only support compiled with CUDA and cudnn version need larger than 8.1.0 and device's compute capability is at least 8.0",
+# )
+class TestGatherOpBFP16(TestGatherOp):
+    def config_dtype(self):
+        self.x_type = "float32"
+        self.dtype = np.uint16
+
+    def init_inputs_and_outputs(self):
+        xnp = np.random.random(self.x_shape).astype(self.x_type)
+        self.inputs = {
+            "X": convert_float_to_uint16(xnp),
+            "Index": np.array(self.index).astype(self.index_type),
+        }
+        self.outputs = {"Out": convert_float_to_uint16(xnp[self.inputs["Index"]])}
+
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
+    def test_check_output(self):
+        self.check_output_with_place(
+            place=get_device_place(), check_pir=True, check_symbol_infer=False
+        )
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
+            get_device_place(),
+            ["X"],
+            "Out",
+            check_pir=True,
+            check_prim_pir=True,
+        )
+
+
+class TestGatherOpComplex64(TestGatherOp):
+    def config_dtype(self):
+        self.x_type = 
"complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOpComplex128(TestGatherOp): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + class TestCase1(TestGatherOp): def config(self): @@ -62,10 +160,42 @@ def config(self): For one dimension input """ self.x_shape = 100 - self.x_type = "float32" + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int32" + def config_dtype(self): + self.x_type = "float64" + + +class TestCase1FP16(TestCase1): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase1BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = 100 + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase1Complex64(TestCase1): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase1Complex128(TestCase1): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + class TestCase2(TestGatherOp): def config(self): @@ -73,42 +203,574 @@ def config(self): For int64_t index type """ self.x_shape = 100 - self.x_type = "float32" + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase2FP16(TestCase2): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase2BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = 100 + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + +class TestCase2Complex64(TestCase2): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase2Complex128(TestCase2): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase3(TestGatherOp): + def config(self): + """ + For other input type + """ + self.x_shape = (10, 20) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase3Fp16(TestCase3): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase3BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int64" +class TestCase3Complex64(TestCase3): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase3Complex128(TestCase3): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase4(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase4FP16(TestCase4): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase4BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase4Complex64(TestCase4): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase4Complex128(TestCase4): + def config_dtype(self): + self.x_type = 
"complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase5(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1, 3] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase5BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase5FP16(TestCase5): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase5Complex64(TestCase5): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase5Complex128(TestCase5): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase6(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": True} + self.config_dtype() + self.index = [1, 3] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase6FP16(TestCase6): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase6BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": True} + self.config_dtype() + self.index = [1, 3] + self.index_type = "int32" + + +class TestGatherBF16Op(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": index_np, + "Axis": axis_np, + } + out = gather_numpy(self.inputs["X"], index_np, axis_np[0]) + self.outputs = {"Out": out} + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def test_check_grad(self): + self.check_grad(["X"], "Out", numeric_grad_delta=0.5, check_pir=True) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + +class TestGatherNegativeAxis(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": index_np, + "Axis": axis_np, + } + out = gather_numpy(self.inputs["X"], index_np, axis_np[0]) + self.outputs = {"Out": out} + + def test_check_output(self): + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device_place()) + for place in places: + self.check_output_with_place(place) + + def test_check_grad(self): + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device_place()) + for place in places: + self.check_grad_with_place(place, ["X"], "Out", numeric_grad_delta=0.5) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (100, 3) + self.index = [0, 1, -2] + self.index_type = "int32" + self.axis = [-1] + self.axis_type = "int32" + + 
+class TestOutOfRangeError(unittest.TestCase): + def test_dygraph_forward_and_backward(self): + with dygraph_guard(): + x = paddle.randn([100, 3]).cpu() + x.stop_gradient = False + y = paddle.gather( + x, + paddle.to_tensor([0, -2]).cpu(), + axis=-1, + ) + grad_x = paddle.grad(y, x) + + def test_dygraph_error(self): + with dygraph_guard(): + # out of lower bound + with self.assertRaises(IndexError): + _ = paddle.gather( + paddle.randn([100, 3]).cpu(), + paddle.to_tensor([0, -4]).cpu(), + axis=1, + ) + # out of upper bound + with self.assertRaises(IndexError): + _ = paddle.gather( + paddle.randn([100, 3]).cpu(), + paddle.to_tensor([0, 3]).cpu(), + axis=1, + ) + + +class TestCase6Complex64(TestCase6): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase6Complex128(TestCase6): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp1(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + axis_np = np.array(self.axis).astype(self.index_type) + index_np = np.array(self.index).astype(self.index_type) + out = gather_numpy(xnp, index_np, axis_np[0]) + self.inputs = {"X": xnp, "Index": index_np, "Axis": axis_np} + self.outputs = {"Out": out} + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def test_check_grad(self): + self.check_grad(["X"], "Out", check_pir=True) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp1FP16(TestGatherOp1): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp1Complex64(TestGatherOp1): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp1Complex128(TestGatherOp1): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp2(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp2FP16(TestGatherOp2): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp2Complex64(TestGatherOp2): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp2Complex128(TestGatherOp2): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp3(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [2] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp3FP16(TestGatherOp3): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp3Complex64(TestGatherOp3): + def config_dtype(self): + self.x_type = "complex64" + + def 
test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp3Complex128(TestGatherOp3): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp4(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 100, 10) + self.config_dtype() + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + self.attrs = {"overwrite": False} + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp4FP16(TestGatherOp4): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp4Complex64(TestGatherOp4): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp4Complex128(TestGatherOp4): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp5(TestGatherOp): + def config(self): + """ + Test for negative axis + """ + self.x_shape = (3, 100, 10) + self.config_dtype() + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [-1] + self.axis_type = "int32" + self.attrs = {"overwrite": False} + + def config_dtype(self): + self.x_type = "float64" + + def test_check_grad(self): + self.check_grad( + ["X"], + "Out", + check_pir=True, + check_prim_pir=True, + ) + + +class API_TestGather(unittest.TestCase): + def test_out1(self): + with base.program_guard(base.Program(), base.Program()): + data1 = paddle.static.data("data1", shape=[-1, 2], dtype="float64") + index = paddle.static.data("index", shape=[-1, 1], dtype="int64") + out = paddle.gather(data1, index) + place = base.CPUPlace() + exe = base.Executor(place) + input = np.array([[1, 2], [3, 4], [5, 6]]).astype("float64") + index_1 = np.array([1, 2]).astype("int64") + (result,) = exe.run( + feed={"data1": input, "index": index_1}, fetch_list=[out] + ) + expected_output = np.array([[3, 4], [5, 6]]) + np.testing.assert_allclose(result, expected_output, rtol=1e-05) + + def test_out2(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.static.data("x", shape=[-1, 2], dtype="float64") + index = paddle.static.data("index", shape=[-1, 1], dtype="int32") + axis = paddle.static.data("axis", shape=[1], dtype="int32") + out = paddle.gather(x, index, axis) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype("float64") + index_np = np.array([1, 1]).astype("int32") + axis_np = np.array([1]).astype("int32") + (result,) = exe.run( + feed={"x": x_np, "index": index_np, "axis": axis_np}, + fetch_list=[out], + ) + expected_output = gather_numpy(x_np, index_np, axis_np[0]) + np.testing.assert_allclose(result, expected_output, rtol=1e-05) + + class API_TestDygraphGather(unittest.TestCase): def test_out1(self): - paddle.set_device("metax_gpu") paddle.disable_static() - input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("int32") + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) index_1 = np.array([1, 2]) input = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) output = paddle.gather(input, index) output_np = output.numpy() - expected_output = np.array([[3, 4], [5, 6]]).astype("int32") - np.testing.assert_allclose(output_np, expected_output) + expected_output = np.array([[3, 4], [5, 6]]) + np.testing.assert_allclose(output_np, 
expected_output, rtol=1e-05) paddle.enable_static() def test_out12(self): - paddle.set_device("metax_gpu") paddle.disable_static() - input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("int32") + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) index_1 = np.array([1, 2]) x = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) output = paddle.gather(x, index, axis=0) output_np = output.numpy() expected_output = gather_numpy(input_1, index_1, axis=0) - np.testing.assert_allclose(output_np, expected_output) + np.testing.assert_allclose(output_np, expected_output, rtol=1e-05) paddle.enable_static() def test_zero_index(self): - paddle.set_device("metax_gpu") paddle.disable_static() - x = paddle.to_tensor([[1, 2], [3, 4]]).astype("int32") + x = paddle.to_tensor([[1, 2], [3, 4]]) index = paddle.to_tensor(np.array([]).astype("int64")) for axis in range(len(x.shape)): out = paddle.gather(x, index, axis) @@ -117,122 +779,197 @@ def test_zero_index(self): self.assertEqual(list(out.shape), expected_shape) paddle.enable_static() + def test_large_data(self): + if not paddle.is_compiled_with_cuda(): + return -class TestGathertError(unittest.TestCase): - def setUp(self) -> None: - self.place = paddle.CustomPlace("metax_gpu", 0) - paddle.set_device("metax_gpu:0") + x = np.random.rand(226862, 256).astype("float32") + index = np.random.randint(-226862, 22682, size=(8859027)) - def test_error1(self): - paddle.enable_static() - if not paddle.framework.use_pir_api(): + def test_dygraph(): + with base.dygraph.guard(): + gpu_out = paddle.gather(paddle.to_tensor(x), paddle.to_tensor(index)) + return gpu_out.numpy() + + @switch_to_static_graph + def test_static_graph(): with paddle.static.program_guard( paddle.static.Program(), paddle.static.Program() ): - - input_shape = [8, 9, 6] - index_shape = [4] - x_int8 = paddle.static.data( - shape=input_shape, dtype="int8", name="x_int8" - ) - x_float32 = paddle.static.data( - shape=input_shape, dtype="float32", name="x_float32" - ) - axis = paddle.static.data(shape=[1], dtype="float32", name="axis") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - index_float = paddle.static.data( - shape=index_shape, dtype="float32", name="index_float" + x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape) + index_t = paddle.static.data( + name="index", dtype=index.dtype, shape=index.shape ) + out_t = paddle.gather(x_t, index_t) + feed = {x_t.name: x, index_t.name: index} + fetch = [out_t] - def test_x_type(): - paddle.gather(x_int8, index) + gpu_exe = paddle.static.Executor(get_device_place()) + gpu_value = gpu_exe.run(feed=feed, fetch_list=fetch)[0] + return gpu_value - self.assertRaises(TypeError, test_x_type) + np.testing.assert_array_equal(test_dygraph(), test_static_graph()) - def test_index_type(): - paddle.gather(x_float32, index_float) - self.assertRaises(TypeError, test_index_type) +class TestGathertError(unittest.TestCase): + def test_error1(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int8", name="x") + axis = paddle.static.data(shape=[1], dtype="float32", name="axis") + index = paddle.static.data(shape=shape, dtype="int32", name="index") + index_float = paddle.static.data( + shape=shape, dtype="float32", name="index_float" + ) + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises((TypeError, ValueError), test_x_type) + + def test_index_type(): + paddle.gather(x, index_float) + + 
self.assertRaises((TypeError, ValueError), test_index_type) + + def test_axis_dtype(): + paddle.gather(x, index, axis=1.11) - def test_axis_dtype(): - paddle.gather(x_float32, index, axis=1.11) + self.assertRaises((TypeError, ValueError), test_axis_dtype) - self.assertRaises(TypeError, test_axis_dtype) + def test_axis_dtype1(): + paddle.gather(x, index, axis=axis) - def test_axis_dtype1(): - paddle.gather(x_float32, index, axis=axis) + self.assertRaises((TypeError, ValueError), test_axis_dtype1) - self.assertRaises(TypeError, test_axis_dtype1) - else: - paddle.set_device("metax_gpu") - input_shape = [8, 9, 6] - index_shape = [4] + def test_error2(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int8", name="x") + index = paddle.static.data(shape=shape, dtype="int32", name="mask") + index_float = paddle.static.data( + shape=shape, dtype="float32", name="index_float" + ) + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises((TypeError, ValueError), test_x_type) def test_index_type(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="float32", name="index_float" - ) - out = paddle.gather(x, index) - exe = paddle.static.Executor(place=self.place) - exe.run(paddle.static.default_startup_program()) - self.assertRaises( - ValueError, - exe.run, - paddle.static.default_main_program(), - feed={ - "x": np.random.random(input_shape).astype("float32"), - "index_float": np.random.random(index_shape).astype( - "float32" - ), - }, - ) - - def test_axis_scalar_dtype(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - axis = paddle.static.data(shape=[1], dtype="int32", name="axis") - self.assertRaises(TypeError, paddle.gather, x, index, axis=1.11) - - def test_axis_tensor_dtype(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - axis = paddle.static.data(shape=[1], dtype="float32", name="axis") - y = paddle.gather(x, index, axis=axis) - exe = paddle.static.Executor(place=self.place) - exe.run(paddle.static.default_startup_program()) - self.assertRaises( - ValueError, - exe.run, - paddle.static.default_main_program(), - feed={ - "x": np.random.random(input_shape).astype("float32"), - "index": np.random.randint(0, 8, index_shape).astype( - "int32" - ), - "axis": np.array([1.11]).astype("float32"), - }, - ) - - test_index_type() - test_axis_scalar_dtype() - # test_axis_tensor_dtype() + paddle.gather(x, index_float) + + self.assertRaises((TypeError, ValueError), test_index_type) + + def test_error3(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int32", name="x") + index = paddle.static.data(shape=shape, dtype="int32", name="index") + + def test_axis_minsize(): + paddle.gather(x, index, axis=-1) + + self.assertRaises(ValueError, test_axis_minsize) + + def test_axis_maxsize(): + paddle.gather(x, index, 
axis=512) + + self.assertRaises(ValueError, test_axis_maxsize) + + +class TestCheckOutType(unittest.TestCase): + def test_out_type(self): + data = paddle.static.data(shape=[16, 10], dtype="int64", name="x") + index = paddle.static.data(shape=[4], dtype="int64", name="index") + out = paddle.gather(data, index) + self.assertTrue(out.dtype == paddle.int64 or out.dtype == core.DataType.INT64) + + def test_pir_out_type(self): + with paddle.pir_utils.IrGuard(): + data = paddle.static.data(shape=[16, 10], dtype="int64", name="x") + index = paddle.static.data(shape=[4], dtype="int64", name="index") + out = paddle.gather(data, index) + self.assertTrue(out.dtype == core.DataType.INT64) + + +class TestGatherBackward(unittest.TestCase): + def setUp(self): + self.shape = [10, 20] + self.dtype = "float32" + self.index = (1, 3, 5) + self.index_dtype = "int64" + self.places = get_devices() + + def test_gather_backward(self): + if len(self.places) != 2: + return + res_list = [] + x_np = np.random.random(self.shape).astype(self.dtype) + index_np = np.array(self.index, dtype=self.index_dtype) + grad_out_np = np.random.random(self.shape).astype(self.dtype) + for place in self.places: + with base.dygraph.guard(place): + x = paddle.to_tensor(x_np, dtype=self.dtype) + x.stop_gradient = False + index = paddle.to_tensor(index_np, dtype=self.index_dtype) + out = paddle.gather(x, index, -1) + grad_out = paddle.to_tensor(grad_out_np, dtype=self.dtype) + (re,) = paddle.grad( + outputs=out, + inputs=x, + grad_outputs=grad_out, + ) + res_list.append(re.numpy()) + np.testing.assert_allclose(res_list[0], res_list[1]) + + +class TestGatherOp_ZeroSize(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.public_python_api = paddle.gather + self.config() + self.init_inputs_and_outputs() + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad(["X"], "Out", check_pir=True) + + def config(self): + self.x_shape = (3, 0, 4) + self.config_dtype() + self.index = [2] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + def init_inputs_and_outputs(self): + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + "X": xnp, + "Index": np.array(self.index).astype(self.index_type), + } + self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]} + + +class TestGatherOp_ZeroSize2(TestGatherOp_ZeroSize): + def config(self): + self.x_shape = (10, 20) + self.config_dtype() + self.index = [2, 0] + self.index_type = "int32" if __name__ == "__main__": + paddle.enable_static() unittest.main() From 893829371efacbff859d0eb83c7ea827f5bb0124 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:29:10 +0800 Subject: [PATCH 05/58] [Metax] update metax_gpu CMakeLists.txt (#10) * [Metax] fix dgc & mklml compile product path problem * [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test * [Metax] add mixed_vector fix & update change patch * [Metax] update metax_gpu CMakeLists.txt --- backends/metax_gpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 4567723123c..b22d7077e3b 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -26,11 +26,11 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") set(WITH_MKLML ON) 
+include(paddle) set(THIRD_PARTY_PATH "${PADDLE_SOURCE_DIR}/build/third_party" CACHE PATH "Third party libraries directory.") -include(paddle) include(version) include(generic) include(cblas) From f54187fb3e47ed8062537b9d339c48c7fd711326 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:51:43 +0800 Subject: [PATCH 06/58] [metax] updata_qr_kernel (#11) * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .../metax_kernel/qr_kernel_register.cu | 207 +++++++++--------- 1 file changed, 98 insertions(+), 109 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 7b133371f4d..745069e2eda 100644 --- a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,9 +22,8 @@ #include #include -#include "kernels/impl/values_vectors_functor.h" +#include "kernels/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/complex.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" @@ -39,7 +38,6 @@ #include "paddle/phi/kernels/slice_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/tril_triu_kernel.h" - namespace phi { template @@ -358,47 +356,47 @@ void QrKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP #define FUNC_WITH_TYPES(m) m(float, s) m(double, d) -#define GEQRF_BATCH_INSTANCE(T, C) \ - template <> \ - void BatchedGeqrf(const GPUContext& dev_ctx, \ - int batch_size, \ - int m, \ - int n, \ - T* a, \ - int lda, \ - T* tau, \ - int a_stride, \ - int tau_stride) { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - for (int i = 0; i < batch_size; ++i) { \ - T* a_working_ptr = &a[i * a_stride]; \ - T* tau_working_ptr = &tau[i * tau_stride]; \ - PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##geqrf( \ - handle, m, n, a_working_ptr, lda, tau_working_ptr)); \ - } \ +#define GEQRF_BATCH_INSTANCE(T, C) \ + template <> \ + void BatchedGeqrf(const GPUContext& dev_ctx, \ + int batch_size, \ + int m, \ + int n, \ + T* a, \ + int lda, \ + T* tau, \ + int a_stride, \ + int tau_stride) { \ + auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); \ + for (int i = 0; i < batch_size; ++i) { \ + T* a_working_ptr = &a[i * a_stride]; \ + T* tau_working_ptr = &tau[i * tau_stride]; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##geqrf( \ + handle, m, n, a_working_ptr, lda, tau_working_ptr)); \ + } \ } FUNC_WITH_TYPES(GEQRF_BATCH_INSTANCE); -#define ORGQR_BATCH_INSTANCE(T, C) \ - template <> \ - void BatchedOrgqr(const GPUContext& dev_ctx, \ - int batch_size, \ - int m, \ - int n, \ - int k, \ - T* a, \ - int lda, \ - T* tau, \ - int a_stride, \ - int tau_stride) { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - 
for (int i = 0; i < batch_size; ++i) { \ - T* a_working_ptr = &a[i * a_stride]; \ - T* tau_working_ptr = &tau[i * tau_stride]; \ - PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##orgqr( \ - handle, m, n, k, a_working_ptr, lda, tau_working_ptr)); \ - } \ +#define ORGQR_BATCH_INSTANCE(T, C) \ + template <> \ + void BatchedOrgqr(const GPUContext& dev_ctx, \ + int batch_size, \ + int m, \ + int n, \ + int k, \ + T* a, \ + int lda, \ + T* tau, \ + int a_stride, \ + int tau_stride) { \ + auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); \ + for (int i = 0; i < batch_size; ++i) { \ + T* a_working_ptr = &a[i * a_stride]; \ + T* tau_working_ptr = &tau[i * tau_stride]; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##orgqr( \ + handle, m, n, k, a_working_ptr, lda, tau_working_ptr)); \ + } \ } FUNC_WITH_TYPES(ORGQR_BATCH_INSTANCE); @@ -421,7 +419,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, const int64_t a_stride_64 = static_cast(a_stride); const int64_t tau_stride_64 = static_cast(tau_stride); - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); size_t workspace_in_bytes_on_device = 0; @@ -499,7 +496,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } else { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); @@ -555,7 +551,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cusolverDnDgeqrf_bufferSize(handle, m, n, a, lda, &lwork)); @@ -599,35 +594,33 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } template <> -void BatchedGeqrf>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + phi::complex64* a, + int lda, + phi::complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgeqrf_bufferSize( handle, m, n, reinterpret_cast(a), lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex64* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex64* a_working_ptr = &a[i * a_stride]; + phi::complex64* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgeqrf( handle, @@ -657,35 +650,33 @@ void BatchedGeqrf>( } template <> -void BatchedGeqrf>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + 
int n, + phi::complex128* a, + int lda, + phi::complex128* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgeqrf_bufferSize( handle, m, n, reinterpret_cast(a), lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex128* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex128* a_working_ptr = &a[i * a_stride]; + phi::complex128* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgeqrf( handle, @@ -727,7 +718,6 @@ void BatchedOrgqr(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); @@ -784,7 +774,6 @@ void BatchedOrgqr(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); @@ -829,20 +818,18 @@ void BatchedOrgqr(const GPUContext& dev_ctx, } template <> -void BatchedOrgqr>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + phi::complex64* a, + int lda, + phi::complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCungqr_bufferSize( handle, @@ -856,16 +843,16 @@ void BatchedOrgqr>( DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex64* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex64* a_working_ptr = &a[i * a_stride]; + phi::complex64* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCungqr( handle, @@ -896,20 +883,18 @@ void BatchedOrgqr>( } template <> -void BatchedOrgqr>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + phi::complex128* a, + int lda, + phi::complex128* tau, + int 
a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZungqr_bufferSize( handle, @@ -923,16 +908,16 @@ void BatchedOrgqr>( DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex128* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex128* a_working_ptr = &a[i * a_stride]; + phi::complex128* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZungqr( handle, @@ -965,11 +950,15 @@ void BatchedOrgqr>( } // namespace phi +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(qr, GPU, ALL_LAYOUT, phi::QrKernel, float, double) {} +#else PD_REGISTER_PLUGIN_KERNEL(qr, metax_gpu, ALL_LAYOUT, phi::QrKernel, float, double, - phi::dtype::complex, - phi::dtype::complex) {} + phi::complex64, + phi::complex128) {} +#endif From 1e042162a9f7cbb4c08b260bae373122fee1e827 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 15 Sep 2025 10:30:01 +0800 Subject: [PATCH 07/58] [Metax] fix illegal address access error in test_momentum_op (#12) * [Metax] fix illegal address access error in test_momentum_op --- backends/metax_gpu/patch/tmp/mixed_vector.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.h b/backends/metax_gpu/patch/tmp/mixed_vector.h index e7cf1e626c9..1dcca9c71b4 100644 --- a/backends/metax_gpu/patch/tmp/mixed_vector.h +++ b/backends/metax_gpu/patch/tmp/mixed_vector.h @@ -386,7 +386,8 @@ class MixVector { // the unify method to access CPU or CUDA data. immutable. const T *Data(phi::Place place) const { - if (place.GetType() == phi::AllocationType::GPU) { + if (place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM) { return CUDAData(place); } else { return data(); @@ -395,7 +396,8 @@ class MixVector { // the unify method to access CPU or CUDA data. mutable. 
T *MutableData(phi::Place place) { - if (place.GetType() == phi::AllocationType::GPU) { + if (place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM) { return CUDAMutableData(place); } else { return data(); From aca80a41f6f619d995f5944c584c3141fab3ce9e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 15 Sep 2025 11:41:10 +0800 Subject: [PATCH 08/58] [Metax] fix cufft and fix some blas kernel apply (#13) * [Metax] fix cufft and fix some blas kernel apply --- backends/metax_gpu/CMakeLists.txt | 13 ++---- backends/metax_gpu/patch/paddle.patch | 59 +++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index b22d7077e3b..6048b59e6c1 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -618,6 +618,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -683,15 +684,9 @@ file( ${CMAKE_SOURCE_DIR}/kernels/flash_attn_kernel.cu ${CMAKE_SOURCE_DIR}/kernels/flashattn.cc) -list( - REMOVE_ITEM - CUDA_SRCS - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/gru_compute.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/multihead_matmul_functor.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/softmax.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/weight_only_gemv.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/context_project.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/fft.cu) +list(REMOVE_ITEM CUDA_SRCS + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/softmax.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/weight_only_gemv.cu) file( GLOB diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 1935217baa0..8127caee61e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -133,6 +133,26 @@ index c0080f0a5e..458ca3e2e8 100644 } // namespace dynload } // namespace phi +diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h +index 1547909d92..66b2779392 100644 +--- a/paddle/phi/backends/dynload/cufft.h ++++ b/paddle/phi/backends/dynload/cufft.h +@@ -1,3 +1,4 @@ ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. + /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); +@@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); + cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ + }); \ + EnforceCUFFTLoaded(#__name); \ +- static void* p_##__name = dlsym(cufft_dso_handle, #__name); \ ++ std::string replaced_name = #__name; \ ++ replaced_name = replaced_name.replace(0,2,"mc"); \ ++ static void* p_##__name = dlsym(cufft_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index 59e92955c9..d2f8c2da15 100644 --- a/paddle/phi/backends/dynload/cupti.h @@ -437,6 +457,32 @@ index cb35feee32..64f5bd24ac 100644 #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" +diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu +index 88663ec880..98b93072a3 100644 +--- a/paddle/phi/kernels/funcs/gru_compute.cu ++++ b/paddle/phi/kernels/funcs/gru_compute.cu +@@ -12,7 +12,7 @@ limitations under the License. */ + #include "paddle/phi/kernels/funcs/gru_compute.h" + + #include "paddle/phi/backends/gpu/gpu_context.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" + #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" + +diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h +index 15e1a4a3c3..e4780538d7 100644 +--- a/paddle/phi/kernels/funcs/math/context_project.h ++++ b/paddle/phi/kernels/funcs/math/context_project.h +@@ -18,7 +18,7 @@ + #include + + #include "paddle/phi/core/tensor_utils.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/im2col.h" + + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -469,6 +515,19 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +index 8b0baf5f5f..260482f124 100644 +--- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu ++++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +@@ -27,7 +27,7 @@ namespace cub = hipcub; + + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" + +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/math_cuda_utils.h" + + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h From fb547db298546f2c3249e22886c2232ba4882987 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 15 Sep 2025 16:04:35 +0800 Subject: [PATCH 09/58] [metax] add warpctc_warprnn (#14) * [metax] fix bug --- backends/metax_gpu/CMakeLists.txt | 2 + backends/metax_gpu/change_patch.sh | 1 + backends/metax_gpu/cmake/warpctc.cmake | 149 ++++++ backends/metax_gpu/cmake/warprnnt.cmake | 142 ++++++ .../warpctc_grad_kernel_register.cu | 2 +- 
.../cuda_kernels/warpctc_kernel_register.cu | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 3 +- .../kernels/impl/warprnnt_kernel_impl.h | 6 +- backends/metax_gpu/patch/intrinsics.cuh | 459 ++++++++++++++++++ backends/metax_gpu/patch/paddle.patch | 26 + 10 files changed, 787 insertions(+), 5 deletions(-) create mode 100644 backends/metax_gpu/cmake/warpctc.cmake create mode 100644 backends/metax_gpu/cmake/warprnnt.cmake create mode 100644 backends/metax_gpu/patch/intrinsics.cuh diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 6048b59e6c1..cca23ab42f5 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -37,6 +37,8 @@ include(cblas) include(flashattn) include(cutlass) include(dgc) +include(warpctc) +include(warprnnt) set(PLUGIN_VERSION ${PADDLE_VERSION}) diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 833ae00f6bd..60d74ec0f3d 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -25,3 +25,4 @@ cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - +cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake new file mode 100644 index 00000000000..71c892a6cfa --- /dev/null +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -0,0 +1,149 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +endif() + +set(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed set(WARPCTC_REPOSITORY +# https://gitee.com/tianjianhe/warp-ctc.git) +set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warpctc) +set(WARPCTC_PATCH_COMMAND "") +set(WARPCTC_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && git apply + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +endif() + +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.patch native_src) + set(WARPCTC_PATCH_COMMAND git checkout -- . 
&& git checkout ${WARPCTC_TAG} + && patch -Nd ${SOURCE_DIR} < ${native_src} &&) + set(WARPCTC_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +if(WITH_ROCM) + set(WARPCTC_PATHCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.rocm.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/devicetypes.cuh.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/hip.cmake.patch) +endif() + +set(WARPCTC_INCLUDE_DIR + "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPCTC_LIB_DIR + "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) + +if(WIN32) + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else() + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS_DEBUG + $) +else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() + +ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPCTC_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_COMMAND} + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}) + +message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) +include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its + # headers. 
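+
+# Consumer sketch (illustrative; `my_kernels` is a hypothetical target, not
+# part of this build): `warpctc` below is an INTERFACE library carrying no
+# sources, so depending on it only orders a target after extern_warpctc has
+# built; the shared library itself is linked by its path.
+#
+#   add_dependencies(my_kernels warpctc)
+#   target_link_libraries(my_kernels ${WARPCTC_LIBRARIES})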
+ +add_library(warpctc INTERFACE) +add_dependencies(warpctc extern_warpctc) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake new file mode 100644 index 00000000000..54a7ad6be86 --- /dev/null +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -0,0 +1,142 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPRNNT_WITH_HIP) +endif() + +set(WARPRNNT_PREFIX_DIR ${THIRD_PARTY_PATH}/warprnnt) +set(WARPRNNT_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warprnnt) +set(WARPRNNT_TAG 7ea6bfe748779c245a0fcaa5dd9383826273eff2) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warprnnt) +set(WARPRNNT_PATCH_COMMAND "") +set(WARPRNNT_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + ${CMAKE_COMMAND} -E copy_if_different + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch + "/") +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch) +endif() +if(WITH_ROCM) + set(WARPRNNT_PATCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.rocm.patch) +endif() +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.patch native_src) + set(WARPRNNT_PATCH_COMMAND + git checkout -- . 
&& git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < ${native_src}) + set(WARPRNNT_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +set(WARPRNNT_INCLUDE_DIR + "${WARPRNNT_INSTALL_DIR}/include" + CACHE PATH "Warp-rnnt Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPRNNT_LIB_DIR + "${WARPRNNT_INSTALL_DIR}/lib" + CACHE PATH "Warp-rnnt Library Directory" FORCE) + +if(WIN32) + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +else() + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPRNNT_C_FLAGS $) + set(WARPRNNT_C_FLAGS_DEBUG + $) + set(WARPRNNT_C_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS $) + set(WARPRNNT_CXX_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS_DEBUG + $) +else() + set(WARPRNNT_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() +ExternalProject_Add( + extern_warprnnt + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPRNNT_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPRNNT_PATCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPRNNT_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPRNNT_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPRNNT_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPRNNT_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPRNNT_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPRNNT_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPRNNT_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPRNNT_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPRNNT_LIBRARIES}) + +message(STATUS "warp-rnnt library: ${WARPRNNT_LIBRARIES}") +get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) +include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its + # headers. 
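+
+# Note (stated as an assumption about generator behavior): BUILD_BYPRODUCTS
+# above declares ${WARPRNNT_LIBRARIES} as an output of extern_warprnnt, so
+# generators that track file-level dependencies (e.g. Ninja) know the library
+# only exists after the external build step has run.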
+ +add_library(warprnnt INTERFACE) +# set_property(TARGET warprnnt PROPERTY IMPORTED_LOCATION ${WARPRNNT_LIBRARIES}) +add_dependencies(warprnnt extern_warprnnt) diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu index e77a29d12fe..d02f805a671 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu @@ -17,7 +17,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_grad_kernel.h" -PD_REGISTER_PLUGIN_KERNEL(warpctc_grad, +PD_CUSTOM_KERNEL_REGISTER(warpctc_grad, metax_gpu, ALL_LAYOUT, phi::WarpctcGradKernel, diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu index 5b343506cad..c488e23fba9 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_kernel.h" -PD_REGISTER_PLUGIN_KERNEL( +PD_CUSTOM_KERNEL_REGISTER( warpctc, metax_gpu, ALL_LAYOUT, phi::WarpctcKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index eb64f21c90f..9794ba1b3c0 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -204,7 +204,8 @@ class WarpCTCFunctor { void init(const Context& dev_ctx, const size_t blank) { warpctc_version_ = phi::dynload::get_warpctc_version(); - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 96e756b16b1..bb4311f5912 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -138,7 +138,8 @@ class WarpRNNTFunctor { // There is no memory allocated operations within warp-rnnt. rnntStatus_t status = RNNT_STATUS_UNKNOWN_ERROR; bool gpu = false; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu = true; #else @@ -207,7 +208,8 @@ class WarpRNNTFunctor { options_.fastemit_lambda = fastemit_lambda; options_.batch_first = true; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = RNNT_GPU; options_.stream = diff --git a/backends/metax_gpu/patch/intrinsics.cuh b/backends/metax_gpu/patch/intrinsics.cuh new file mode 100644 index 00000000000..71365b6577c --- /dev/null +++ b/backends/metax_gpu/patch/intrinsics.cuh @@ -0,0 +1,459 @@ +/****************************************************************************** + * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * + * Code and text by Sean Baxter, NVIDIA Research + * See http://nvlabs.github.io/moderngpu for repository and documentation. 
+ *
+ ******************************************************************************/
+
+#include "devicetypes.cuh"
+
+#pragma once
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+
+namespace mgpu {
+
+MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) {
+  return *reinterpret_cast<uint2*>(&x);
+}
+MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) {
+  return *reinterpret_cast<uint64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) {
+  return *reinterpret_cast<int64*>(&x);
+}
+
+MGPU_HOST_DEVICE int2 double_as_int2(double x) {
+  return *reinterpret_cast<int2*>(&x);
+}
+MGPU_HOST_DEVICE double int2_as_double(int2 x) {
+  return *reinterpret_cast<double*>(&x);
+}
+
+MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) {
+  reinterpret_cast<int*>(&d)[0] = x;
+}
+MGPU_HOST_DEVICE int GetDoubleX(double d) {
+  return double_as_int2(d).x;
+}
+MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) {
+  reinterpret_cast<int*>(&d)[1] = y;
+}
+MGPU_HOST_DEVICE int GetDoubleY(double d) {
+  return double_as_int2(d).y;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// PTX for bfe and bfi
+
+#if __CUDA_ARCH__ >= 200
+
+MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) {
+  uint result;
+  asm("bfe.u32 %0, %1, %2, %3;" :
+      "=r"(result) : "r"(x), "r"(bit), "r"(numBits));
+  return result;
+}
+
+
+MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) {
+  uint result;
+  asm("bfi.b32 %0, %1, %2, %3, %4;" :
+      "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits));
+  return result;
+}
+
+MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) {
+  uint ret;
+  asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
+  return ret;
+}
+
+#endif // __CUDA_ARCH__ >= 200
+
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_up
+
+__device__ __forceinline__ float shfl_up(float var,
+  unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  var = __shfl_up_sync(0xFFFFFFFF, var, delta, width);
+#else
+  var = __shfl_up(var, delta, width);
+#endif
+#endif
+  return var;
+}
+
+__device__ __forceinline__ double shfl_up(double var,
+  unsigned int delta, int width = 32) {
+
+#if __CUDA_ARCH__ >= 300
+  int2 p = mgpu::double_as_int2(var);
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+  p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width);
+  p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width);
+#else
+  p.x = __shfl_up(p.x, delta, width);
+  p.y = __shfl_up(p.y, delta, width);
+#endif
+  var = mgpu::int2_as_double(p);
+#endif
+
+  return var;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// shfl_add
+
+// MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) {
+//   int result = 0;
+// #if __CUDA_ARCH__ >= 300
+//   int mask = (WARP_SIZE - width)<< 8;
+// #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+//   asm(
+//       "{.reg .s32 r0;"
+//       ".reg .pred p;"
+//       "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;"
+//       "@p add.s32 r0, r0, %4;"
+//       "mov.s32 %0, r0; }"
+//       : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+// #else
+//   asm(
+//       "{.reg .s32 r0;"
+//       ".reg .pred p;"
+//       "shfl.up.b32 r0|p, %1, %2, %3;"
+//       "@p add.s32 r0, r0, %4;"
+//       "mov.s32 %0, r0; }"
+//       : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
+// #endif
+// #endif
+//   return result;
+// }
+
+MGPU_DEVICE int
shfl_add(int x, int offset, int width = 32) +{ +#if __CUDA_ARCH__ >= 300 + unsigned fullMask = 0xffffffffU; + unsigned mask = (width == 32) ? fullMask : ((1U << width) - 1U); + int src = 0; +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9 + src = __shfl_up_sync(mask, x, offset, width); // CUDA 9+ +#else + src = __shfl_up(x, offset, width); // CUDA 8- +#endif + int lane = threadIdx.x & 31; + return (lane >= offset) ? (src + x) : x; +#else + return x; +#endif +} + +MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + int result = 0; +#if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.b32 r0|p, %1, %2, %3;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#endif +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// brev, popc, clz, bfe, bfi, prmt + +// Reverse the bits in an integer. +MGPU_HOST_DEVICE uint brev(uint x) { +#if __CUDA_ARCH__ >= 200 + uint y = __brev(x); +#else + uint y = 0; + for(int i = 0; i < 32; ++i) + y |= (1 & (x>> i))<< (31 - i); +#endif + return y; +} + +// Count number of bits in a register. +MGPU_HOST_DEVICE int popc(uint x) { +#if __CUDA_ARCH__ >= 200 + return __popc(x); +#else + int c; + for(c = 0; x; ++c) + x &= x - 1; + return c; +#endif +} + +// Count leading zeros - start from most significant bit. +MGPU_HOST_DEVICE int clz(int x) { +#if __CUDA_ARCH__ >= 200 + return __clz(x); +#else + for(int i = 31; i >= 0; --i) + if((1<< i) & x) return 31 - i; + return 32; +#endif +} + +// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. +MGPU_HOST_DEVICE int ffs(int x) { +#if __CUDA_ARCH__ >= 200 + return __ffs(x); +#else + for(int i = 0; i < 32; ++i) + if((1<< i) & x) return i + 1; + return 0; +#endif +} + +MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { +#if __CUDA_ARCH__ >= 200 + return bfe_ptx(x, bit, numBits); +#else + return ((1<< numBits) - 1) & (x>> bit); +#endif +} + +MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = bfi_ptx(x, y, bit, numBits); +#else + if(bit + numBits > 32) numBits = 32 - bit; + uint mask = ((1<< numBits) - 1)<< bit; + result = y & ~mask; + result |= mask & (x<< bit); +#endif + return result; +} + +MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = prmt_ptx(a, b, index); +#else + result = 0; + for(int i = 0; i < 4; ++i) { + uint sel = 0xf & (index>> (4 * i)); + uint x = ((7 & sel) > 3) ? b : a; + x = 0xff & (x>> (8 * (3 & sel))); + if(8 & sel) x = (128 & x) ? 0xff : 0; + result |= x<< (8 * i); + } +#endif + return result; +} + +// Find log2(x) and optionally round up to the next integer logarithm. +MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { + int a = 31 - clz(x); + if(roundUp) a += !MGPU_IS_POW_2(x); + return a; +} + +//////////////////////////////////////////////////////////////////////////////// +// vset4 + +#if __CUDA_ARCH__ >= 300 + +// Performs four byte-wise comparisons and returns 1 for each byte that +// satisfies the conditional, and zero otherwise. 
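+// Illustrative example (not in the original moderngpu source): with c = 0,
+// vset4_lt_add(0x01020304, 0x02020404, 0) returns 0x01000100 -- only the most
+// significant byte (0x01 < 0x02) and the second-lowest byte (0x03 < 0x04)
+// pass the less-than test, and each passing byte contributes 1 in its lane.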
+MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) {
+  uint result;
+  asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(c));
+  return result;
+}
+MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) {
+  uint result;
+  asm("vset4.u32.u32.eq %0, %1, %2, %3;" :
+      "=r"(result) : "r"(a), "r"(b), "r"(0));
+  return result;
+}
+#endif // __CUDA_ARCH__ >= 300
+
+MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_lt_add_ptx(a, b, c);
+#else
+  result = c;
+  if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001;
+  if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) {
+  uint result;
+#if __CUDA_ARCH__ >= 300
+  result = vset4_eq_ptx(a, b);
+#else
+  result = 0;
+  if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001;
+  if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100;
+  if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000;
+  if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000;
+#endif
+  return result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+
+MGPU_HOST_DEVICE uint umulhi(uint x, uint y) {
+#if __CUDA_ARCH__ >= 100
+  return __umulhi(x, y);
+#else
+  uint64 product = (uint64)x * y;
+  return (uint)(product>> 32);
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// ldg() function defined for all devices and all types. Only compiles to __ldg
+// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported
+// by __ldg in sm_32_intrinsics.h
+
+template<typename T>
+struct IsLdgType {
+  enum { value = false };
+};
+#define DEFINE_LDG_TYPE(T) \
+  template<> struct IsLdgType<T> { enum { value = true }; };
+
+template<typename T, bool UseLDG = IsLdgType<T>::value>
+struct LdgShim {
+  MGPU_DEVICE static T Ldg(const T* p) {
+    return *p;
+  }
+};
+
+#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400
+
+  // List of __ldg-compatible types from sm_32_intrinsics.h.
+  DEFINE_LDG_TYPE(char)
+  DEFINE_LDG_TYPE(short)
+  DEFINE_LDG_TYPE(int)
+  DEFINE_LDG_TYPE(long long)
+  DEFINE_LDG_TYPE(char2)
+  DEFINE_LDG_TYPE(char4)
+  DEFINE_LDG_TYPE(short2)
+  DEFINE_LDG_TYPE(short4)
+  DEFINE_LDG_TYPE(int2)
+  DEFINE_LDG_TYPE(int4)
+  DEFINE_LDG_TYPE(longlong2)
+
+  DEFINE_LDG_TYPE(unsigned char)
+  DEFINE_LDG_TYPE(unsigned short)
+  DEFINE_LDG_TYPE(unsigned int)
+  DEFINE_LDG_TYPE(unsigned long long)
+  DEFINE_LDG_TYPE(uchar2)
+  DEFINE_LDG_TYPE(uchar4)
+  DEFINE_LDG_TYPE(ushort2)
+  DEFINE_LDG_TYPE(ushort4)
+  DEFINE_LDG_TYPE(uint2)
+  DEFINE_LDG_TYPE(uint4)
+  DEFINE_LDG_TYPE(ulonglong2)
+
+  DEFINE_LDG_TYPE(float)
+  DEFINE_LDG_TYPE(double)
+  DEFINE_LDG_TYPE(float2)
+  DEFINE_LDG_TYPE(float4)
+  DEFINE_LDG_TYPE(double2)
+
+  template<typename T> struct LdgShim<T, true> {
+    MGPU_DEVICE static T Ldg(const T* p) {
+      return __ldg(p);
+    }
+  };
+#endif
+
+template<typename T>
+MGPU_DEVICE T ldg(const T* p) {
+  return LdgShim<T>::Ldg(p);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Fast division for 31-bit integers.
+// Uses the method in Hacker's Delight (2nd edition) page 228.
+// Evaluates for denom > 1 and x < 2^31.
+struct FastDivide { + uint denom; + uint coef; + uint shift; + + MGPU_HOST_DEVICE uint Divide(uint x) { + return umulhi(x, coef)>> shift; + } + MGPU_HOST_DEVICE uint Modulus(uint x) { + return x - Divide(x) * denom; + } + + explicit FastDivide(uint denom_) { + denom = denom_; + uint p = 31 + FindLog2(denom, true); + coef = (uint)(((1ull<< p) + denom - 1) / denom); + shift = p - 32; + } +}; + +#pragma GCC diagnostic pop + +} // namespace mgpu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 8127caee61e..0283a443adb 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1087,6 +1087,32 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h +index 7b85903776..3f4b298807 100644 +--- a/paddle/phi/kernels/impl/merged_momentum_impl.h ++++ b/paddle/phi/kernels/impl/merged_momentum_impl.h +@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( + params_out[idx], + velocities_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + phi::funcs::ForRange for_range( + static_cast(dev_ctx), params[idx]->numel()); + const auto grad_type = grads[idx]->dtype(); +diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h +index de5bcfc30b..eb2a9714f5 100644 +--- a/paddle/phi/kernels/impl/momentum_kernel_impl.h ++++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h +@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, + regularization_coeff, + param_out, + velocity_out); +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + funcs::ForRange for_range(dev_ctx, param.numel()); + const auto grad_type = grad.dtype(); + #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h From 8e981985c3b9f2e6bfc3789d92b48fed42abace1 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:40:04 +0800 Subject: [PATCH 10/58] [Metax] update metax CI (#15) * [Metax] update metax CI --- backends/metax_gpu/tests/CMakeLists.txt | 100 ++++- .../check_diff_metax_legacy_unit_test.sh | 108 +++++ .../tests/unit_test/test_abs_metax.py | 39 ++ .../tests/unit_test/test_arange_metax.py | 260 ++++++++++++ .../test_bfloat16_embedding_metax.py | 72 ++++ .../unit_test/test_count_nonzero_api_metax.py | 81 ++++ .../unit_test/test_gaussian_nll_loss_metax.py | 208 +++++++++ .../tests/unit_test/test_greater_equal.py | 44 ++ ...bate_build_src_rank_and_local_expert_id.py | 62 +++ ...test_incubate_expand_modality_expert_id.py | 172 ++++++++ .../test_incubate_fused_rmsnorm_ext_metax.py | 95 +++++ .../unit_test/test_incubate_moe_combine.py | 193 +++++++++ ...moe_gate_dispatch_partial_nosoftmaxtopk.py | 218 ++++++++++ ...st_incubate_moe_gate_dispatch_w_permute.py 
| 207 +++++++++ ...ncubate_moe_gate_dispatch_w_permute_bwd.py | 175 ++++++++ .../tests/unit_test/test_layer_norm.py | 358 ++++++++++++++++ .../tests/unit_test/test_matmul_op__metax.py | 395 ++++++++++++++++++ .../tests/unit_test/test_nonzero_api_metax.py | 220 ++++++++++ .../tests/unit_test/test_p_norm_op_metax.py | 215 ++++++++++ .../tests/unit_test/test_squeeze_op_metax.py | 125 ++++++ .../tests/unit_test/test_swiglu_metax.py | 295 +++++++++++++ .../tests/unit_test/test_top_p_sampling.py | 162 +++++++ .../unit_test/test_unsqueeze_op_metax.py | 98 +++++ 23 files changed, 3894 insertions(+), 8 deletions(-) create mode 100644 backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh create mode 100644 backends/metax_gpu/tests/unit_test/test_abs_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_arange_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_greater_equal.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py create mode 100644 backends/metax_gpu/tests/unit_test/test_layer_norm.py create mode 100644 backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_swiglu_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_top_p_sampling.py create mode 100644 backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index d2e92f209ab..7e549ef4eaa 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -5,22 +5,106 @@ enable_testing() find_package(Python REQUIRED COMPONENTS Interpreter) -file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "unittest/*.py") +set(PADDLE_LEGACY_TEST_PATH + ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) +set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) + +file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") list( APPEND PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test/test_tril_triu_op.py -) + ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_reduce_op.py) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_cumsum_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_softmax_with_cross_entropy_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_expand_v2_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_tril_triu_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_squared_l2_norm_op_metax.py) + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py)

 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)

 foreach(test_script ${PYTHON_TEST_SCRIPTS})
diff --git a/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh
new file mode 100644
index 00000000000..86bfcb08f86
--- /dev/null
+++ b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh
@@ -0,0 +1,108 @@
+#!/bin/bash

+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+SOURCE_DIR="backends/metax_gpu/tests/unittest"
+SEARCH_DIR="Paddle/test/legacy_test"
+PREFIX_FILE="metax_prefixes.txt"
+UNMATCHED_FILE="unmatched_files.txt"
+EXIST_FILE="existing_files.txt"
+MISS_FILE="missing_files.txt"
+
+# Check that the source path exists
+if [ ! -d "$SOURCE_DIR" ]; then
+    echo "Error: source path '$SOURCE_DIR' does not exist or is not a directory"
+    exit 1
+fi
+
+# Check that the search path exists
+if [ ! -d "$SEARCH_DIR" ]; then
+    echo "Error: search path '$SEARCH_DIR' does not exist or is not a directory"
+    exit 1
+fi
+
+# Step 1: extract prefixes (using the _op/_metax rules)
+echo "Step 1: extracting file prefixes from '$SOURCE_DIR' (by the _op/_metax rules)..."
+> "$PREFIX_FILE" # clear the prefix file
+> "$UNMATCHED_FILE" # clear the unmatched-file list
+
+find "$SOURCE_DIR" -type f -name "*.py" | while read -r file; do
+    filename=$(basename "$file")
+    prefix=""
+
+    # Rule 1: the name contains _op -- take everything before _op
+    if [[ "$filename" == *"_op"* ]]; then
+        prefix="${filename%%_op*}"
+        echo "Extracted prefix (_op rule): $prefix (from $filename)"
+        echo "$prefix" >> "$PREFIX_FILE"
+
+    # Rule 2: no _op but the name contains _metax -- take everything before _metax
+    elif [[ "$filename" == *"_metax"* ]]; then
+        prefix="${filename%%_metax*}"
+        echo "Extracted prefix (_metax rule): $prefix (from $filename)"
+        echo "$prefix" >> "$PREFIX_FILE"
+
+    # Rule 3: neither marker -- classify as unmatched
+    else
+        echo "Unmatched file: $filename (contains neither _op nor _metax)"
+        echo "$filename" >> "$UNMATCHED_FILE"
+    fi
+done
+
+# Check whether any prefixes or unmatched files were produced
+prefix_count=$(wc -l < "$PREFIX_FILE")
+unmatched_count=$(wc -l < "$UNMATCHED_FILE")
+
+echo "Extraction finished - valid prefixes: $prefix_count, unmatched files: $unmatched_count"
+
+if [ $prefix_count -eq 0 ] && [ $unmatched_count -eq 0 ]; then
+    echo "Warning: no files ending in '_metax.py' were found in '$SOURCE_DIR'"
+    exit 0
+fi
+
+# Step 2: look for same-named files in the search path (top level only, no subdirectories)
+echo -e "\nStep 2: searching '$SEARCH_DIR' for same-named files (depth 1)..."
+> "$EXIST_FILE" # clear the found-file list
+> "$MISS_FILE" # clear the missing-file list
+
+# Process each prefix in turn
+while read -r prefix; do
+    # Skip empty lines
+    if [ -z "$prefix" ]; then
+        continue
+    fi
+
+    # Search only the immediate directory of the search path (depth 1)
+    found=$(find "$SEARCH_DIR" -maxdepth 1 -type f -name "${prefix}_op.py" -print -quit)
+
+    if [ -n "$found" ]; then
+        echo "$prefix -> found file: $found"
+        echo "${prefix}_op.py" >> "$EXIST_FILE"
+    else
+        echo "$prefix -> no same-named file found"
+        echo "$prefix" >> "$MISS_FILE"
+    fi
+done < "$PREFIX_FILE"
+
+# Print result statistics
+exist_count=$(wc -l < "$EXIST_FILE")
+miss_count=$(wc -l < "$MISS_FILE")
+
+echo -e "\nDone!"
+echo "Prefixes with a same-named file: $exist_count (saved to $EXIST_FILE)"
+echo "Prefixes without a same-named file: $miss_count (saved to $MISS_FILE)"
+echo "Files matching no rule: $unmatched_count (saved to $UNMATCHED_FILE)"
diff --git a/backends/metax_gpu/tests/unit_test/test_abs_metax.py b/backends/metax_gpu/tests/unit_test/test_abs_metax.py
new file mode 100644
index 00000000000..0dae6822bba
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_abs_metax.py
@@ -0,0 +1,39 @@
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.base.dygraph as dg
+
+
+class TestAbs(unittest.TestCase):
+    def setUp(self):
+        self._dtypes = ["float32"]
+        self._places = [paddle.CustomPlace("metax_gpu", 0)]
+
+    def test_all_positive(self):
+        for dtype in self._dtypes:
+            x = 1 + 10 * np.random.random([13, 3, 3]).astype(dtype)
+            for place in self._places:
+                with dg.guard(place):
+                    y = paddle.abs(paddle.to_tensor(x))
+                    np.testing.assert_allclose(np.abs(x), y.numpy(), rtol=1e-05)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_arange_metax.py b/backends/metax_gpu/tests/unit_test/test_arange_metax.py
new file mode 100644
index 00000000000..89308c33401
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_arange_metax.py
@@ -0,0 +1,260 @@
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+# # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
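+#
+# Illustrative note (not part of the original test): like numpy.arange,
+# paddle.arange(start, end, step) yields ceil((end - start) / step) elements,
+# so the TestArangeStatic case further below, paddle.arange(0, 1 + 0.005,
+# 0.005), checks for ceil(1.005 / 0.005) = 201 entries.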
+ +import unittest + +import numpy as np +from op_test import OpTest, convert_float_to_uint16 + +import paddle +from paddle.base import core +from paddle.static import Program, program_guard + + +def arange_wrapper(start, end, step, dtype="float32"): + return paddle.arange(start, end, step, dtype) + + +class TestArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": np.array([self.case[0]]).astype(self.dtype), + "End": np.array([self.case[1]]).astype(self.dtype), + "Step": np.array([self.case[2]]).astype(self.dtype), + } + + self.outputs = { + "Out": np.arange(self.case[0], self.case[1], self.case[2]).astype( + self.dtype + ) + } + + def init_config(self): + self.dtype = np.float32 + self.python_api = arange_wrapper + self.case = (0, 1, 0.2) + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + +class TestFloatArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float32 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +class TestFloat16ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float16 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestBFloat16ArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": convert_float_to_uint16(self.start), + "End": convert_float_to_uint16(self.end), + "Step": convert_float_to_uint16(self.step), + } + + self.outputs = { + "Out": convert_float_to_uint16(np.arange(self.start, self.end, self.step)) + } + + def init_config(self): + self.dtype = np.uint16 + self.python_api = arange_wrapper + self.case = (0, 5, 1) + self.start = np.array([self.case[0]]).astype(np.float32) + self.end = np.array([self.case[1]]).astype(np.float32) + self.step = np.array([self.case[2]]).astype(np.float32) + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_pir=True, check_symbol_infer=False) + + +class TestInt32ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 5, 2) + + +class TestFloat64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float64 + self.python_api = paddle.arange + self.case = (10, 1, -2) + + +class TestInt64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int64 + self.python_api = paddle.arange + self.case = (-1, -10, -2) + + +class TestZeroSizeArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 0, 1) + + +class TestArangeOpError(unittest.TestCase): + def test_static_errors(self): + with program_guard(Program(), Program()): + paddle.enable_static() + self.assertRaises(TypeError, paddle.arange, 10, dtype="int8") + + +class TestArangeAPI(unittest.TestCase): + def test_out(self): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x1 = paddle.arange(0, 5, 1, "float32") + + place = ( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + out = exe.run(fetch_list=[x1]) + + expected_data = np.arange(0, 5, 1).astype(np.float32) + self.assertEqual((out == expected_data).all(), True) + 
self.assertListEqual(list(x1.shape), [5])
+        paddle.disable_static(place)
+
+
+class TestArangeImperative(unittest.TestCase):
+    def test_out(self):
+        place = (
+            paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
+        )
+        paddle.disable_static(place)
+        x1 = paddle.arange(0, 5, 1)
+        x2 = paddle.tensor.arange(5)
+        x3 = paddle.tensor.creation.arange(5)
+
+        start = paddle.to_tensor(np.array([0], "float32"))
+        end = paddle.to_tensor(np.array([5], "float32"))
+        step = paddle.to_tensor(np.array([1], "float32"))
+        x4 = paddle.arange(start, end, step, "int64")
+
+        expected_data = np.arange(0, 5, 1).astype(np.int64)
+        for x in [x1, x2, x3, x4]:
+            np.testing.assert_array_equal(x.numpy(), expected_data)
+
+        start_float = paddle.to_tensor(np.array([0.5], "float32"))
+        end_float = paddle.to_tensor(np.array([1.5], "float32"))
+        step_float = paddle.to_tensor(np.array([0.5], "float32"))
+        # all of [start, end, step] are float
+        x5 = paddle.arange(start_float, end_float, step_float)
+        x5_expected_data = np.arange(0.5, 1.5, 0.5).astype(np.float32)
+        np.testing.assert_array_equal(x5.numpy(), x5_expected_data)
+        self.assertEqual(x5.numpy().dtype, np.float32)
+
+        # [start, end] is float, [step] is int
+        x6 = paddle.arange(start_float, end_float, 1)
+        x6_expected_data = np.arange(0.5, 1.5, 1).astype(np.float32)
+        np.testing.assert_array_equal(x6.numpy(), x6_expected_data)
+        self.assertEqual(x6.numpy().dtype, np.float32)
+
+        # [start] is float, [end] is int
+        x7 = paddle.arange(start_float, 1)
+        x7_expected_data = np.arange(0.5, 1).astype(np.float32)
+        np.testing.assert_array_equal(x7.numpy(), x7_expected_data)
+        self.assertEqual(x7.numpy().dtype, np.float32)
+
+        # [start] is float
+        x8 = paddle.arange(start_float)
+        x8_expected_data = np.arange(0.5).astype(np.float32)
+        np.testing.assert_array_equal(x8.numpy(), x8_expected_data)
+        self.assertEqual(x8.numpy().dtype, np.float32)
+
+        # [start] is int
+        x9 = paddle.arange(1)
+        x9_expected_data = np.arange(1).astype(np.int64)
+        np.testing.assert_array_equal(x9.numpy(), x9_expected_data)
+        self.assertEqual(x9.numpy().dtype, np.int64)
+
+        # [start] is float
+        x10 = paddle.arange(1.0)
+        x10_expected_data = np.arange(1).astype(np.float32)
+        np.testing.assert_array_equal(x10.numpy(), x10_expected_data)
+        self.assertEqual(x10.numpy().dtype, np.float32)
+
+        # [start] is np.int
+        x11 = paddle.arange(np.int64(10))
+        x11_expected_data = np.arange(10).astype(np.int64)
+        np.testing.assert_array_equal(x11.numpy(), x11_expected_data)
+        self.assertEqual(x11.numpy().dtype, np.int64)
+
+        # [start] is a big integer
+        x12 = paddle.arange(
+            start=0,
+            end=-9007199254740994,
+            step=-9007199254740993,
+        )
+
+        # numpy gives the wrong result here, so 'x12_expected_data' is generated manually
+        # x12_expected_data = np.arange(start=0, stop=-9007199254740994, step=-9007199254740993, dtype=np.int64)
+        x12_expected_data = np.array([0, -9007199254740993])
+
+        np.testing.assert_array_equal(x12.numpy(), x12_expected_data)
+        self.assertEqual(x12.numpy().dtype, np.int64)
+
+        # [start > end, step > 0]
+        x14 = paddle.arange(start=10, end=0, step=1)
+
+        x14_expected_data = np.array([])
+        np.testing.assert_array_equal(x14.numpy(), x14_expected_data)
+
+        paddle.enable_static()
+
+
+class TestArangeStatic(unittest.TestCase):
+    def test_infermeta(self):
+        paddle.enable_static()
+        x = paddle.arange(0, 1 + 0.005, 0.005)
+        self.assertEqual(x.shape, [201])
+        paddle.disable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git 
a/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py new file mode 100644 index 00000000000..f575d4eece0 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py @@ -0,0 +1,72 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F + + +class BF16EmbeddingTest(unittest.TestCase): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 1024 + self.hidden_size = 512 + self.seed = 10 + + def run_main(self, dtype): + ids, weight, dout = self.gen_random() + origin_dtype = weight.dtype + weight_cast = weight.astype(dtype) + out = F.embedding(ids, weight_cast) + dout = dout.astype(out.dtype) + dweight = paddle.autograd.grad(out, weight, dout) + return ( + out.astype(origin_dtype).numpy(), + dweight[0].astype(origin_dtype).numpy(), + ) + + def gen_random(self): + np.random.seed(self.seed) + weight = np.random.random([self.vocab_size, self.hidden_size]).astype("float32") + ids = np.random.randint(low=0, high=self.vocab_size, size=[self.batch_size]) + dout = np.random.random([self.batch_size, self.hidden_size]).astype("float32") + + weight = paddle.to_tensor(weight) + weight.stop_gradient = False + ids = paddle.to_tensor(ids) + dout = paddle.to_tensor(dout) + return ids, weight, dout + + def test_main(self): + + ret1 = self.run_main("float32") + ret2 = self.run_main("bfloat16") + self.assertEqual(len(ret1), len(ret2)) + for i, (r1, r2) in enumerate(zip(ret1, ret2)): + np.testing.assert_allclose(r1, r2, atol=1e-3, rtol=1e-2) + + +class BF16EmbeddingTestOddHiddenSize(BF16EmbeddingTest): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 511 + self.hidden_size = 512 + self.seed = 20 + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py new file mode 100644 index 00000000000..57a5d0b1c97 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
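+#
+# Illustrative note (not part of the original test): paddle.count_nonzero
+# mirrors np.count_nonzero -- it is equivalent to summing (x != 0) over the
+# given axis (all axes when axis is None), keeping reduced axes of size 1
+# when keepdim=True.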
+ +import unittest + +import numpy as np + +import paddle + +np.random.seed(10) + + +class TestCountNonzeroAPI(unittest.TestCase): + # test paddle.tensor.math.count_nonzero + + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data("X", self.x_shape) + out1 = paddle.count_nonzero(x) + out2 = paddle.tensor.count_nonzero(x) + out3 = paddle.tensor.math.count_nonzero(x) + axis = np.arange(len(self.x_shape)).tolist() + out4 = paddle.count_nonzero(x, axis) + out5 = paddle.count_nonzero(x, tuple(axis)) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={"X": self.x}, fetch_list=[out1, out2, out3, out4, out5]) + out_ref = np.count_nonzero(self.x) + for out in res: + np.testing.assert_allclose(out, out_ref, rtol=1e-05) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def test_case(x, axis=None, keepdim=False): + x_tensor = paddle.to_tensor(x) + out = paddle.count_nonzero(x_tensor, axis=axis, keepdim=keepdim) + if isinstance(axis, list): + axis = tuple(axis) + if len(axis) == 0: + axis = None + + out_ref = np.count_nonzero(x, axis, keepdims=keepdim) + np.testing.assert_allclose(out.numpy(), out_ref, rtol=1e-05) + + test_case(self.x) + test_case(self.x, None) + test_case(self.x, -1) + test_case(self.x, keepdim=True) + test_case(self.x, 2, keepdim=True) + test_case(self.x, [0, 2]) + test_case(self.x, (0, 2)) + test_case(self.x, (0, 1, 3)) + test_case(self.x, [0, 1, 2, 3]) + paddle.enable_static() + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data("X", [10, 12], "int32") + self.assertRaises(ValueError, paddle.count_nonzero, x, axis=10) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py new file mode 100644 index 00000000000..73e389324f9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py @@ -0,0 +1,208 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
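+#
+# Illustrative note (not part of the original test): the reference
+# implementation below computes the per-element Gaussian negative
+# log-likelihood
+#     loss = 0.5 * (log(max(var, eps)) + (input - label)**2 / max(var, eps))
+# plus the constant term 0.5 * log(2 * pi) when full=True, followed by the
+# requested 'none'/'sum'/'mean' reduction.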
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.base import core + +np.random.seed(10) + + +def ref_gaussian_nll_loss( + input, label, variance, full=False, eps=1e-6, reduction="none" +): + if variance.shape != input.shape: + if input.shape[:-1] == variance.shape: + variance = np.expand_dims(variance, -1) + elif input.shape[:-1] == variance.shape[:-1] and variance.shape[-1] == 1: + pass + else: + raise ValueError("variance is of incorrect size") + if reduction != "none" and reduction != "mean" and reduction != "sum": + raise ValueError(reduction + " is not valid") + + if np.any(variance < 0): + raise ValueError("var has negative entry/entries") + + variance = variance.copy() + variance = np.clip(variance, a_min=eps, a_max=None) + + loss = 0.5 * (np.log(variance) + (input - label) ** 2 / variance) + if full: + loss += 0.5 * np.log(2 * np.pi) + + if reduction == "none": + return loss + elif reduction == "sum": + return [np.sum(loss)] + elif reduction == "mean": + return [np.mean(loss)] + + +class TestGaussianNLLLossAPI(unittest.TestCase): + # test paddle.nn.functional.gaussian_nll_loss, paddle.nn.gaussian_nll_loss + + def setUp(self, type=None): + self.shape = [10, 2] + if type in ["float16", "float64", "int32", "int64"]: + dtype = np.dtype(type) + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + elif type == "broadcast1": + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + elif type == "broadcast2": + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2, 1] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + else: + dtype = np.dtype("float32") + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + if type == "test_err": + self.variance_np = -np.ones(self.shape).astype(np.float32) + + self.place = ( + paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace() + ) + + def test_dynamic_case(self, type=None, full=False, reduction="none"): + self.setUp(type) + paddle.disable_static(self.place) + + input_x = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) + variance = paddle.to_tensor(self.variance_np) + if type in ["test_err", "int32", "int64"]: + self.assertRaises( + ValueError, + paddle.nn.functional.gaussian_nll_loss, + input=input_x, + label=label, + variance=variance, + ) + else: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction) + out2 = gaussian_nll_loss(input_x, label, variance) + + for r in [out1, out2]: + np.allclose(out_ref, r.numpy(), rtol=1e-5, atol=1e-5) + paddle.enable_static() + + def test_static_case(self, type=None, full=False, reduction="none"): + self.setUp(type) + paddle.enable_static() + with 
paddle.static.program_guard(paddle.static.Program()): + if type in ["int32", "int64", "float64"]: + input_x = paddle.static.data("Input_x", self.shape, type) + label = paddle.static.data("Label", self.shape, type) + variance = paddle.static.data("Variance", self.shape, type) + elif type in ["broadcast1", "broadcast2"]: + input_x = paddle.static.data("Input_x", self.shape) + label = paddle.static.data("Label", self.shape) + variance = paddle.static.data("Variance", self.broadcast_shape) + else: + input_x = paddle.static.data("Input_x", self.shape, "float32") + label = paddle.static.data("Label", self.shape, "float32") + variance = paddle.static.data("Variance", self.shape, "float32") + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction) + out2 = gaussian_nll_loss(input_x, label, variance) + exe = paddle.static.Executor(self.place) + if type not in ["test_err", "int32", "int64"]: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + res = exe.run( + feed={ + "Input_x": self.input_np, + "Label": self.label_np, + "Variance": self.variance_np, + }, + fetch_list=[out1, out2], + ) + for r in res: + np.allclose(out_ref, r, rtol=1e-5, atol=1e-5) + else: + try: + res = exe.run( + feed={ + "Input_x": self.input_np, + "Label": self.label_np, + "Variance": self.variance_np, + }, + fetch_list=[out1, out2], + ) + except ValueError: + pass + + def test_api(self): + self.test_dynamic_case() + self.test_static_case() + + def test_float64(self): + self.test_dynamic_case("float64") + self.test_static_case("float64") + + def test_broadcast(self): + self.test_dynamic_case("broadcast1") + self.test_static_case("broadcast1") + + def test_broadcast_with_same_dim(self): + self.test_dynamic_case("broadcast2") + self.test_static_case("broadcast2") + + def test_reduction(self): + self.test_dynamic_case(full=True, reduction="mean") + self.test_dynamic_case(full=True, reduction="sum") + self.test_static_case(full=True, reduction="mean") + + def test_error(self): + self.test_dynamic_case("test_err") + self.test_static_case("test_err") + + def test_int(self): + self.test_dynamic_case("int64") + self.test_dynamic_case("int32") + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal.py new file mode 100644 index 00000000000..816d6075099 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_greater_equal.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
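+#
+# Illustrative note (not part of the original test): paddle.greater_equal is
+# an elementwise x >= y that returns a bool tensor, so comparing [3, 3] with
+# [3, 2] below yields [True, True].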
+ + +import unittest + +import numpy as np + +import paddle +from paddle import static + + +class Test_Greater_Equal_Op_Fp16(unittest.TestCase): + def test_api_fp16(self): + paddle.enable_static() + with static.program_guard(static.Program(), static.Program()): + label = paddle.to_tensor([3, 3], dtype="float16") + limit = paddle.to_tensor([3, 2], dtype="float16") + out = paddle.greater_equal(x=label, y=limit) + # if core.is_compiled_with_cuda(): + # place = paddle.CUDAPlace(0) + # exe = static.Executor(place) + # (res,) = exe.run(fetch_list=[out]) + # self.assertEqual((res == np.array([True, True])).all(), True) + place = paddle.CustomPlace(paddle.device.get_device().split(":")[0], 0) + exe = static.Executor(place) + (res,) = exe.run(fetch_list=[out]) + self.assertEqual((res == np.array([True, True])).all(), True) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py new file mode 100644 index 00000000000..b4e4282c5ce --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py @@ -0,0 +1,62 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional import build_src_rank_and_local_expert_id + +logger = logging.getLogger(__name__) + + +class TestFusedCalculateAuxLoss(unittest.TestCase): + def test_build_src_rank_and_local_expert_id(self): + def orig_func(expert_num_global_list, num_local_experts): + send_rank_cpu = np.concatenate( # TOO SLOW!!! 
breaks everything
+                [
+                    np.full([j], i // num_local_experts, dtype="int32")
+                    for i, j in enumerate(expert_num_global_list)
+                ],
+                0,
+            )
+            local_expert_id_cpu = np.concatenate(
+                [
+                    np.full([j], i % num_local_experts, dtype="int32")
+                    for i, j in enumerate(expert_num_global_list)
+                ],
+                0,
+            )
+            send_rank = paddle.to_tensor(send_rank_cpu)
+            local_expert_id = paddle.to_tensor(local_expert_id_cpu)
+            return send_rank, local_expert_id
+
+        def fused_func(expert_num_global_tensor, expert_num_global, num_local_experts):
+            return build_src_rank_and_local_expert_id(
+                expert_num_global_tensor, expert_num_global, num_local_experts
+            )
+
+        expert_num_global = np.random.randint(0, 512, size=[12 * 8], dtype="int32")
+        expert_num_global_tensor = paddle.to_tensor(expert_num_global, dtype="int64")
+
+        s1, l1 = orig_func(expert_num_global, 12)
+        s2, l2 = fused_func(expert_num_global_tensor, expert_num_global, 12)
+        assert ((s1 - s2) == 0).all(), (s1, s2)
+        assert ((l1 - l2) == 0).all(), (l1, l2)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py
new file mode 100644
index 00000000000..2d5670ee739
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from collections import namedtuple
+from functools import partial
+
+from ernie_utils.moe_all_gather_layer import MOEAllGatherLayerV2
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import expand_modality_expert_id
+
+
+def fused_gate_logits_process_ref(self, gate_logits_lm, gate_logits_mm, token_type_ids):
+    """Process gate logits."""
+    top_k = self.k
+    num_expert_per_rank_per_modality = (
+        gate_logits_lm.shape[-1] // self.config.moe_world_size
+    )
+
+    @paddle.no_grad()
+    def shift_ids(ids, modality_offset):
+        # For now, assume every modality has the same number of experts.
+        rank = ids // num_expert_per_rank_per_modality
+        expert_id_in_rank = ids % num_expert_per_rank_per_modality
+        return (
+            rank * (num_expert_per_rank_per_modality * 2)
+            + expert_id_in_rank
+            + modality_offset * num_expert_per_rank_per_modality
+        )
+
+    if self.group_experts:
+        gate_logits_lm = gate_logits_lm.reshape([gate_logits_lm.shape[0], top_k, -1])
+        prob_lm = self.gate.act(gate_logits_lm)
+        weight_lm, expert_id_lm = prob_lm.topk(k=1, axis=-1)
+        weight_lm = weight_lm.reshape([gate_logits_lm.shape[0], -1])
+        expert_id_lm = expert_id_lm.reshape([gate_logits_lm.shape[0], -1])
+        group_size = gate_logits_lm.shape[-1]
+        scale = paddle.arange(0, top_k * group_size, group_size).unsqueeze(0)
+        expert_id_lm = expert_id_lm + scale
+    else:
+        prob_lm = self.gate.act(gate_logits_lm)
+        weight_lm, expert_id_lm = prob_lm.topk(k=top_k, axis=-1)
+    if token_type_ids is not None:
+        expert_id_lm = shift_ids(expert_id_lm, 0)
+    expert_id_lm.stop_gradient = True
+    lm_weight_and_expert_id = paddle.concat(
+        [weight_lm, expert_id_lm.astype("float32")], -1
+    )
+    if token_type_ids is None:
+        return (
+            lm_weight_and_expert_id,
+            prob_lm.reshape([prob_lm.shape[0], -1]),
+            None,
+        )
+
+    prob_mm = self.gate.act(gate_logits_mm)
+    weight_mm, expert_id_mm = prob_mm.topk(k=top_k, axis=-1)
+
+    expert_id_mm = shift_ids(expert_id_mm, 1)
+    expert_id_mm.stop_gradient = True
+
+    mm_weight_and_expert_id = paddle.concat(
+        [weight_mm, expert_id_mm.astype("float32")], -1
+    )
+
+    token_type_ids_float = token_type_ids[:, None].astype("float32")
+    weight_and_expert = (
+        1 - token_type_ids_float
+    ) * lm_weight_and_expert_id + token_type_ids_float * mm_weight_and_expert_id
+    return weight_and_expert, prob_lm.reshape([prob_lm.shape[0], -1]), prob_mm
+
+
+def test_expand_modality_expert_id():
+    def expand_id_one(
+        expert_id,
+        num_expert_per_modality,
+        k,
+        group_size,
+        modality_offset,
+        is_group_expert,
+    ):
+        orig_shape = expert_id.shape
+        expert_id = expert_id.reshape([-1])
+        xid = paddle.arange(len(expert_id))
+        if is_group_expert:
+            eid = xid % k
+            expert_id += eid * group_size
+
+        rank = expert_id // num_expert_per_modality
+        expert_id_in_rank = expert_id % num_expert_per_modality
+        ret = (
+            rank * (num_expert_per_modality * 2)
+            + expert_id_in_rank
+            + modality_offset * num_expert_per_modality
+        )
+        return ret.reshape(orig_shape)
+
+    S, E, k = 100, 24, 3
+    expert_id_mm = paddle.randint(0, 12, shape=[S, k])
+    num_expert_per_rank_per_modality = E // 2 // 4
+    group_size = E // 2 // k
+    print(f"num_expert_per_rank_per_modality: {num_expert_per_rank_per_modality}")
+    fused = expand_modality_expert_id(
+        expert_id_mm, num_expert_per_rank_per_modality, group_size, 1, True
+    )
+
+    nonfused = expand_id_one(
+        expert_id_mm, num_expert_per_rank_per_modality, k, group_size, 1, True
+    )
+    # num_expert_per_rank_per_modality, group_size
+    assert (fused == nonfused).all().item()
+
+    Config = namedtuple("Config", ["moe_world_size"])
+    Self = 
namedtuple( + "Self", + [ + "config", + "k", + "gate", + "group_experts", + "moe_statics", + "use_correction_bias", + ], + ) + Gate = namedtuple("Gate", ["act"]) + fake_gate = Gate(act=partial(F.softmax, axis=-1)) + fake_self = Self( + config=Config( + moe_world_size=8, + ), + k=k, + gate=fake_gate, + moe_statics=None, + use_correction_bias=False, + group_experts=True, + ) + + fake_logits = paddle.randn([S, E]) + fake_logits_mm = paddle.randn([S, E]) + token_type_ids = paddle.randint(0, 2, shape=[S]) + w_and_e, prob_lm, prob_mm = MOEAllGatherLayerV2.fused_gate_logits_process_fused( + fake_self, fake_logits, fake_logits_mm, None + ) + w_and_e_ref, prob_lm_ref, prob_mm_ref = fused_gate_logits_process_ref( + fake_self, fake_logits, fake_logits_mm, None + ) + assert (prob_lm == prob_lm_ref).all().item() + assert (w_and_e == w_and_e_ref).all().item() + w, e = w_and_e_ref.chunk(2, axis=-1) + + +class Test_expand_modality_expert_id_API(unittest.TestCase): + def test_dygraph(self): + test_expand_modality_expert_id() + + +if __name__ == "__main__": + + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py new file mode 100644 index 00000000000..ca0a780e908 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
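+#
+# Illustrative note (not part of the original test): the reference below
+# implements RMSNorm,
+#     y = x / sqrt(mean(x**2, axis=-1) + eps) * scale
+# and also returns invvar = 1 / sqrt(mean(x**2, axis=-1) + eps), which is what
+# fused_rms_norm_ext is checked against in both forward and backward passes.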
+ +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional import fused_rms_norm_ext + + +class TestFusedRMSNorm(unittest.TestCase): + def setUp(self): + paddle.seed(2023) + np.random.seed(2023) + + def rms_norm_reference(self, x, scale, bias=None, epsilon=1e-5): + variance = paddle.mean(paddle.square(x), axis=-1, keepdim=True) + + rms = paddle.sqrt(variance + epsilon) + y = x / rms + y = y * scale.reshape([1, -1]) + if bias is not None: + y = y + bias.reshape([1, -1]) + return y, (1.0 / rms).squeeze(-1) + + def test_2d_input(self): + rows, cols = 32, 64 + x = paddle.randn([rows, cols]) + scale = paddle.randn([cols]) + y_fused, invvar_fused = fused_rms_norm_ext(x, scale) + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + + np.testing.assert_allclose(y_fused, y_ref, rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(invvar_fused, invvar_ref, rtol=1e-5, atol=1e-5) + + def test_without_bias(self): + + rows, cols = 32, 64 + x = paddle.randn([rows, cols]) + scale = paddle.randn([cols]) + + y_fused, invvar_fused = fused_rms_norm_ext(x, scale) + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + + np.testing.assert_allclose(y_fused, y_ref, rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(invvar_fused, invvar_ref, rtol=1e-5, atol=1e-5) + + def test_backward(self): + + rows, cols = 16, 32 + x = paddle.randn([rows, cols], dtype="float32") + x.stop_gradient = False + scale = paddle.randn([cols], dtype="float32") + scale.stop_gradient = False + + y_fused, invvar = fused_rms_norm_ext(x, scale) + + loss = paddle.mean(y_fused) + loss.backward() + + x_grad_fused = x.grad.clone() + scale_grad_fused = scale.grad.clone() + + x.clear_gradient() + scale.clear_gradient() + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + loss_ref = paddle.mean(y_ref) + loss_ref.backward() + + x_grad_ref = x.grad + scale_grad_ref = scale.grad + + np.testing.assert_allclose(x_grad_fused, x_grad_ref, rtol=1e-4, atol=1e-4) + np.testing.assert_allclose( + scale_grad_fused, scale_grad_ref, rtol=1e-4, atol=1e-4 + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py new file mode 100644 index 00000000000..23df4e3457b --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py @@ -0,0 +1,193 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
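+
+# moe_combine mixes gathered expert outputs back into token order; the
+# reference `combining` below computes, for each token s:
+#     y[s] = sum_k combine_weights[s, k] * x[scatter_index[s, k]]
+# e.g. with scatter_index[s] = [2, 5] and combine_weights[s] = [0.7, 0.3],
+# y[s] = 0.7 * x[2] + 0.3 * x[5].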
+
+import os
+import random
+import unittest
+
+import numpy as np
+from ernie_utils.moe_layer_uneven import GateCombine
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import moe_combine
+
+os.environ["FLAGS_flash_attn_version"] = "v1"
+os.environ["FLAGS_cudnn_deterministic"] = "1"
+os.environ["FLAGS_embedding_deterministic"] = "1"
+
+
+def combining(x, combine_weights, scatter_index, hard_gate=False):
+    """
+    Args:
+        x: Tensor[seq, dim]
+        combine_weights: [seq, k]
+        scatter_index: ** [seq, k] **
+
+    Returns:
+        y: Tensor[s, dim]
+    """
+    x_gathered = F.embedding(scatter_index, x)  # [s,k,dim]
+    if hard_gate:
+        return x_gathered.squeeze(-2)
+    # logger.info(f'combining: {combine_weights}')
+    y = (combine_weights.unsqueeze(-1) * x_gathered).sum(1)
+    # y = paddle.matmul(combine_weights.unsqueeze(1), x_gathered).squeeze()  # [s,1,k] @ [s,k,dim] -> [s,1,dim]
+    return y
+
+
+def baseline_result(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy):
+    """Baseline result computed with the unfused reference `combining`."""
+    scatter_index = paddle.to_tensor(scatter_index_numpy)
+    x = paddle.to_tensor(x_numpy).cast("float32")
+    x.stop_gradient = False
+
+    combine_weights = paddle.to_tensor(combine_weights_numpy).cast("float32")
+    combine_weights.stop_gradient = False
+
+    scatter_index = paddle.to_tensor(scatter_index_numpy)
+    grad = paddle.to_tensor(grad_numpy).cast("float32")
+
+    y = combining(x, combine_weights, scatter_index)
+    paddle.autograd.backward([y], [grad], True)
+    return [x.grad, combine_weights.grad, y]
+
+
+def test_moe_combine(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy):
+    """Fused result computed via GateCombine / moe_combine."""
+    x = paddle.to_tensor(x_numpy).cast("float32")
+    x.stop_gradient = False
+
+    combine_weights = paddle.to_tensor(combine_weights_numpy).cast("float32")
+    combine_weights.stop_gradient = False
+
+    scatter_index = paddle.to_tensor(scatter_index_numpy).cast("int32")
+    grad = paddle.to_tensor(grad_numpy).cast("float32")
+
+    y = GateCombine.apply(x, combine_weights, scatter_index)
+    paddle.autograd.backward([y], [grad], True)
+    # grad.backward()
+    return [x.grad, combine_weights.grad, y]
+
+
+def gen_test_case(S, K, Dim, capacity_factor, seed=1234):
+    """gen_test_case"""
+    random.seed(seed)
+    np.random.seed(seed)
+    paddle.seed(seed)
+    x_numpy = np.random.rand(int(S * capacity_factor), Dim).astype(np.float32)
+    combine_weights_numpy = np.random.rand(S, K).astype(np.float32)
+    scatter_index_numpy = np.random.permutation(max(x_numpy.shape[0], S * K))[
+        : S * K
+    ].astype("int64")
+    scatter_index_numpy = scatter_index_numpy.reshape([S, K])
+
+    combine_weights_numpy[scatter_index_numpy >= x_numpy.shape[0]] = 0
+    scatter_index_numpy[scatter_index_numpy >= x_numpy.shape[0]] = 0
+    grad_numpy = np.random.randn(S, Dim).astype(np.float32)
+    return x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy
+
+
+def testing(test_case):
+    """testing"""
+    [bl_x_grad, bl_combine_weights_grad, bl_y] = baseline_result(*test_case)
+    [fused_x_grad, fused_combine_weights_grad, fused_y] = test_moe_combine(*test_case)
+    np.testing.assert_allclose(
+        fused_y.astype("float32").numpy(),
+        bl_y.astype("float32").numpy(),
+        err_msg="fwd precision not pass",
+        rtol=1e-6,
+    )
+    np.testing.assert_allclose(
+        fused_x_grad.astype("float32").numpy(),
+        bl_x_grad.astype("float32").numpy(),
+        rtol=1e-6,
+        err_msg="bwd grad precision not pass",
+    )
+    np.testing.assert_allclose(
+        fused_combine_weights_grad.astype("float32").numpy(),
+        bl_combine_weights_grad.astype("float32").numpy(),
+        rtol=1e-6,
+    )
+
+
+class TestFused(unittest.TestCase):
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_lt_2(
+        self,
+    ):
+        """
+        Check numerical parity between the fused op and the baseline.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=1.8))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_eq_2(
+        self,
+    ):
+        """
+        Check numerical parity between the fused op and the baseline.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_gt_2(
+        self,
+    ):
+        """
+        Check numerical parity between the fused op and the baseline.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2.2))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_k_gt_2(
+        self,
+    ):
+        """
+        Check numerical parity between the fused op and the baseline.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=8, Dim=4096, capacity_factor=2))
+
+
+if __name__ == "__main__":
+
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
new file mode 100644
index 00000000000..4c209970629
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
@@ -0,0 +1,218 @@
+# ruff: noqa: C419
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
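+
+# moe_gate_dispatch_partial_nosoftmaxtopk dispatches tokens only to the experts
+# in [expert_start_index, expert_end_index); the tests below run it once per
+# expert shard and check that concatenating the per-shard outputs (and summing
+# the per-shard combine weights) reproduces a single full moe_gate_dispatch call.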
+ +import unittest + +import paddle +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_partial_nosoftmaxtopk, +) + + +def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(): + + s, d, e = 4, 100, 8 + k, cap = 4, 3 + local_expert_num = 2 + + # x = paddle.randn([s, d]) + # gate_logits = paddle.randn([s, e]) + x = paddle.arange(1, s + 1).unsqueeze(-1).expand([s, d]).astype("bfloat16") + x_ = x.clone().detach() + + t = ( + (paddle.arange(0, e)).unsqueeze(0) + paddle.arange(0, -s, -1).unsqueeze(-1) + ) % e + gate_logits = (1 / (t + 1)).astype("float32") + # gate_logits = F.softmax(paddle.randn([s,e]),-1).astype('float32') + gate_logits_ = gate_logits.clone().detach() + s = x.shape[0] + d = x.shape[1] + e = gate_logits.shape[1] + x.stop_gradient = False + x_.stop_gradient = False + gate_logits.stop_gradient = False + gate_logits_.stop_gradient = False + print(f"gate_logits:{gate_logits}") + + def check_ascend(index_rev, chunks): + for idx in index_rev.split(chunks.tolist()): + if len(idx) > 2: + assert (paddle.diff(idx) >= 0).all(), (index_rev,) + + ys, comm, scatter_idx = [], [], [] + for ilocal_expert in range(0, e, local_expert_num): + combine_weihgts, expert_id = gate_logits.topk(k=k, axis=1) + ( + y, + combine_weihgts, + scatter_index, + scatter_index_rev, + expert_offset, + expert_num_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, + combine_weihgts, + expert_id.astype("int32"), + k=k, + capacity=cap, + num_experts=gate_logits.shape[-1], + use_pad=False, + expert_start_index=ilocal_expert, + expert_end_index=ilocal_expert + local_expert_num, # k # cap + reverse_token_drop=False, + ) + check_ascend(scatter_index_rev, expert_num_local) + print(f"y:{y.mean(-1)}") + print(f"combine_weihgts:{combine_weihgts}") + print(f"expert_num_local:{expert_num_local}") + print(f"scatter_index:{scatter_index.transpose([1,0])}") + print(f"scatter_index_rev:{scatter_index_rev}") + + ys.append(y) + comm.append(combine_weihgts) + scatter_idx.append(scatter_index) + + comm_sum = paddle.stack(comm).sum(0) + ys_sum = paddle.concat(ys) + + ( + y_, + combine_weihgts_, + scatter_index_, + expert_offset_, + expert_id_, + ) = moe_gate_dispatch( + x_, + gate_logits_, + None, + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + valid_y = y_.sum(-1) > 0.0 + y_2 = y_[valid_y].squeeze() + + print( + f""" + y: {ys_sum.astype("float32").mean(axis=-1)} + y_: {y_2.astype("float32").mean(axis=-1)} + + comm-weight: {comm_sum} + comm-weight_: {combine_weihgts_} + + expert_id:{expert_id} + scatter_index:{scatter_index} + scatter_index_rev: {scatter_index_rev} + expert_num_global:{expert_offset} + expert_num_local:{expert_num_local} + """ + ) + + print("<<< begin backward>>>") + + assert combine_weihgts_.shape == combine_weihgts.shape, ( + combine_weihgts_.shape, + combine_weihgts.shape, + ) + + dysum, dcombine_weights_sum = paddle.ones_like(ys_sum), paddle.randn( + comm_sum.shape + ).astype(comm_sum.dtype) + dy_, dcombine_weights_ = paddle.ones_like(y_), paddle.ones_like(combine_weihgts_) + dy_[~valid_y] = 0 + + y_shapes = [len(y) for y in ys] + for dyy, yy, commm in zip( + paddle.split(dysum, y_shapes), + ys, + comm, + ): + print(f"dyy:{dyy.shape}, {yy.shape} {commm.shape}") + paddle.autograd.backward([yy, commm], [dyy, dcombine_weights_sum]) + print(x.grad.astype("float32").mean(axis=-1)) + print(f"bwd original:{y_.shape} {dy_.shape}") + paddle.autograd.backward([y_, combine_weihgts_], [dy_, dcombine_weights_]) + + print(x_.grad.astype("float32").mean(axis=-1)) + + print( + f""" + 
x: {x.grad.astype('float32').mean(axis=-1)} + x_: {x_.grad.astype('float32').mean(axis=-1)} + """ + ) + + +def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(): + + S, E, D = 3, 4, 3 + k = 2 + capacity = 2 + x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16") + cw = paddle.randn([S, k]) + eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32") # 1 # 2 # 3 + ( + y, + cw_, + idx, + idx_rev, + num_ex_global, + num_ex_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, cw, eid, k, capacity, E, False, 0, 2, reverse_token_drop=True + ) + + y0, y1 = y.split([i for i in num_ex_local.tolist() if i > 0]) + assert y0[:, 0].astype("int32").tolist() == [2, 3], y0[:, 0] + assert y1[:, 0].astype("int32").tolist() == [1, 2] + + +def test_moe_ops_partial_nosoftmax_topk_empty_output(): + + S, E, D = 3, 4, 3 + k = 2 + capacity = 2 + x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16") + cw = paddle.randn([S, k]) + paddle.device.synchronize() + eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32") # 1 # 2 # 3 + ( + y, + cw_, + idx, + idx_rev, + num_ex_global, + num_ex_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, cw, eid, k, capacity, E, False, 3, 4, reverse_token_drop=True + ) + assert all([i == 0 for i in num_ex_local.tolist()]), num_ex_local + + +class TestAddition(unittest.TestCase): + def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(self): + test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op() + + def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(self): + test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop() + + def test_moe_ops_partial_nosoftmax_topk_empty_output(self): + test_moe_ops_partial_nosoftmax_topk_empty_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py new file mode 100644 index 00000000000..19752abd904 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py @@ -0,0 +1,207 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
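+
+# moe_gate_dispatch_permute is expected to agree with plain moe_gate_dispatch
+# up to a layout change: get_stage_input_list below rearranges the plain
+# dispatch output into per-stage groups (stages = num_experts // world_size)
+# before the md5sum comparison of the two results.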
+
+import os
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle.incubate.nn.functional import (
+    moe_gate_dispatch,
+    moe_gate_dispatch_permute,
+)
+
+os.environ["FLAGS_flash_attn_version"] = "v1"
+os.environ["FLAGS_cudnn_deterministic"] = "1"
+os.environ["FLAGS_embedding_deterministic"] = "1"
+
+
+class TestFused(unittest.TestCase):
+    def test_moe_ops(self):
+        """
+        Test `moe-ops` with a routing bias.
+        """
+        S, E, D = 8192, 64, 128
+        k = 4
+        x = paddle.randn([S, D], dtype="bfloat16")
+        gate_logits = paddle.randn([S, E], dtype="float32")
+        x_ = x.clone()
+        gate_logits_ = gate_logits.clone()
+        x.stop_gradient = True
+        x_.stop_gradient = True
+        gate_logits.stop_gradient = True
+        gate_logits_.stop_gradient = True
+        bias = paddle.zeros([E], dtype="float32")
+        cap = 512
+
+        (
+            y,
+            combine_weights,
+            scatter_index,
+            expert_offset_,
+            expert_id_,
+        ) = moe_gate_dispatch(
+            x,
+            gate_logits,
+            None,
+            k=k,
+            capacity=cap,
+            use_pad=True,  # k # cap
+        )
+
+        (
+            y_,
+            combine_weights_,
+            scatter_index_,
+            expert_offset_,
+            expert_id_,
+        ) = moe_gate_dispatch(
+            x_,
+            gate_logits_,
+            bias + 1,  # a uniform +1 on every expert must not change the routing result
+            k=k,
+            capacity=cap,
+            use_pad=True,  # k # cap
+        )
+        bias_unbalanced = bias.clone()
+        bias_unbalanced[0] += 1
+        (
+            y__,
+            combine_weights__,
+            scatter_index__,
+            expert_offset__,
+            expert_id__,
+        ) = moe_gate_dispatch(
+            x_,
+            gate_logits_,
+            bias_unbalanced,
+            k=k,
+            capacity=cap,
+            use_pad=True,  # k # cap
+        )
+        np.testing.assert_equal(
+            y.astype("float32").numpy(),
+            y_.astype("float32").numpy(),
+            err_msg="incubate w bias not match",
+        )
+        # the bias must not affect the combine probabilities
+        np.testing.assert_equal(
+            combine_weights.astype("float32").numpy(),
+            combine_weights_.astype("float32").numpy(),
+            err_msg="incubate w bias not match",
+        )
+        np.testing.assert_(
+            (y.astype("float32").numpy() != y__.astype("float32").numpy()).any(),
+        )
+
+
+class TestDispatchPermute(unittest.TestCase):
+    def get_detached_input(self, input, prob):
+        ret_input = input.detach()
+        ret_prob = prob.detach()
+        ret_input.stop_gradient = input.stop_gradient
+        ret_prob.stop_gradient = prob.stop_gradient
+        return ret_input, ret_prob
+
+    def get_stage_input_list(self, x, world_size, stage):
+        print(world_size, stage, x.shape)
+        x = x.reshape([world_size * stage, -1, x.shape[-1]])
+        stage_input_list = []
+        x_list = paddle.split(x, num_or_sections=(world_size * stage), axis=0)
+        for stage_id in range(stage):
+            stage_input_list.append(
+                paddle.unsqueeze(paddle.concat(x_list[stage_id::stage], axis=0), axis=0)
+            )
+        stage_input_list = paddle.concat(stage_input_list, axis=0)
+        return stage_input_list
+
+    def test_moe_permute_ops(self):
+        paddle.seed(2025)
+
+        test_cases = [
+            (8, 4, 2),
+            (64, 16, 32),
+            (1024, 1024, 1024),
+            (8, 2, 4),
+            (4096, 4096, 4096),
+        ]
+        cases = list(zip(*test_cases))
+        for _, case in enumerate(cases):
+            world_size, num_experts, num_tokens, k, hidden_size = case
+            capacity = num_tokens // k
+            stages = num_experts // world_size
+
+            input = paddle.randn([num_tokens, hidden_size], dtype="float32")
+            prob_logits = paddle.randn([num_tokens, num_experts], dtype="float32")
+            prob = F.softmax(prob_logits, axis=-1)
+            input.stop_gradient = False
+            prob.stop_gradient = False
+
+            compat_args = (None,)
+
+            ref_input, ref_prob = self.get_detached_input(input, prob)
+            (
+                ref_dispatched_input,
+                ref_combine_weights_unnorm,
+                ref_scatter_index,
+                ref_dispatch_mask,
+                _,
+            ) = moe_gate_dispatch(
+                ref_input,
+                ref_prob,
+                *compat_args,
+                k=k,
+                capacity=capacity,
+                use_pad=True,
+            )
+
+            
ref_stage_input_list = self.get_stage_input_list( + ref_dispatched_input, world_size, stages + ) + + test_input, test_prob = self.get_detached_input(input, prob) + ( + test_dispatched_input, + test_combine_weights_unnorm, + test_scatter_index, + test_dispatch_mask, + _, + ) = moe_gate_dispatch_permute( + test_input, + test_prob, + *compat_args, + k=k, + capacity=capacity, + world_size=world_size, + ) + + np.testing.assert_equal( + test_dispatched_input.shape, + ref_stage_input_list.shape, + err_msg="moe_permute_ops not match", + ) + np.testing.assert_equal( + test_dispatched_input._md5sum(), + ref_stage_input_list._md5sum(), + err_msg="moe_permute_ops not match", + ) + + +if __name__ == "__main__": + + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py new file mode 100644 index 00000000000..14991becc47 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py @@ -0,0 +1,175 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_permute, +) + +batch_size = 4 +hidden_size = 2 +k = 16 +capacity = 2 +num_experts = 16 + +world_size = 2 + + +class TestLayer(paddle.nn.Layer): + def forward(self, x, gate_prob, k, capacity): + y, combine_weights, scatter_index, expert_offset, expert_id = moe_gate_dispatch( + x, gate_prob, None, k, capacity, True + ) + return y, combine_weights, scatter_index, expert_offset, expert_id + + +class TestLayerPermute(paddle.nn.Layer): + def forward(self, x, gate_prob, k, capacity): + ( + y, + combine_weights, + scatter_index, + expert_offset, + expert_id, + ) = moe_gate_dispatch_permute( + x, gate_prob, None, k, capacity, world_size=world_size + ) + return y, combine_weights, scatter_index, expert_offset, expert_id + + +def check_backward_correctness(layer_cls): + paddle.seed(1024) + + dtype = "bfloat16" + layer = layer_cls() + input = paddle.randn([batch_size, hidden_size]) + + gate_weight = paddle.randn([hidden_size, num_experts]) + logits = paddle.matmul(input, gate_weight) + gate_prob = F.softmax(logits, axis=-1) + print(f"gate_prob: {gate_prob}") + + input = paddle.cast(input, "bfloat16") + input.stop_gradient = False + gate_prob.stop_gradient = False + + output, combine_weights, scatter_index, expert_offset, expert_id = layer( + input, gate_prob, k, capacity + ) + + print(f"output: {output}") + print(f"combine_weights: {combine_weights}") + print(f"scatter_index: {scatter_index}") + print(f"expert_offset: {expert_offset}") + print(f"expert_id: {expert_id}") + + # output_g = paddle.randn(output.shape).astype(output.dtype) + # combine_weights_g = paddle.randn(combine_weights.shape).astype(combine_weights.dtype) + 
output_g = paddle.ones_like(output)
+    combine_weights_g = paddle.ones_like(combine_weights)
+    print(f"output_g: {output_g}")
+    print(f"combine_weights_g: {combine_weights_g}")
+
+    paddle.autograd.backward(
+        tensors=[output, combine_weights],
+        grad_tensors=[output_g, combine_weights_g],
+    )
+    # numerically estimate the gradient w.r.t. the input
+    epsilon = 0.005
+    input_numpy = input.detach().astype("float32").numpy()
+    num_grad = paddle.zeros_like(input)
+    flattened = num_grad.reshape([-1])
+
+    for i in range(input.numel()):
+        input_pos = input_numpy.copy()
+        input_neg = input_numpy.copy()
+        input_pos.flat[i] += epsilon
+        input_neg.flat[i] -= epsilon
+
+        output_pos, _, _, _, _ = layer(
+            paddle.to_tensor(input_pos), gate_prob, k, capacity
+        )
+        output_neg, _, _, _, _ = layer(
+            paddle.to_tensor(input_neg), gate_prob, k, capacity
+        )
+
+        """
+        flattened[i] = (output_pos.astype("float32").numpy() - output_neg.astype("float32").numpy()).sum() / (
+            2 * epsilon
+        )
+        """
+        grad_value = (output_pos - output_neg).sum() / (2 * epsilon)
+        flattened[i] = grad_value
+
+    flattened = flattened.reshape(input.shape)
+
+    print(f"input gradient: {input.grad}")
+    print(f"numerical gradient: {flattened}")
+    np.testing.assert_allclose(
+        input.grad.astype("float32").numpy(),
+        flattened.astype("float32").numpy(),
+        rtol=1e-5,
+        atol=0,
+    )
+
+    # numerically estimate the gradient w.r.t. gate_prob
+    epsilon = 0.0005
+    gate_prob_numpy = gate_prob.detach().astype("float32").numpy()
+    num_grad = paddle.zeros_like(gate_prob)
+    flattened = num_grad.reshape([-1])
+
+    for i in range(gate_prob.numel()):
+        input_pos = gate_prob_numpy.copy()
+        input_neg = gate_prob_numpy.copy()
+        input_pos.flat[i] += epsilon
+        input_neg.flat[i] -= epsilon
+
+        _, output_pos, _, _, _ = layer(input, paddle.to_tensor(input_pos), k, capacity)
+        _, output_neg, _, _, _ = layer(input, paddle.to_tensor(input_neg), k, capacity)
+
+        grad_value = paddle.to_tensor(
+            (output_pos.numpy() - output_neg.numpy()).sum() / (2 * epsilon)
+        )
+        flattened[i] = grad_value
+
+    flattened = flattened.reshape(gate_prob.shape)
+
+    print(f"gate_prob gradient: {gate_prob.grad}")
+    print(f"numerical gradient: {flattened}")
+    np.testing.assert_allclose(
+        gate_prob.grad.astype("float32").numpy(),
+        flattened.astype("float32").numpy(),
+        rtol=1e-4,
+        atol=0,
+    )
+
+
+class TestFused(unittest.TestCase):
+    def test_moe_backward(self):
+        check_backward_correctness(TestLayer)
+
+    def test_moe_permute_backward(self):
+        check_backward_correctness(TestLayerPermute)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm.py
new file mode 100644
index 00000000000..dbeaee31f6c
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_layer_norm.py
@@ -0,0 +1,358 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
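+
+# Reference computation used below: x is flattened to [N, D] over the dims
+# from begin_norm_axis, then
+#     mean = mean(x, axis=1),  var = var(x, axis=1) + epsilon
+#     y    = scale * (x - mean) / sqrt(var) + bias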
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+
+from operator import mul
+import paddle.base.core as core
+import paddle.nn.functional as F
+import paddle.base as base
+from functools import reduce
+from op_test import _set_use_system_allocator
+from paddle.static.amp.fp16_utils import (
+    _keep_layer_norm_scale_bias_to_fp32,
+)
+from paddle.pir_utils import OldIrGuard
+
+paddle.enable_static()
+
+np.random.seed(123)
+
+_set_use_system_allocator(True)
+
+
+def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1)
+    x.shape = [N, D]
+
+    mean = np.mean(x, axis=1)
+    var = np.var(x, axis=1) + epsilon
+    output = np.divide((x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1]))
+    if scale is not None:
+        output = scale.reshape([1, D]) * output
+    if beta is not None:
+        output = output + beta.reshape([1, D])
+
+    x.shape, output.shape = x_shape, x_shape
+    return output, mean, var
+
+
+def _reference_layer_norm_grad(x, grad_y, scale, bias, mean, var, begin_norm_axis=1):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1)
+
+    if scale is not None:
+        scale_shape = scale.shape
+        scale.shape = [1, D]
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
+
+    # d_bias
+    if bias is not None:
+        d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    else:
+        d_bias = None
+    # d_scale
+    if scale is not None:
+        d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y, axis=0).reshape(
+            [1, D]
+        )
+    else:
+        d_scale = None
+    # dx
+    if scale is not None:
+        dx_end = scale * np.sqrt(1.0 / var) * grad_y
+        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
+            [N, 1]
+        )  # the second part equals to zero.
+        d_mean = 1.0 / D * d_mean_0
+        d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape(
+            [N, 1]
+        ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
+    else:
+        dx_end = 1.0 * np.sqrt(1.0 / var) * grad_y
+        d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0, axis=1).reshape(
+            [N, 1]
+        )  # the second part equals to zero.
+ d_mean = 1.0 / D * d_mean_0 + d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * 1.0, axis=1).reshape( + [N, 1] + ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + + grad_x = dx_end + d_mean + d_std + + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape + var.shape, mean.shape = [N], [N] + + if scale is not None: + scale.shape = scale_shape + return grad_x, d_scale, d_bias + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + self.init_dtype() + self.place = paddle.CustomPlace("metax_gpu", 0) + self.__class__.use_custom_device = True + + def init_dtype(self): + self.dtype = np.float32 + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + np.testing.assert_allclose( + np.array(tensor), np_array, rtol=1e-4, atol=atol, err_msg=msg + ) + + def check_forward_backward( + self, + shape, + begin_norm_axis, + has_scale=True, + has_bias=True, + y_grad_scale=1.0, + use_mkldnn=False, + ): + def test_with_place(place, shape, begin_norm_axis, use_mkldnn=use_mkldnn): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(self.dtype) + scale = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_scale + else None + ) + bias = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_bias + else None + ) + y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype( + self.dtype + ) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, begin_norm_axis + ) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, bias, mean, variance, begin_norm_axis + ) + mean.shape = x_shape[0:begin_norm_axis] + variance.shape = x_shape[0:begin_norm_axis] + + var_dict = locals() + var_dict["y@GRAD"] = y_grad + var_names = ["x", "mean", "variance", "y", "y@GRAD"] + if has_scale: + var_names += ["scale"] + if has_bias: + var_names += ["bias"] + ground_truth = {name: var_dict[name] for name in var_names} + + with OldIrGuard(): + program = base.Program() + old_program_guard = base.program_guard + with old_program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, dtype=self.dtype, shape=ground_truth[name].shape + ) + inputs = {"X": block.var("x")} + fetch_list = [ + "y", + "mean", + "variance", + "x@GRAD", + ] + if has_scale: + inputs["Scale"] = block.var("scale") + fetch_list += ["scale@GRAD"] + if has_bias: + inputs["Bias"] = block.var("bias") + fetch_list += ["bias@GRAD"] + layer_norm_op = block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var("y"), + "Mean": block.var("mean"), # share the same memory + "Variance": block.var("variance"), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn, + }, + ) + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), [] + ) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) 
+ + program._sync_with_cpp() + exe = base.Executor(place) + with OldIrGuard(): + out = exe.run( + program, + feed={ + name: var_dict[name] + for name in ["x", "scale", "bias", "y@GRAD"] + }, + fetch_list=fetch_list, + ) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(variance, out[2], "variance", 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + if has_scale: + self.__assert_close( + scale_grad.reshape(-1), + out[fetch_list.index("scale@GRAD")], + "scale_grad", + 1e-3, + ) + if has_bias: + self.__assert_close( + bias_grad.reshape(-1), + out[fetch_list.index("bias@GRAD")], + "bias_grad", + ) + + test_with_place(self.place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=True + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=True, has_bias=False + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=False + ) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + self.check_forward_backward( + shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2) + self.check_forward_backward( + shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=True, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=True, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True + ) + + +class TestFP16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + weight_np = weight_np.astype(dtype) + bias_np = bias_np.astype(dtype) + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + y_np = y.numpy().astype("float32") + x_g_np = x_g.numpy().astype("float32") + w_g_np = w_g.numpy().astype("float16") + b_g_np = b_g.numpy().astype("float32") + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + paddle.set_device("metax_gpu") + x_np = np.random.random([10, 20]).astype("float16") + weight_np = np.random.random([20]).astype("float16") + bias_np = np.random.random([20]).astype("float16") + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, "float16" + ) + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, "float32" + ) + + def assert_equal(x, y): + np.testing.assert_allclose(x, y) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + +class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): + def test_main(self): + 
self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(False) + self.assertFalse(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(True) + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py new file mode 100644 index 00000000000..7545e16d14d --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py @@ -0,0 +1,395 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +from tests.op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size,)) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size,)) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestBmmOp(OpTest): + """ + case 0 + """ + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (10, 2, 5) + self.y_shape = (10, 5, 8) + + def init_kernel_type(self): + self.dtype = "float32" + + def setUp(self): + self.set_metax_gpu() + self.init_kernel_type() + self.config() + self.op_type = "bmm" + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y + result = reference_matmul(x, y) + result = result.astype(self.dtype) + self.inputs = { + "X": x, + "Y": y, + } + self.outputs = {"Out": result} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp1(TestBmmOp): + """ + case 1 + """ + + def config(self): + self.x_shape = (40, 10, 10) + self.y_shape = (40, 10, 10) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp2(TestBmmOp): + """ + case 2 + """ + + def config(self): + self.x_shape = (4, 10, 80) + self.y_shape = (4, 80, 1) + + def 
test_check_grad(self): + self.check_grad_with_place( + self.place, + ["X", "Y"], + "Out", + max_relative_error=1e-2, + ) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_metax_gpu() + self.op_type = "matmul_v2" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {"X": X, "Y": Y} + self.attrs = { + "trans_x": self.transpose_X, + "trans_y": self.transpose_Y, + "alpha": self.alpha, + } + self.outputs = {"Out": Out} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (100,) + self.y_shape = (100,) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100,) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100,) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 4, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class 
TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = 100 + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = 100 + self.transpose_X = False + self.transpose_Y = False + + +# TODO(metax_gpu): alpha will be supported in next version +# --------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +# --------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error + ) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py new file mode 100644 index 00000000000..c9bccd2abb3 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py @@ -0,0 +1,220 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from op_test import OpTest, convert_float_to_uint16
+
+import paddle
+from paddle import base
+from paddle.base import Program, program_guard
+
+
+def call_nonzero(x):
+    input = paddle.to_tensor(x)
+    return paddle.nonzero(x=input)
+
+
+class TestNonZeroAPI(unittest.TestCase):
+    def test_nonzero_api_as_tuple(self):
+        paddle.enable_static()
+        data = np.array([[1, 0], [0, 1]], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x, as_tuple=True)
+            self.assertEqual(type(y), tuple)
+            self.assertEqual(len(y), 2)
+            z = paddle.concat(list(y), axis=0)
+            exe = base.Executor(base.CPUPlace())
+
+            (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False)
+            expect_out = np.array([0, 1, 0, 1])
+            np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05)
+
+        data = np.array([1, 1, 0], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[-1], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x, as_tuple=True)
+            self.assertEqual(type(y), tuple)
+            self.assertEqual(len(y), 1)
+            z = paddle.concat(list(y), axis=0)
+            exe = base.Executor(base.CPUPlace())
+            (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False)
+            expect_out = np.array([0, 1])
+            np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05)
+
+        data = np.zeros([10, 3, 0], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[10, 3, 0], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x, as_tuple=True)
+            self.assertEqual(type(y), tuple)
+            self.assertEqual(len(y), 3)
+            expect_out = np.zeros([0])
+            for item in y:
+                np.testing.assert_array_equal(expect_out, item)
+
+    def test_nonzero_api(self):
+        paddle.enable_static()
+        data = np.array([[1, 0], [0, 1]], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x)
+            exe = base.Executor(base.CPUPlace())
+            (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False)
+            expect_out = np.array([[0, 0], [1, 1]])
+            np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05)
+
+        data = np.array([1, 1, 0], dtype="float32")
+        with program_guard(Program(), Program()):
+            x = paddle.static.data(name="x", shape=[-1], dtype="float32")
+            if not paddle.framework.use_pir_api():
+                x.desc.set_need_check_feed(False)
+            y = paddle.nonzero(x)
+            exe = base.Executor(base.CPUPlace())
+            (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False)
+            expect_out = np.array([[0], [1]])
+            np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05)
+
+    def test_dygraph_api(self):
+        data_x = np.array([[True, False], [False, True]])
+        with base.dygraph.guard():
+            x = paddle.to_tensor(data_x)
+            z = paddle.nonzero(x)
+            np_z = z.numpy()
+            expect_out = np.array([[0, 0], [1, 1]])
+            np.testing.assert_allclose(expect_out, np_z, rtol=1e-05)
+
+
+# Base case
+class TestNonzeroOp(OpTest):
+    def setUp(self):
+        """Test where_index op with random value"""
+        np.random.seed(2023)
+        self.op_type = "where_index"
+        self.python_api = 
call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [8, 8] + + def init_dtype(self): + self.dtype = np.float64 + + def create_inputs(self): + return {"Condition": np.random.randint(5, size=self.shape).astype(self.dtype)} + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestNonzeroComplex64Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex64 + + +class TestNonzeroComplex128Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex128 + + +class TestNonzeroFP32Op(TestNonzeroOp): + def init_shape(self): + self.shape = [2, 10, 2] + + def init_dtype(self): + self.dtype = np.float32 + + +class TestNonzeroFP16Op(TestNonzeroOp): + def init_shape(self): + self.shape = [3, 4, 7] + + def init_dtype(self): + self.dtype = np.float16 + + +class TestNonzeroBF16(OpTest): + def setUp(self): + """Test where_index op with bfloat16 dtype""" + np.random.seed(2023) + self.op_type = "where_index" + self.python_api = call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [12, 9] + + def init_dtype(self): + self.dtype = np.uint16 + + def create_inputs(self): + return { + "Condition": convert_float_to_uint16( + np.random.randint(5, size=self.shape).astype(np.float32) + ) + } + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestZeroSizeOp(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + +class TestZeroSizeOpCase2(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py new file mode 100644 index 00000000000..c1bc46517b6 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py @@ -0,0 +1,215 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
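+
+# p-norm formula used by the reference below, reduced along `axis`:
+#     ||x||_p = (sum_i |x_i| ** p) ** (1 / p)
+# with the usual conventions porder=inf -> max|x_i|, porder=-inf -> min|x_i|,
+# and porder=0 counting the nonzero entries (not a true norm).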
+
+import unittest
+import numpy as np
+
+import paddle
+from tests.op_test import OpTest
+
+paddle.enable_static()
+
+
+def p_norm(x, axis, porder, keepdims=False, reduce_all=False):
+    r = []
+    if axis is None or reduce_all:
+        x = x.flatten()
+        if porder == np.inf:
+            r = np.amax(np.abs(x), keepdims=keepdims)
+        elif porder == -np.inf:
+            r = np.amin(np.abs(x), keepdims=keepdims)
+        else:
+            r = np.linalg.norm(x, ord=porder, keepdims=keepdims)
+    elif isinstance(axis, (list, tuple)) and len(axis) == 2:
+        if porder == np.inf:
+            axis = tuple(axis)
+            r = np.amax(np.abs(x), axis=axis, keepdims=keepdims)
+        elif porder == -np.inf:
+            axis = tuple(axis)
+            r = np.amin(np.abs(x), axis=axis, keepdims=keepdims)
+        elif porder == 0:
+            axis = tuple(axis)
+            r = x.astype(bool)
+            r = np.sum(r, axis, keepdims=keepdims)
+        elif porder == 1:
+            axis = tuple(axis)
+            r = np.sum(np.abs(x), axis, keepdims=keepdims)
+        else:
+            axis = tuple(axis)
+            xp = np.power(np.abs(x), porder)
+            s = np.sum(xp, axis=axis, keepdims=keepdims)
+            r = np.power(s, 1.0 / porder)
+    else:
+        if isinstance(axis, list):
+            axis = tuple(axis)
+        r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims)
+    r = r.astype(x.dtype)
+
+    return r
+
+
+class TestPnormOp(OpTest):
+    def set_metax_gpu(self):
+        self.__class__.use_custom_device = True
+
+    def setUp(self):
+        self.set_metax_gpu()
+        self.op_type = "p_norm"
+        self.init_test_case()
+        x = (np.random.random(self.shape) + 0.5).astype(self.dtype)
+        norm = p_norm(x, self.axis, self.porder, self.keepdim)
+        self.inputs = {"X": x}
+        self.attrs = {
+            "epsilon": self.epsilon,
+            "axis": self.axis,
+            "keepdim": self.keepdim,
+            "porder": float(self.porder),
+        }
+        self.outputs = {"Out": norm}
+        self.gradient = self.calc_gradient()
+
+    def test_check_output(self):
+        if self.dtype == "float16":
+            self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0), atol=5e-3)
+        else:
+            self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0))
+
+    def test_check_grad(self):
+        self.check_grad_with_place(
+            paddle.CustomPlace("metax_gpu", 0),
+            ["X"],
+            "Out",
+            user_defined_grads=self.gradient,
+        )
+
+    def init_test_case(self):
+        self.shape = [2, 3, 4, 5]
+        self.axis = 1
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = False
+        self.init_dtype()
+
+    def init_dtype(self):
+        self.dtype = "float32"
+
+    def calc_gradient(self):
+        self.attrs = {
+            "epsilon": self.epsilon,
+            "axis": self.axis,
+            "keepdim": self.keepdim,
+            "porder": float(self.porder),
+        }
+        x = self.inputs["X"]
+        porder = self.attrs["porder"]
+        axis = self.attrs["axis"]
+        if porder == 0:
+            grad = np.zeros(x.shape).astype(x.dtype)
+        elif porder in [float("inf"), float("-inf")]:
+            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            x_abs = np.abs(x)
+            grad = np.sign(x)
+            grad[x_abs != norm] = 0.0
+        else:
+            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            grad = (
+                np.power(norm, 1 - porder)
+                * np.power(np.abs(x), porder - 1)
+                * np.sign(x)
+            )
+
+        numel = 1
+        for s in x.shape:
+            numel *= s
+        numel /= x.shape[axis]
+        return [grad.astype(x.dtype) * 1 / numel]
+
+
+class TestPnormOp2(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = 2
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = True
+        self.init_dtype()
+
+
+# class TestPnormOp3(TestPnormOp):
+#     def init_test_case(self):
+#         self.shape = [3, 20, 3]
+#         self.axis = 2
+#         self.epsilon = 1e-12
+#         self.porder = np.inf
+#         self.keepdim = True
+#         self.init_dtype()
+
+
+# class TestPnormOp4(TestPnormOp3):
+#     def init_test_case(self):
+# 
self.shape = [3, 20, 3] +# self.axis = 2 +# self.epsilon = 1e-12 +# self.porder = -np.inf +# self.keepdim = True +# self.init_dtype() + + +class TestPnormOp5(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 0 + self.keepdim = True + self.init_dtype() + + +# class TestPnormOp6(TestPnormOp): +# def init_test_case(self): +# self.shape = [2, 3, 4, 5] +# self.axis = 1 +# self.epsilon = 1e-12 +# self.porder = 0.5 +# self.keepdim = False +# self.init_dtype() + + +class TestPnormOpfp16(TestPnormOp): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp2fp16(TestPnormOp2): + def init_dtype(self): + self.dtype = "float16" + + +# class TestPnormOp3fp16(TestPnormOp3): +# def init_dtype(self): +# self.dtype = "float16" + + +# class TestPnormOp4fp16(TestPnormOp4): +# def init_dtype(self): +# self.dtype = "float16" + + +class TestPnormOp5fp16(TestPnormOp5): + def init_dtype(self): + self.dtype = "float16" + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py new file mode 100644 index 00000000000..c67e807397c --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py @@ -0,0 +1,125 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +# import sys + +# sys.path.append("..") + +import numpy as np + +import paddle +from tests.op_test import OpTest + +paddle.enable_static() + + +# Correct: General. 
+class TestSqueezeOp(OpTest):
+    def setUp(self):
+        self.op_type = "squeeze2"
+        self.init_test_case()
+        self.set_metax_gpu()
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")}
+        self.init_attrs()
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+        }
+
+    def set_metax_gpu(self):
+        self.__class__.use_custom_device = True
+        self.place = paddle.CustomPlace("metax_gpu", 0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ["X"], "Out")
+
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, 2)
+        self.new_shape = (3, 40)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes}
+
+
+# class TestSqueezeBF16Op(OpTest):
+#     def setUp(self):
+#         self.op_type = "squeeze2"
+#         self.dtype = np.uint16
+#         self.init_test_case()
+#         self.set_metax_gpu()
+#         x = np.random.random(self.ori_shape).astype("float32")
+#         out = x.reshape(self.new_shape)
+#         self.inputs = {"X": convert_float_to_uint16(x)}
+#         self.init_attrs()
+#         self.outputs = {"Out": convert_float_to_uint16(out)}
+
+#     def set_metax_gpu(self):
+#         self.__class__.use_custom_device = True
+#         self.place = paddle.CustomPlace("metax_gpu", 0)
+
+#     def test_check_output(self):
+#         self.check_output()
+
+#     def test_check_grad(self):
+#         self.check_grad(["X"], "Out")
+
+#     def init_test_case(self):
+#         self.ori_shape = (1, 3, 1, 40)
+#         self.axes = (0, 2)
+#         self.new_shape = (3, 40)
+
+#     def init_attrs(self):
+#         self.attrs = {"axes": self.axes}
+
+
+# Correct: There is a negative axis.
+class TestSqueezeOp1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, -2)
+        self.new_shape = (3, 40)
+
+
+# Correct: No axes input.
+class TestSqueezeOp2(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 20, 1, 5)
+        self.axes = ()
+        self.new_shape = (20, 5)
+
+
+# Correct: Just part of the axes are squeezed.
+class TestSqueezeOp3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (6, 5, 1, 4)
+
+
+# Correct: The dimension at an axis whose size is not 1 remains unchanged.
+class TestSqueezeOp4(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, 2)
+        self.new_shape = (6, 5, 1, 4, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py
new file mode 100644
index 00000000000..40e46e70a21
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py
@@ -0,0 +1,295 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
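+
+# swiglu(x, y) = silu(x) * y = x * sigmoid(x) * y. The reference below
+# builds exactly this from separate paddle ops (F.silu(x) * y) so the fused
+# kernel can be checked against it. A minimal sketch of the same math, for
+# illustration only (the helper name is ours, not part of the API under test):
+#
+#     def swiglu_ref(x, y):
+#         return x / (1.0 + np.exp(-x)) * y  # silu(x) = x / (1 + exp(-x))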
+ +import unittest + +import numpy as np +from op_test import OpTest + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import _C_ops +from paddle.base import core +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.incubate.nn.functional import swiglu as fused_swiglu_impl + + +def swiglu(x, y, out_grad): + if isinstance(x, np.ndarray): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + out_grad = paddle.to_tensor(out_grad) + + origin_x = x.detach().clone() + origin_x.stop_gradient = False + x = origin_x + + origin_y = y.detach().clone() + origin_y.stop_gradient = False + y = origin_y + + dtype = x.dtype + need_convert = False + assert dtype == y.dtype + output_dtype = dtype + if paddle.is_compiled_with_cuda(): + if dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + x = x.astype(output_dtype) + y = y.astype(output_dtype) + need_convert = True + + out = F.silu(x) * y + if need_convert: + out = out.astype(dtype) + out.backward(out_grad) + ret = [ + out.astype(output_dtype), + origin_x.grad.astype(output_dtype), + origin_y.grad.astype(output_dtype), + ] + return ret + + +def fused_swiglu(x, y, out_grad): + x = x.detach().clone() + x.stop_gradient = False + if y is not None: + y = y.detach().clone() + y.stop_gradient = False + out = fused_swiglu_impl(x, y) + out.backward(out_grad) + + output_dtype = x.dtype + if paddle.is_compiled_with_cuda(): + if x.dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + ret = [ + out.astype(output_dtype), + ] + if y is not None: + x_grad, y_grad = x.grad, y.grad + else: + x_grad, y_grad = paddle.split(x.grad, 2, axis=-1) + + ret.append(x_grad.astype(output_dtype)) + ret.append(y_grad.astype(output_dtype)) + return ret + + +tol_map = { + paddle.float64: [1e-8, 1e-8], + paddle.float32: [1e-6, 1e-6], + paddle.float16: [1e-3, 1e-3], + paddle.bfloat16: [1e-3, 1e-3], +} + + +class TestSwiGLUDygraph(unittest.TestCase): + def check_dygraph_impl(self, device, shape, dtype): + x = paddle.randn(shape, dtype=dtype) + y = paddle.randn(shape, dtype=dtype) + out_grad = paddle.randn(shape, dtype=dtype) + + ret1 = swiglu(x, y, out_grad) + ret2 = fused_swiglu(x, y, out_grad) + ret3 = fused_swiglu(paddle.concat([x, y], axis=-1), None, out_grad) + + atol, rtol = tol_map[dtype] + err_msg = f"Failed when device = {device}, dtype = {dtype}, shape = {shape}" + for t1, t2, t3 in zip(ret1, ret2, ret3): + t1, t2, t3 = t1.numpy(), t2.numpy(), t3.numpy() + np.testing.assert_allclose(t1, t2, atol=atol, rtol=rtol, err_msg=err_msg) + np.testing.assert_equal(t2, t3, err_msg=err_msg) + + def check_dygraph(self, shape): + metas = [("cpu", paddle.float32), ("cpu", paddle.float64)] + if paddle.is_compiled_with_cuda(): + metas.append(("gpu", paddle.float32)) + metas.append(("gpu", paddle.float64)) + metas.append(("gpu", paddle.float16)) + prop = paddle.device.cuda.get_device_properties() + if prop.major >= 8: + metas.append(("gpu", paddle.bfloat16)) + + for device, dtype in metas: + origin_device = paddle.get_device() + paddle.set_device(device) + for with_split in [True]: + self.check_dygraph_impl(device, shape, dtype) + paddle.set_device(origin_device) + + def check_static_graph(self, shape, dtype="float32"): + x = paddle.static.data(name="x", shape=shape, dtype=dtype) + y = paddle.static.data(name="y", shape=shape, dtype=dtype) + concated_x = paddle.static.data( + name="concated_x", + shape=[*shape[:-1], shape[-1] * 2], + 
dtype=dtype, + ) + out1 = fused_swiglu_impl(x, y) + out2 = fused_swiglu_impl(concated_x) + + concated_x_np = np.random.random(concated_x.shape).astype(dtype) + x_np, y_np = np.split(concated_x_np, 2, axis=-1) + + exe = paddle.static.Executor() + t1, t2 = exe.run( + feed={"x": x_np, "y": y_np, "concated_x": concated_x_np}, + fetch_list=[out1, out2], + ) + np.testing.assert_equal(t1, t2) + + def check_main(self, shape): + self.check_dygraph(shape) + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + self.check_static_graph(shape) + paddle.disable_static() + + def test_main(self): + self.check_main([8, 100]) + self.check_main([4, 101]) + + +class TestSwigluOp(OpTest): + def config(self): + self.x_shape = (8, 128) + self.check_auto_parallel = True + + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + y = np.random.uniform(-1, 1, self.x_shape).astype("float64") + out_grad = np.random.uniform(-1, 1, self.x_shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + def test_check_output(self): + self.check_output(check_prim_pir=True) + + def test_check_grad(self): + self.check_grad( + ["x", "y"], + "out", + check_auto_parallel=self.check_auto_parallel, + check_dygraph=1, + check_prim_pir=True, + ) + + +class TestSwigluOp2(TestSwigluOp): + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + tmp_inputs = np.split(x, 2, axis=-1) + x = tmp_inputs[0] + y = tmp_inputs[1] + out_grad = np.random.uniform(-1, 1, x.shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + +@unittest.skipIf( + not paddle.base.core.is_compiled_with_dist(), + "The spmd rule is should be tested with distributed=ON", +) +class TestSwigluSpmd(unittest.TestCase): + def setUp(self): + self.kernel = "swiglu" + self.rule = paddle.base.core.get_phi_spmd_rule(self.kernel) + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [-1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.y_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec) + + def test_input_x_y(self): + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, self.y_dist_tensor_spec + ) + inferred_input_dist_attrs = result_dist_attrs[0] + inferred_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(inferred_input_dist_attrs), 2) + self.assertEqual(len(inferred_output_dist_attrs), 1) + self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [-1, 0]) + + def test_input_x_unshard_last_dim(self): + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + 
x_tensor_dist_attr = TensorDistAttr()
+        x_tensor_dist_attr.dims_mapping = [0, -1]
+        x_tensor_dist_attr.process_mesh = process_mesh
+        self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr)
+
+        result_dist_attrs = self.rule.infer_forward(
+            self.x_dist_tensor_spec, DistTensorSpec()
+        )
+        inferred_input_dist_attrs = result_dist_attrs[0]
+        inferred_output_dist_attrs = result_dist_attrs[1]
+        self.assertEqual(len(result_dist_attrs), 2)
+        self.assertEqual(len(inferred_input_dist_attrs), 2)
+        self.assertEqual(len(inferred_output_dist_attrs), 1)
+        self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [0, -1])
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(), "0-size matmul only supported with cuda")
+class TestSwiglu0SizeDygraph(unittest.TestCase):
+    def test_swiglu(self):
+        x = paddle.ones([0, 128], dtype="float32")
+        y = paddle.ones([0, 128], dtype="float32")
+        x.stop_gradient = False
+        y.stop_gradient = False
+        out = fused_swiglu_impl(x, y)
+
+        dz = paddle.ones([0, 128], dtype="float32")
+
+        out = _C_ops.swiglu_grad(x, y, dz)
+
+        self.assertEqual(out[0].shape, x.shape)
+        self.assertEqual(out[1].shape, y.shape)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py
new file mode 100644
index 00000000000..4369972255d
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py
@@ -0,0 +1,162 @@
+# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
+# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
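+
+# Top-p (nucleus) sampling keeps the smallest set of highest-probability
+# tokens whose cumulative probability exceeds p and samples from that set.
+# Worked example (illustrative): for sorted probs [0.5, 0.3, 0.15, 0.05]
+# and p = 0.7 the cumulative sums are [0.5, 0.8, 0.95, 1.0]; with the
+# keep-first-token shift used in TopPProcess below, only {0.5, 0.3}
+# survive and the remaining tokens are masked to probability 0.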
+ +import unittest + +import numpy as np + +import paddle + + +def TopPProcess(probs, top_p): + sorted_probs = paddle.sort(probs, descending=True) + sorted_indices = paddle.argsort(probs, descending=True) + cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) + + # Remove tokens with cumulative probs above the top_p, But keep at + # least min_tokens_to_keep tokens + sorted_indices_to_remove = cumulative_probs > top_p + + # Keep the first token + sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype="int64") + + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, + (slice(None), slice(1, None)), + sorted_indices_to_remove[:, :-1].clone(), + ) + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, (slice(None), 0), 0 + ) + + # Scatter sorted tensors to original indexing + sorted_indices = ( + sorted_indices + paddle.arange(probs.shape[0]).unsqueeze(-1) * probs.shape[-1] + ) + condition = paddle.scatter( + sorted_indices_to_remove.flatten(), + sorted_indices.flatten(), + sorted_indices_to_remove.flatten(), + ) + condition = paddle.cast(condition, "bool").reshape(probs.shape) + probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) + next_tokens = paddle.multinomial(probs) + next_scores = paddle.index_sample(probs, next_tokens) + return next_scores, next_tokens + + +class TestTopPAPI(unittest.TestCase): + def setUp(self): + self.topp = 0.0 + self.seed = 6688 + self.batch_size = 3 + self.vocab_size = 10000 + self.dtype = "float32" + self.input_data = np.random.rand(self.batch_size, self.vocab_size) + + def run_dygraph(self, place): + with paddle.base.dygraph.guard(place): + input_tensor = paddle.to_tensor(self.input_data, self.dtype) + topp_tensor = paddle.to_tensor( + [ + self.topp, + ] + * self.batch_size, + self.dtype, + ).reshape((-1, 1)) + + # test case for basic test case 1 + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + # test case for basic test case 1 + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, + topp_tensor, + seed=-1, + k=5, + mode="non-truncated", + return_top=True, + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + input_tensor = paddle.static.data( + name="x", shape=[6, 1030], dtype=self.dtype + ) + topp_tensor = paddle.static.data( + name="topp", shape=[6, 1], dtype=self.dtype + ) + result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + exe = paddle.static.Executor(place) + input_data = np.random.rand(6, 1030).astype(self.dtype) + paddle_result = exe.run( + feed={ + "x": input_data, + "topp": np.array( + [ + self.topp, + ] + * 6 + ).astype(self.dtype), + }, + fetch_list=[ + result[0], + result[1], + ref_res[0], + ref_res[1], + ], + ) + np.testing.assert_allclose(paddle_result[0], paddle_result[2], rtol=1e-05) + 
np.testing.assert_allclose(paddle_result[1], paddle_result[3], rtol=1e-05) + + def test_dygraph(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_dygraph(place) + + def test_static(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_static(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py new file mode 100644 index 00000000000..ff22c2c9ac9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +from tests.op_test import OpTest +import paddle + +paddle.enable_static() + + +# Correct: General. +class TestUnsqueezeOp(OpTest): + def setUp(self): + self.set_metax_gpu() + self.op_type = "unsqueeze2" + self.dtype = "float32" + self.init_test_case() + self.inputs = {"X": np.random.random(self.ori_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: Single input index. +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1,) + self.new_shape = (20, 5, 1) + + +# Correct: Mixed input axis. +class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +# Correct: There is duplicated axis. +class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +# Correct: Reversed axes. 
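+# (With ori_shape (10, 2, 5) and axes (3, 1, 1), each axis applies in order
+# to the progressively unsqueezed shape: inserting at 3 gives (10, 2, 5, 1),
+# then inserting at 1 twice gives the new_shape (10, 1, 1, 2, 5, 1) below.)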
+class TestUnsqueezeOp4(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (10, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (10, 1, 1, 2, 5, 1)
+
+
+# test float16
+class TestUnsqueezeOp5(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.dtype = "float16"
+        self.ori_shape = (10, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (10, 1, 1, 2, 5, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 528ec55971cd8e115b3d0a7e2103bd4ebf7493a5 Mon Sep 17 00:00:00 2001
From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Date: Tue, 16 Sep 2025 11:39:34 +0800
Subject: [PATCH 11/58] [Metax] update metax CI CMakeLists (#16)

* [Metax] update metax CI

* [Metax] update metax CI CMakeLists
---
 backends/metax_gpu/tests/CMakeLists.txt | 44 +++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt
index 7e549ef4eaa..37475773026 100755
--- a/backends/metax_gpu/tests/CMakeLists.txt
+++ b/backends/metax_gpu/tests/CMakeLists.txt
@@ -87,24 +87,32 @@ list(
 list(
   REMOVE_ITEM
   PYTHON_TEST_SCRIPTS
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py)
+  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # affected by test_sum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion
+  # adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # self._get_places()
+  # interface adaptation issue in op_test.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # same failure as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # same failure as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # same failure as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # same failure as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64
+  # precision
+  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # self._get_places()
+  # interface adaptation issue in op_test.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties
+)

 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)
 foreach(test_script ${PYTHON_TEST_SCRIPTS})

From 5b31405c13c32af5dbc826f7e8fec58e64a74322 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Tue, 16 Sep 2025 15:02:29 +0800
Subject: [PATCH 12/58] [Metax] add github action (#18)

* [Metax] add github action

---------

Co-authored-by: Mingkun.Zhang <2496808993@qq.com>
Co-authored-by: metax666
Co-authored-by: jiaxinWang-metax <189149612@qq.com>
Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Co-authored-by: chezhang <1376507468@qq.com>
Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com>
Co-authored-by: ZhouDuan <1184319564@qq.com>
---
 .github/workflows/metax_work.yaml | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 .github/workflows/metax_work.yaml

diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml
new file mode 100644
index 00000000000..0d3d2637cdd
--- /dev/null
+++ b/.github/workflows/metax_work.yaml
@@ -0,0 +1,52 @@
+name: paddle metax gpu test
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [opened, synchronize]
+    branches: [develop, release/**]
+    paths:
+      - "**"
+      - "!backends/**"
+      - "backends/metax_gpu/**"
+
+permissions: read-all
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  metax-gpu-test:
+    runs-on: paddle-metax-runner-set
+    steps:
+      - name: Checkout repository
+        run: |
+          git config --global user.name "GitHub Actions"
+          git config --global user.email "actions@github.com"
+
+          if [ "${{ github.event_name }}" == "pull_request" ]; then
+            BRANCH_NAME=${{ github.head_ref }}
+          else
+            BRANCH_NAME=${{ github.ref_name }}
+          fi
+
+          git clone \
+            --reference-if-able /home/runner/PaddleCustomDevice \
+            --depth=1 \
+            --shallow-submodules \
+            --jobs=8 \
+            --branch $BRANCH_NAME \
+            --recurse-submodules \
+            https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git .
+ + + - name: compile + run: | + cd backends/metax_gpu + bash build.sh + + - name: run test + run: | + cd backends/metax_gpu/tests + bash run_test.sh From b93c971b17729f09733faf5400d7ba44f1e5f3f2 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:15:34 +0800 Subject: [PATCH 13/58] [metax] chang build (#19) * [metax]chaneg build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dd0ab3aab90..c288ea22312 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,7 +20,7 @@ set -e pip uninstall paddlepaddle -y -export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ @@ -50,7 +50,7 @@ fi echo "make_maca" cd build cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON -make_maca -j8 +make_maca -j60 echo "install whl" pip install dist/paddle_metax_gpu*.whl --force-reinstall From 6dbbe848d672a27bbbdded8e399ff5b1229c6647 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:04:55 +0800 Subject: [PATCH 14/58] change_build (#20) * [metax]chaneg build --------- --- backends/metax_gpu/build.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index c288ea22312..e52cddc6476 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,15 +20,18 @@ set -e pip uninstall paddlepaddle -y -# export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# init paddle +git submodule sync --recursive && git submodule update --init --recursive + + +export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ # exit 1 -# init paddle -git submodule sync --recursive && git submodule update --init --recursive +unset http_proxy https_proxy # apply patch bash change_patch.sh From ef1b28e5d17ceac419de30f8ba129f16444bd39d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:18:54 +0800 Subject: [PATCH 15/58] change_build (#21) --- backends/metax_gpu/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index e52cddc6476..a40cac19e19 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -25,6 +25,7 @@ git submodule sync --recursive && git submodule update --init --recursive export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +export pip install 
safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ From 3737e488da962ae43cde4d51e495454a2818eb01 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:24:15 +0800 Subject: [PATCH 16/58] change_build (#22) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index a40cac19e19..e3c4304e5f8 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -30,7 
+30,6 @@ pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/ # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -# exit 1 unset http_proxy https_proxy From 16f35844e7218d0eb67aaffe6379c2a8820241e7 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Tue, 16 Sep 2025 16:52:30 +0800 Subject: [PATCH 17/58] =?UTF-8?q?=E3=80=90metax=E3=80=91modify=20cmake=20f?= =?UTF-8?q?or=20warpctc=20and=20warprnnt=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel --- backends/metax_gpu/CMakeLists.txt | 4 +- backends/metax_gpu/cmake/warpctc.cmake | 7 +- backends/metax_gpu/cmake/warprnnt.cmake | 8 ++- .../fused_conv2d_add_act_kernel_register.cu | 2 +- .../conv_grad_kernel_register.cu | 42 ++++++++++-- .../kernels/gpudnn/conv_kernel_register.cu | 2 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 2 +- backends/metax_gpu/kernels/impl/warpctc.h | 64 ------------------- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 16 ++--- backends/metax_gpu/kernels/impl/warprnnt.h | 63 ------------------ .../kernels/impl/warprnnt_kernel_impl.h | 14 ++-- backends/metax_gpu/kernels/metax_context.cc | 20 +++++- backends/metax_gpu/kernels/metax_context.h | 1 + 14 files changed, 88 insertions(+), 159 deletions(-) rename backends/metax_gpu/kernels/{cuda_kernels => gpudnn}/conv_grad_kernel_register.cu (98%) delete mode 100644 backends/metax_gpu/kernels/impl/warpctc.h delete mode 100644 backends/metax_gpu/kernels/impl/warprnnt.h diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index cca23ab42f5..787aae13e40 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -736,7 +736,7 @@ add_library( target_include_directories( ${TARGET_NAME} PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + ${CUDA_INCLUDE_DIRS} ${WARPCTC_INCLUDE_DIR} ${WARPRNNT_INCLUDE_DIR} ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( @@ -749,6 +749,8 @@ target_link_libraries( protobuf external_error_proto dgc + ${WARPCTC_LIBRARIES} + ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 71c892a6cfa..9edc92f0a94 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -145,5 +145,8 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its # headers. 
-add_library(warpctc INTERFACE) -add_dependencies(warpctc extern_warpctc) +add_library(warpctc SHARED IMPORTED GLOBAL) +set_target_properties(warpctc PROPERTIES + IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR} +) \ No newline at end of file diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake index 54a7ad6be86..527f2e55a1b 100644 --- a/backends/metax_gpu/cmake/warprnnt.cmake +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -137,6 +137,8 @@ get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its # headers. -add_library(warprnnt INTERFACE) -# set_property(TARGET warprnnt PROPERTY IMPORTED_LOCATION ${WARPRNNT_LIBRARIES}) -add_dependencies(warprnnt extern_warprnnt) +add_library(warprnnt SHARED IMPORTED GLOBAL) +set_target_properties(warprnnt PROPERTIES + IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR} +) \ No newline at end of file diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu index ee4f105cbc5..48809ceefa4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,7 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, true, groups); + desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu similarity index 98% rename from backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index 885137675b4..e4acb2f95b6 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -161,7 +161,12 @@ void ConvCudnnGradKernelImplV7( args1.idesc.set(*transformed_input_grad, layout_tensor); args1.wdesc.set(*transformed_filter_channel, layout_tensor, iwo_groups); args1.odesc.set(*transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -184,7 +189,12 @@ void ConvCudnnGradKernelImplV7( args2.wdesc.set( *transformed_filter_grad_channel, layout_tensor, iwo_groups); args2.odesc.set(*transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); @@ -1073,7 +1083,12 @@ void ConvCudnnGradGradKernel( args1.idesc.set(transformed_ddX, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + 
args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -1092,7 +1107,12 @@ void ConvCudnnGradGradKernel( args2.idesc.set(transformed_X, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; @@ -1114,7 +1134,12 @@ void ConvCudnnGradGradKernel( args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; @@ -1136,7 +1161,12 @@ void ConvCudnnGradGradKernel( args4.idesc.set(transformed_dX, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bdff5fa9f93..bf129fed05c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, true); + args.cdesc.set(dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index aa1cc80d06d..928201c705f 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,7 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, false, c_groups); + args.cdesc.set(dtype, padding_common, strides, dilations_, phi::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/impl/warpctc.h b/backends/metax_gpu/kernels/impl/warpctc.h deleted file mode 100644 index ba5da472ade..00000000000 --- a/backends/metax_gpu/kernels/impl/warpctc.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warpctc/include/ctc.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warpctc_dso_flag; -extern void* warpctc_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warpctc routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using warpctcFunc = decltype(&::__name); \ - std::call_once(warpctc_dso_flag, []() { \ - warpctc_dso_handle = phi::dynload::GetWarpCTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - DYNAMIC_LOAD_WARPCTC_WRAP(__name) - -#define WARPCTC_ROUTINE_EACH(__macro) \ - __macro(get_warpctc_version); \ - __macro(ctcGetStatusString); \ - __macro(compute_ctc_loss); \ - __macro(compute_ctc_loss_double); \ - __macro(get_workspace_size); \ - __macro(get_workspace_size_double) - -WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); - -#undef DYNAMIC_LOAD_WARPCTC_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index 51f4ce86890..dc9bc376e63 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warpctc.h" +#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index 9794ba1b3c0..e0b15feca03 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warpctc.h" +#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" @@ -58,7 +58,7 @@ class ComputeCtcLossFunctor { float* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss(activations, + return compute_ctc_loss(activations, gradients, flat_labels, label_lengths, @@ -84,7 +84,7 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss_double( + return compute_ctc_loss_double( activations, gradients, flat_labels, @@ -141,14 +141,14 @@ class WarpCTCFunctor { ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { status = - phi::dynload::get_workspace_size(cpu_label_lengths, + get_workspace_size(cpu_label_lengths, 
cpu_input_lengths, static_cast(sequence_width), static_cast(num_sequences), options_, &workspace_bytes); } else { - status = phi::dynload::get_workspace_size_double( + status = get_workspace_size_double( cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), @@ -162,7 +162,7 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -197,12 +197,12 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); } protected: void init(const Context& dev_ctx, const size_t blank) { - warpctc_version_ = phi::dynload::get_warpctc_version(); + warpctc_version_ = get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { diff --git a/backends/metax_gpu/kernels/impl/warprnnt.h b/backends/metax_gpu/kernels/impl/warprnnt.h deleted file mode 100644 index 50b0dfc0efc..00000000000 --- a/backends/metax_gpu/kernels/impl/warprnnt.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warprnnt/include/rnnt.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warprnnt_dso_flag; -extern void* warprnnt_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warprnnt routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using warprnntFunc = decltype(&::__name); \ - std::call_once(warprnnt_dso_flag, []() { \ - warprnnt_dso_handle = phi::dynload::GetWarpRNNTDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - DYNAMIC_LOAD_WARPRNNT_WRAP(__name) - -#define WARPRNNT_ROUTINE_EACH(__macro) \ - __macro(get_warprnnt_version); \ - __macro(rnntGetStatusString); \ - __macro(compute_rnnt_loss); \ - __macro(compute_rnnt_loss_fp64); \ - __macro(get_rnnt_workspace_size); - -WARPRNNT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP); - -#undef DYNAMIC_LOAD_WARPRNNT_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index bb4311f5912..457fdcb9bff 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warprnnt.h" +#include "third_party/warprnnt/include/rnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -55,7 +55,7 @@ class ComputeRnntLossFunctor { float* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss(activations, + return compute_rnnt_loss(activations, gradients, label, label_lengths, @@ -81,7 +81,7 @@ class ComputeRnntLossFunctor { double* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss_fp64(activations, + return compute_rnnt_loss_fp64(activations, gradients, label, label_lengths, @@ -149,7 +149,7 @@ class WarpRNNTFunctor { } size_t workspace_bytes = 0; - status = phi::dynload::get_rnnt_workspace_size( + status = get_rnnt_workspace_size( maxT, maxU, B, gpu, &workspace_bytes, sizeof(T)); PADDLE_ENFORCE_EQ( @@ -158,7 +158,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_rnnt_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -190,7 +190,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); } protected: @@ -200,7 +200,7 @@ class WarpRNNTFunctor { const size_t blank, const float fastemit_lambda, const int num_threads) { - warprnnt_version_ = phi::dynload::get_warprnnt_version(); + warprnnt_version_ = get_warprnnt_version(); options_.maxT = maxT; options_.maxU = maxU; diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_context.cc index 4df4d88b0b4..f0c92f00565 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_context.cc @@ -15,7 +15,25 @@ #include "kernels/metax_context.h" namespace phi { -bool AllowTF32Cudnn() { return false; } +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return 
allow_tf32_cudnn; } + void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_context.h index 5974aadcc41..683a6df7017 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_context.h @@ -128,6 +128,7 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } +bool AllowTF32Cublas(); bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { From ce54693240221505b150900fb601e640181a5620 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Tue, 16 Sep 2025 18:12:37 +0800 Subject: [PATCH 18/58] [metax]modify library to static library (#24) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library --- backends/metax_gpu/cmake/warpctc.cmake | 19 +++++++++---------- backends/metax_gpu/cmake/warprnnt.cmake | 19 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 9edc92f0a94..0733c0f9ce5 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -66,11 +66,11 @@ set(WARPCTC_LIB_DIR if(WIN32) set(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) else() set(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) endif() @@ -93,10 +93,10 @@ if(WIN32) set(WARPCTC_CXX_FLAGS_DEBUG $) else() - set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() @@ -127,7 +127,7 @@ ExternalProject_Add( -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} -DWITH_TORCH=OFF -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON + -DBUILD_SHARED=OFF -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} @@ -145,8 +145,7 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its # headers. 
-add_library(warpctc SHARED IMPORTED GLOBAL) -set_target_properties(warpctc PROPERTIES - IMPORTED_LOCATION ${WARPCTC_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR} -) \ No newline at end of file +add_library(warpctc STATIC IMPORTED GLOBAL) +set_target_properties( + warpctc PROPERTIES IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR}) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake index 527f2e55a1b..a8d6683af2b 100644 --- a/backends/metax_gpu/cmake/warprnnt.cmake +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -62,11 +62,11 @@ set(WARPRNNT_LIB_DIR if(WIN32) set(WARPRNNT_LIBRARIES - "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-rnnt Library" FORCE) else() set(WARPRNNT_LIBRARIES - "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-rnnt Library" FORCE) endif() @@ -90,10 +90,10 @@ if(WIN32) set(WARPRNNT_CXX_FLAGS_DEBUG $) else() - set(WARPRNNT_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPRNNT_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPRNNT_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPRNNT_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() @@ -120,7 +120,7 @@ ExternalProject_Add( -DWITH_ROCM=${WITH_ROCM} -DWITH_OMP=${USE_OMP} -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} - -DBUILD_SHARED=ON + -DBUILD_SHARED=OFF -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} @@ -137,8 +137,7 @@ get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its # headers. 
-add_library(warprnnt SHARED IMPORTED GLOBAL) -set_target_properties(warprnnt PROPERTIES - IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR} -) \ No newline at end of file +add_library(warprnnt STATIC IMPORTED GLOBAL) +set_target_properties( + warprnnt PROPERTIES IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR}) From 4cda637ff68d88adfd88c322d4d55c9d7dd15397 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:14:09 +0800 Subject: [PATCH 19/58] [Metax] organize documents (#25) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents --- .../calc_reduced_attn_kernel_register.cu | 2 +- backends/metax_gpu/kernels/funcs/softmax.cu | 2 +- .../kernels/funcs/values_vectors_functor.h | 2 +- .../metax_gpu/kernels/gpudnn/conv_cudnn_v7.h | 2 +- .../conv_transpose_grad_kernel_register.cu | 2 +- .../kernels/gpudnn/pool_kernel_register.cu | 2 +- .../metax_gpu/kernels/gpudnn/softmax_gpudnn.h | 2 +- .../kernels/impl/dirichlet_kernel_impl.h | 2 +- .../addmm_grad_kernel_register.cu | 0 .../addmm_kernel_register.cu | 0 .../batch_fc_grad_kernel_register.cu | 0 .../batch_norm_grad_kernel_register.cu | 2 +- .../batch_norm_kernel_register.cu | 0 .../bilinear_grad_kernel_register.cu | 0 .../bilinear_kernel_register.cu | 0 .../metax_kernel/blha_get_max_len_register.cu | 2 +- .../bmm_grad_kernel_register.cu | 0 .../bmm_kernel_register.cu | 0 ...abel_cross_entropy_grad_kernel_register.cu | 0 .../cholesky_grad_kernel_register.cu | 0 .../metax_kernel/cholesky_kernel_register.cu | 2 +- .../conv_kernel_register.cu | 0 .../conv_transpose_kernel_register.cu | 0 .../crop_kernel_register.cu | 0 .../cross_entropy_kernel_register.cu | 2 +- .../depthwise_conv_grad_kernel.cu | 0 .../depthwise_conv_kernel.cu | 0 .../kernels/{ => metax_kernel}/elementwise.h | 0 .../{ => metax_kernel}/flags_declare.cu | 0 .../flash_attn_grad_kernel.cu | 0 .../{ => metax_kernel}/flash_attn_kernel.cu | 0 .../{ => metax_kernel}/flash_attn_kernel.h | 0 .../{ => metax_kernel}/flash_attn_utils.h | 0 .../kernels/{ => metax_kernel}/flashattn.cc | 0 .../kernels/{ => metax_kernel}/flashattn.h | 0 .../flatten2_grad_kernel_register.cu | 0 .../flatten2_kernel_register.cu | 0 .../fused_conv2d_add_act_kernel_register.cu | 3 +- .../fused_rope_grad_kernel_register.cu | 0 .../fused_rope_kernel_register.cu | 0 .../instance_norm_grad_kerne_registerl.cu | 2 +- .../instance_norm_kernel_register.cu | 2 +- .../layer_norm_grad_kernel_register.cu | 0 .../layer_norm_kernel_register.cu | 0 .../lstm_kernel_register.cu | 0 .../metax_kernel/lu_kernel_register.cu | 2 +- .../lu_solve_grad_kernel_register.cu | 0 .../metax_kernel/matrix_rank_tol_kernel.cu | 2 +- .../{ => metax_kernel}/metax_context.cc | 24 +-- .../{ => metax_kernel}/metax_context.h | 6 +- .../multi_dot_grad_kernel_register.cu | 0 .../multi_dot_kernel_register.cu | 0 .../mv_grad_kernel_register.cu | 0 .../mv_kernel_register.cu | 0 .../metax_kernel/qr_kernel_register.cu | 2 +- .../rank_attention_grad_kernel_register.cu | 0 .../rank_attention_kernel_register.cu | 0 .../metax_kernel/rnn_grad_kernel.cu.cc | 2 +- .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 +- .../slogdeterminant_kernel_register.cu | 0 .../softmax_kernel_grad_register.cu | 0 .../softmax_kernel_register.cu | 0 .../solve_grad_kernel_register.cu | 0 .../standard_gamma_kernel_register.cu | 0 .../stft_kernel_register.cu | 0 
.../svd_kernel_register.cu | 0 .../top_k_grad_kernel_register.cu | 0 .../triangular_solve_grad_kernel_register.cu | 0 .../triangular_solve_kernel_register.cu | 0 .../warprnnt_kernel_register.cu | 0 .../weight_only_linear_kernel.cu | 0 .../weight_quantize_kernel_register.cu | 0 backends/metax_gpu/patch/paddle.patch | 204 +++++++++--------- backends/metax_gpu/tests/CMakeLists.txt | 54 ++--- 74 files changed, 166 insertions(+), 163 deletions(-) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_fc_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/cholesky_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_transpose_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/crop_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_grad_kernel.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/elementwise.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flags_declare.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_grad_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_utils.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.cc (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.h (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_conv2d_add_act_kernel_register.cu (99%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lstm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lu_solve_grad_kernel_register.cu (100%) rename 
backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.cc (90%) rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.h (96%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/slogdeterminant_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_grad_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/solve_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/standard_gamma_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/stft_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/svd_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/top_k_grad_kernel_register.cu (100%) mode change 100755 => 100644 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/warprnnt_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_only_linear_kernel.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_quantize_kernel_register.cu (100%) diff --git a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu index 11def2c9ee4..2aa8424f0b1 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/calc_reduced_attn_kernel.h" diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu index d738a53f43a..44bfd02a308 100644 --- a/backends/metax_gpu/kernels/funcs/softmax.cu +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h index ec429950872..8c5996e680b 100644 --- a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h +++ b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h @@ -24,7 +24,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/common/errors.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h index da61a1e5b41..a0f89047045 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h +++ b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "glog/logging.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu index 0067818d165..b7eebfcee2e 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "kernels/gpudnn/conv_cudnn_v7.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu index c115f5ad930..1c2bfeedf34 100644 --- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gpudnn/pool_gpudnn.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h index 168752700e9..5844886ad1b 100644 --- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h +++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/primitive/kernel_primitives.h" // See Note [ Why still include the fluid headers? 
] -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h index 70af87513e5..c2e2e341bf5 100644 --- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h @@ -17,7 +17,7 @@ #include #include -#include "kernels/elementwise.h" +#include "kernels/metax_kernel/elementwise.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu index 062646bbf9d..52fe5a1d566 100644 --- a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/flags.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu index bc9eb23c0e8..42810569fde 100644 --- a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" #include "kernels/metax_kernel/block_attn.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu 
b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu index e8fae2d9da5..8a39ae3f0a8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu index e94862ec7b0..043a64dc149 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/kernels/cross_entropy_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu diff --git a/backends/metax_gpu/kernels/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h similarity index 100% rename from backends/metax_gpu/kernels/elementwise.h rename to backends/metax_gpu/kernels/metax_kernel/elementwise.h diff --git a/backends/metax_gpu/kernels/flags_declare.cu b/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu similarity index 100% rename from backends/metax_gpu/kernels/flags_declare.cu rename to backends/metax_gpu/kernels/metax_kernel/flags_declare.cu diff --git a/backends/metax_gpu/kernels/flash_attn_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h diff --git a/backends/metax_gpu/kernels/flash_attn_utils.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_utils.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h diff --git a/backends/metax_gpu/kernels/flashattn.cc b/backends/metax_gpu/kernels/metax_kernel/flashattn.cc similarity index 100% rename from backends/metax_gpu/kernels/flashattn.cc rename to backends/metax_gpu/kernels/metax_kernel/flashattn.cc diff --git a/backends/metax_gpu/kernels/flashattn.h b/backends/metax_gpu/kernels/metax_kernel/flashattn.h similarity index 100% rename from backends/metax_gpu/kernels/flashattn.h rename to backends/metax_gpu/kernels/metax_kernel/flashattn.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu similarity index 99% rename from backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu index 48809ceefa4..c0d15b7f1b4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,8 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); + desc->set( + dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu index d7540d949a9..bdf341f5a35 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu index db975d74665..e0c0ae9c1d6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu index 5a2d85418a1..72e4c5b2b79 100644 --- a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu @@ -18,7 +18,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu index bda5dc62f1a..d8c3355e6e4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu @@ -18,7 +18,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/type_traits.h" diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc similarity index 90% rename from backends/metax_gpu/kernels/metax_context.cc rename to backends/metax_gpu/kernels/metax_kernel/metax_context.cc index f0c92f00565..62aaa5fb2de 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -12,27 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" namespace phi { const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; }(); const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; }(); bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h similarity index 96% rename from backends/metax_gpu/kernels/metax_context.h rename to backends/metax_gpu/kernels/metax_kernel/metax_context.h index 683a6df7017..a6610c1dab2 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ -#define BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ +#define BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ #include #include #include @@ -161,4 +161,4 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, return DnnWorkspaceHandle(alloactor, stream); } } // namespace phi -#endif // BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#endif // BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 745069e2eda..c3041254444 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,7 +22,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc index 499832049e4..101b51aa350 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/rnn_grad_kernel.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index f1cf9e09dc7..2598ce093e6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/rnn_kernel.h" #include "glog/logging.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu similarity index 
100% rename from backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu old mode 100755 new mode 100644 similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 0283a443adb..e56826c4f3e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index 
cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 1547909d92..66b2779392 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -230,28 +230,28 @@ index 4ff2e528a9..81421c8ca1 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, 
phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 @@ -351,7 +351,7 @@ index 024a7de73e..1e4cdf16be 100644 +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. */ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. 
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,19 +514,19 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 @@ -535,7 +535,7 @@ index e30d440ff3..3c74792690 100644 @@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,14 +721,14 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; - asm("bfi.b32 %0, %1, %2, %3, %4;" - : "=r"(ret) - : "r"(to_insert), "r"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (32 - pos - len)) >> (32 - len); return ret; } @@ -738,12 +738,12 @@ index e30d440ff3..3c74792690 100644 int len) { uint64_t ret; - asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); -+ ++ + + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -751,7 +751,7 @@ index e30d440ff3..3c74792690 100644 - asm("bfi.b64 %0, %1, %2, %3, %4;" - : "=l"(ret) - : "l"(to_insert), "l"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); + return ret; @@ -763,7 +763,7 @@ index e30d440ff3..3c74792690 100644 int lane_id; - asm("mov.s32 %0, %%laneid;" : "=r"(lane_id)); - return lane_id; -+ ++ +// // >>>> PTX2CPP Success <<<< +// { +// 
(lane_id)=(threadIdx.x&(warpSize-1)); @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,12 +843,12 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,7 +863,7 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
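// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the
// top_k_function_cuda.h hunks above replace CUDA-only PTX inline assembly
// (bfe.u32/bfe.u64, bfi.b32/bfi.b64, %%laneid, %%lanemask_le) with portable
// C++ so the radix top-k path can run on MetaX hardware with 64-lane warps.
// Below is a minimal sketch of the shift-based bitfield emulation, assuming
// 0 <= pos, 0 < len, and pos + len <= 32; the helper names here are
// hypothetical, not the patch's own symbols.

#include <cassert>

// Extract `len` bits of `val` starting at bit `pos` (emulates PTX bfe.u32):
// shift the field up to the top of the word, then arithmetic-free shift down.
static inline unsigned GetBitfieldSketch(unsigned val, int pos, int len) {
  return (val << (32 - pos - len)) >> (32 - len);
}

// Insert the low `len` bits of `ins` into `val` at bit `pos` (emulates PTX
// bfi.b32): clear the target field with a mask, then OR the shifted bits in.
static inline unsigned SetBitfieldSketch(unsigned val,
                                         unsigned ins,
                                         int pos,
                                         int len) {
  unsigned field = (len == 32) ? ~0u : ((1u << len) - 1u);
  unsigned mask = field << pos;
  return (val & ~mask) | ((ins << pos) & mask);
}

int main() {
  assert(GetBitfieldSketch(0b101100u, 2, 3) == 0b011u);       // bits [4:2]
  assert(SetBitfieldSketch(0u, 0b101u, 4, 3) == 0b1010000u);  // field at [6:4]
  return 0;
}

// Design note: the patch's own SetBitfield replacements appear to reuse the
// extract expression and ignore `to_insert`; if that is not intentional, a
// mask-and-or insert like SetBitfieldSketch above would preserve bfi
// semantics.
// ---------------------------------------------------------------------------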
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +890,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +901,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +914,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -948,7 +948,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +961,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +993,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,27 +1013,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1048,12 +1048,12 @@ index 5ebbc8d2db..48acf8d0cd 100644 -#include "paddle/phi/kernels/funcs/quant_dequant.h" +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_context.h" - ++#include "kernels/metax_kernel/metax_context.h" + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1067,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1080,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,14 +1118,14 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/third_party/flagcx b/third_party/flagcx index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 37475773026..410ef006514 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -87,32 +87,34 @@ list( list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # 受 test_sum_op.py 影响 - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion - # 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # op_test.py 里 - # self._get_places() - # 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64 - # precision - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # op_test.py 里 - # self._get_places() 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties -) + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
+  # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py
+  # needs check_grad with fp64 precision
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
+  # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py)
 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)
 foreach(test_script ${PYTHON_TEST_SCRIPTS})

From 23fca59cd47c30680a01e9ec79f5d4d16d156320 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Wed, 17 Sep 2025 10:44:44 +0800
Subject: [PATCH 20/58] [metax]fix_code style and index_elementwise_put_kernel
 (#27)

* [Metax_change_ut]
* fix sum&collect_fpn_proposals op register
* modify profile
* [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel'
* [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels
* [Metax] con2d_grad use gpudnn
* blas handle support
* [Metax] register some kernels & update CMakeLists
* [Metax] fix metax unittest fail
* [Metax] add group_norm & label_smooth kernel and update matmul kernel
* [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register
* add test
* add test
* [test] chang the logic of workspace_host in cholesky_kernel_register
  alloc(cpuplace,size), test pass
  alloc(cpuplace, size, stream), crash
* [Metax] fix compile fail
* Revert "[Metax] fix compile fail"
  This reverts commit 83bc87f686227962b0262e044225c6ed5507b824.
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h'
* [Metax]fix bug and add qr lstsq logsoftmax
* [Metax] con2d_grad use gpudnn
* [Metax]fix bug and add qr lstsq logsoftmax
* [Metax] change_patch
* [Metax] update unit test CMakeLists.txt
* [Metax] update unit test CMakeLists.txt
* [feature] add unique_consecutive kernel
* [metax] add some kernel
* [metax] add some kernel
* [Metax] register baddbmm kernel & update blas api
* [Metax] register baddbmm kernel & update blas api
* [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined
* [feature] add add unique_consecutive kernel.cu
* [fix] fix some test case due to missing op register
* [fix] fix some fail text
* [metax]fix lu eigvalshsqueeze rnn kernel
* [metax]fix lu eigvalshsqueeze rnn kernel
* add and fix some kernels
* [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined
* [Metax] fix conflict
* [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure
* [Metax] update repeat_interleave kernel & ignore max op test
* [metax]fix lu eigvalshsqueeze rnn kernel
* [metax] chang patch fix copy
* [metax] chang patch fix copy
* [Metax] update metax_gpu unit test
* [Metax] fix test CMakeList.txt
* [metax]change_cupti_and_fix_softmax
* [metax]change_patch
* [metax]change_patch
* [metax] updata_qr_kernel
* [metax] updata_qr_kernel
* [Metax] fix cufft and fix some blas kernel apply
* [metax] fix bug
* [Metax] add github action
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]chaneg build
* [metax]fix_code style and index_elementwise_put_kernel

---------

Co-authored-by: Mingkun.Zhang <2496808993@qq.com>
Co-authored-by: metax666
Co-authored-by: jiaxinWang-metax <189149612@qq.com>
Co-authored-by:
MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/CMakeLists.txt | 15 +++-- ...ex_elementwise_put_grad_kernel_register.cu | 18 ++++- .../index_elementwise_put_kernel_register.cu | 18 ++++- .../kernels/gpudnn/conv_kernel_register.cu | 3 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 7 +- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 67 +++++++++---------- .../kernels/impl/warprnnt_kernel_impl.h | 39 +++++------ 8 files changed, 103 insertions(+), 66 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 787aae13e40..f282a9fbf7c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -666,7 +666,6 @@ file( # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu # ############################################################################ - # kernels/fusion kernels/selected_rows ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -713,10 +712,7 @@ file( kernels/cuda_kernels/*.cc kernels/cuda_kernels/*.cu kernels/funcs/blas/*.cc - kernels/ernie_core/*.cu - kernels/ernie_core/rms_norm_kernel_register.cu - kernels/ernie_core/top_p_sampling_kernel_register.cu - kernels/ernie_core/fused_bias_act_kernel_register.cu) + kernels/ernie_core/*.cu) set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) @@ -735,8 +731,13 @@ add_library( target_include_directories( ${TARGET_NAME} - PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${WARPCTC_INCLUDE_DIR} ${WARPRNNT_INCLUDE_DIR} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + PRIVATE ${PADDLE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/kernels + ${CUDA_INCLUDE_DIRS} + ${WARPCTC_INCLUDE_DIR} + ${WARPRNNT_INCLUDE_DIR} + ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu index c8d69cecae1..f935014d17b 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_grad_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor_grad, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorGradKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu index 391dd908a8d..533204b8102 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bf129fed05c..0a83b504c76 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); + args.cdesc.set( + dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 928201c705f..532b7af0db4 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,12 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, phi::AllowTF32Cudnn(), c_groups); + args.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index dc9bc376e63..16b740d5523 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ 
b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -24,6 +23,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index e0b15feca03..cb39a0171ba 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" @@ -25,6 +24,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { @@ -59,15 +59,15 @@ class ComputeCtcLossFunctor { void* workspace, ctcOptions options) { return compute_ctc_loss(activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -84,17 +84,16 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return compute_ctc_loss_double( - activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return compute_ctc_loss_double(activations, + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -140,21 +139,19 @@ class WarpCTCFunctor { size_t workspace_bytes = 0; ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { - status = - get_workspace_size(cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } else { - status = get_workspace_size_double( - cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size_double(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } PADDLE_ENFORCE_EQ( CTC_STATUS_SUCCESS, diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 457fdcb9bff..8e3ab6fcdac 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,12 +16,12 @@ #include -#include "third_party/warprnnt/include/rnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include 
"paddle/phi/kernels/funcs/math_function.h" +#include "third_party/warprnnt/include/rnnt.h" namespace phi { @@ -56,15 +56,15 @@ class ComputeRnntLossFunctor { void* workspace, rnntOptions options) { return compute_rnnt_loss(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -82,15 +82,15 @@ class ComputeRnntLossFunctor { void* workspace, rnntOptions options) { return compute_rnnt_loss_fp64(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -117,6 +117,7 @@ class WarpRNNTFunctor { * \param blank blank label used in rnnt loss function. * \param cpu_loss loss of each example in CPU memory. */ + void operator()(const Context& dev_ctx, const T* input, T* gradient, From a513aaeb4c895177cd1c6b91d8d3b3c6b8ffe5a6 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:07:44 +0800 Subject: [PATCH 21/58] change_build_917 (#29) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index e3c4304e5f8..2bee14930a3 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -24,14 +24,14 @@ pip uninstall paddlepaddle -y git submodule sync --recursive && git submodule update --init --recursive -export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 -export +# export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +# export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -unset http_proxy https_proxy +# unset http_proxy https_proxy # apply patch bash change_patch.sh From 4eb455e0f14f4a74bfd91e3fd44d67500af2a2c0 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:19:49 +0800 Subject: [PATCH 22/58] chang_build (#30) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels 
& update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 2bee14930a3..de409153472 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -22,12 +22,16 @@ pip uninstall paddlepaddle -y # init paddle git submodule sync --recursive && git submodule update --init --recursive +sleep 1000000 +unset http_proxy https_proxy # export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 # export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle + + python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ From 1773978409b36845416e6491a6b5a2e06ff49992 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 17 Sep 2025 13:59:58 +0800 
Subject: [PATCH 23/58] [metax]modify kernel (#31) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel --- backends/metax_gpu/patch/paddle.patch | 257 ++++++++++++++------------ 1 file changed, 138 insertions(+), 119 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index e56826c4f3e..667d9f75d1c 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h -index 1547909d92..66b2779392 100644 +index 1547909d92..ef20838434 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -226,32 +226,32 @@ index c5309e7e11..3328571380 100644 } \ }; \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 4ff2e528a9..81421c8ca1 100644 +index 4ff2e528a9..23f7f4b583 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex 
CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 @@ -351,7 +351,7 @@ index 024a7de73e..1e4cdf16be 100644 +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. 
*/ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,28 +514,28 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index e30d440ff3..3c74792690 100644 +index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -30,11 +30,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,7 +721,7 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; @@ -743,7 +743,7 @@ index e30d440ff3..3c74792690 100644 + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. 
Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,14 +843,27 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { +diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu +index 4c93778bde..c7bdf8a2cc 100644 +--- a/paddle/phi/kernels/gpu/correlation_kernel.cu ++++ b/paddle/phi/kernels/gpu/correlation_kernel.cu +@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, + int stride2, + int corr_type_multiply, + DenseTensor *out) { +- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; ++ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; + PADDLE_ENFORCE_EQ( + is_gpu_place, + true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,9 +876,22 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
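For context on the correlation hunk above (and the dgc hunk that follows): kernels guarding themselves with place checks written as place.GetType() == phi::AllocationType::GPU reject a plugin device, because a custom backend such as metax_gpu registers its place as AllocationType::CUSTOM, so each check is widened to accept both. A small C++ helper in that spirit is sketched below; the function name is ours, for illustration, while phi::Place::GetType() is the API the hunks themselves call.

#include "paddle/phi/common/place.h"

// True for the native GPU place and for plugin devices registered as
// custom places (e.g. the metax_gpu backend). Illustrative only, not
// part of the patch.
inline bool IsGpuLikePlace(const phi::Place &place) {
  const auto type = place.GetType();
  return type == phi::AllocationType::GPU ||
         type == phi::AllocationType::CUSTOM;
}

Funneling the repeated GPU-or-CUSTOM comparisons through one helper like this keeps the PADDLE_ENFORCE_EQ(is_gpu_place, true, ...) call sites consistent if more place kinds ever need to pass the check.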
+diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu +index c2ddfa1347..c6adf5a6de 100644 +--- a/paddle/phi/kernels/gpu/dgc_kernel.cu ++++ b/paddle/phi/kernels/gpu/dgc_kernel.cu +@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, + int buf_size = paddle::communication::dgc::get_buffer_size(k); + phi::Allocator::AllocationPtr tmp_ious_data; + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + tmp_ious_data = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +916,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +927,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +940,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -930,6 +956,19 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" +diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +index 05a977828f..5136608c41 100644 +--- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu ++++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, + int64_t seed_int = 0; + if (seed.initialized()) { + const auto& seed_place = seed.place().GetType(); +- bool is_gpu_place = seed_place == phi::AllocationType::GPU; ++ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; + if (is_gpu_place) { + // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would + // not be CUDAPlace in practice. This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -948,7 +987,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +1000,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +1032,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,31 +1052,31 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -index 5ebbc8d2db..48acf8d0cd 100644 +index 5ebbc8d2db..c7b6c338e2 100644 --- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -15,8 +15,9 @@ limitations under the License. */ @@ -1049,11 +1088,11 @@ index 5ebbc8d2db..48acf8d0cd 100644 +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" +#include "kernels/metax_kernel/metax_context.h" - + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1106,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1119,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,31 +1157,11 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - -diff --git a/third_party/flagcx b/third_party/flagcx -index 7c469f4af9..7e6c4cc3ca 160000 ---- a/third_party/flagcx -+++ b/third_party/flagcx -@@ -1 +1 @@ --Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f -+Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa -diff --git a/third_party/flashattn b/third_party/flashattn -index 581e48aa69..749aca3807 160000 ---- a/third_party/flashattn -+++ b/third_party/flashattn -@@ -1 +1 @@ --Subproject commit 581e48aa693a17ec3676ec2715d46130310d318d -+Subproject commit 749aca380794b472096d4e7ea01dd252ab0887c9 -diff --git a/third_party/yaml-cpp b/third_party/yaml-cpp ---- a/third_party/yaml-cpp -+++ b/third_party/yaml-cpp -@@ -1 +1 @@ --Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91 -+Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91-dirty + From 69af38186ebfd6029d6e5b1a057d6e8fa389ee08 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:07:26 +0800 Subject: [PATCH 24/58] change_metax_work (#32) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 0d3d2637cdd..c23112f0545 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -19,27 +19,28 @@ defaults: jobs: metax-gpu-test: runs-on: paddle-metax-runner-set + # runs-on: debug-paddle-runner-set steps: - name: Checkout repository run: | git config --global user.name "GitHub Actions" git config --global user.email "actions@github.com" - if [ "${{ github.event_name }}" == "pull_request" ]; then - BRANCH_NAME=${{ github.head_ref }} - else - BRANCH_NAME=${{ github.ref_name }} - fi - git clone \ --reference-if-able /home/runner/PaddleCustomDevice \ --depth=1 \ --shallow-submodules \ --jobs=8 \ - --branch $BRANCH_NAME \ + --branch ${{ github.base_ref }} \ --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
+ if [ "${{ github.event_name }}" == "pull_request" ]; then + git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head + git checkout pull/${{ github.event.pull_request.number }}/head + git submodule update --init --recursive + fi + - name: compile run: | From 7fe6f2dca92c3c0e3fb4c4ceb7f18a26560422e9 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:18:26 +0800 Subject: [PATCH 25/58] change_build (#33) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan 
<1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index de409153472..dbd583c52ea 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -22,8 +22,8 @@ pip uninstall paddlepaddle -y # init paddle git submodule sync --recursive && git submodule update --init --recursive -sleep 1000000 -unset http_proxy https_proxy +# sleep 1000000 +# unset http_proxy https_proxy # export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 From b22fc1317d786931c1aa8784ad30dd72b6dfc2fd Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 17 Sep 2025 17:58:21 +0800 Subject: [PATCH 26/58] [metax] modify fused_bias_dropout_residual_layer_norm (#34) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm --- backends/metax_gpu/patch/paddle.patch | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 667d9f75d1c..b7bdb953077 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -470,6 +470,25 @@ index 88663ec880..98b93072a3 100644 #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" +diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +index 4eae698648..5c047723ea 100644 +--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h ++++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +@@ -43,11 +43,11 @@ template + using LayerNormParamType = typename CudnnDataType::BatchNormParamType; + + inline static int GetDesiredBlockDim(int64_t block_dim) { +- const int kMaxBlockDim = 512; ++ const int kMaxBlockDim = 256; + #ifdef __HIPCC__ + const int lwarpSize = 64; + #else +- const int lwarpSize = 32; ++ const int lwarpSize = 64; + #endif + return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; + } + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h From c3d1444ef67441b9bb43f9fa5ee7c5a906a7f9df Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:18:30 +0800 Subject: [PATCH 27/58] change_build (#35) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
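For reference, the layer_norm_impl.cu.h hunk in PATCH 26 above reduces to the helper below: the chosen block dimension is either the cap or a single hardware warp, and this backend's warps are 64 lanes wide, so the CUDA fallback of 32 would fill only half a warp. A standalone restatement with the __HIPCC__ branch collapsed (both paths now use 64), not the full header:

#include <cstdint>

// Restatement of the patched GetDesiredBlockDim(): the cap is lowered
// from 512 to 256, and the fallback is one full 64-lane warp rather
// than CUDA's 32, so small rows still occupy a whole hardware warp.
inline int GetDesiredBlockDim(int64_t block_dim) {
  const int kMaxBlockDim = 256;
  const int kWarpSize = 64;
  return block_dim >= kMaxBlockDim ? kMaxBlockDim : kWarpSize;
}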
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 6 ++++-- backends/metax_gpu/build.sh | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index c23112f0545..74de39c2e13 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -31,14 +31,16 @@ jobs: --depth=1 \ --shallow-submodules \ --jobs=8 \ - --branch ${{ github.base_ref }} \ + --branch ${{ github.base_ref || github.ref_name}} \ + --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
if [ "${{ github.event_name }}" == "pull_request" ]; then git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - git submodule update --init --recursive + + # git submodule update --init --recursive fi diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dbd583c52ea..042b779a05c 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -21,7 +21,8 @@ pip uninstall paddlepaddle -y # init paddle -git submodule sync --recursive && git submodule update --init --recursive +# git submodule sync --recursive && git submodule update --init --recursive + # sleep 1000000 # unset http_proxy https_proxy From 569a867b358d9d3707c8d41dbbb0641d03e75de8 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:21:54 +0800 Subject: [PATCH 28/58] change_build (#36) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 74de39c2e13..51c0c62cef6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -32,7 +32,6 @@ jobs: --shallow-submodules \ --jobs=8 \ --branch ${{ github.base_ref || github.ref_name}} \ - --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . From 0edc6f6549fff51d459bf9a77bfbedf4e6a33beb Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:46:15 +0800 Subject: [PATCH 29/58] change_warpctc.cmake (#38) * change_warpctc.cmake --- backends/metax_gpu/cmake/warpctc.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 0733c0f9ce5..ea8e2ade754 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -35,6 +35,13 @@ else() git checkout -- . 
    && git checkout ${WARPCTC_TAG} && patch -Nd ${SOURCE_DIR} <
    ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch)
+  file(COPY ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh
+       DESTINATION ${SOURCE_DIR}/include/contrib/moderngpu/include/device/)
+  message(STATUS "Patch file path: ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh")
+  message(
+    STATUS
+      "ModernGPU device path: ${SOURCE_DIR}/include/contrib/moderngpu/include/device/"
+  )
 endif()

 if(NOT WIN32 AND WITH_GPU)

From 2688c8664cc50961267be572ed467ce4b89bc351 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Thu, 18 Sep 2025 11:44:44 +0800
Subject: [PATCH 30/58] change_warpctc.cmake (#39)

* change warpctc.cmake

---
 backends/metax_gpu/change_patch.sh     | 3 ++-
 backends/metax_gpu/cmake/warpctc.cmake | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh
index 60d74ec0f3d..f29986a3780 100644
--- a/backends/metax_gpu/change_patch.sh
+++ b/backends/metax_gpu/change_patch.sh
@@ -21,8 +21,9 @@ unzip mcEigen_3.4.0_paddle_final.zip
 mv mcEigen_3.4.0_paddle_final eigen3
 cd ..
 cp -r patch/eigen3/ ../../Paddle/third_party/eigen3
+rm -r patch/eigen3
 cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core
 cd ../../Paddle/
 git apply --verbose ../backends/metax_gpu/patch/paddle.patch
 cd -
-cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/
+# cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/
diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake
index ea8e2ade754..5d668032fb1 100644
--- a/backends/metax_gpu/cmake/warpctc.cmake
+++ b/backends/metax_gpu/cmake/warpctc.cmake
@@ -108,6 +108,10 @@ else()
   set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG})
 endif()

+set(COPY_COMMAND
+    ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh"
+    "${SOURCE_DIR}/include/contrib/moderngpu/include/device/")
+
 ExternalProject_Add(
   extern_warpctc
   ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -117,6 +121,7 @@ ExternalProject_Add(
   PATCH_COMMAND
   COMMAND ${WARPCTC_PATCH_COMMAND}
   COMMAND ${WARPCTC_PATCH_CUDA_COMMAND}
+  COMMAND ${COPY_COMMAND}
   COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND}
   # BUILD_ALWAYS 1
   CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

From 6f031fe12a2020044b898b2b2921c899df3d4e3a Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Thu, 18 Sep 2025 12:10:23 +0800
Subject: [PATCH 31/58] test (#40)

* test

---------

---
 backends/metax_gpu/tests/run_test.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh
index 95cce650e6b..92dea2b492b 100755
--- a/backends/metax_gpu/tests/run_test.sh
+++ b/backends/metax_gpu/tests/run_test.sh
@@ -22,6 +22,8 @@ TEST_PATH1="${SCRIPT_DIR}/../../../python"
 TEST_PATH2="${SCRIPT_DIR}/../../../python/tests"
 export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}"

+export
+sleep 1000000
 rm -r build
 mkdir -p build && cd build

From e84d399d6056f6dd017031514045a608e717b223 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Thu, 18 Sep 2025 14:42:12 +0800
Subject: [PATCH 32/58] test_ut (#41)

* change_run_ut

---------

---
 backends/metax_gpu/tests/run_test.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh
index 92dea2b492b..7d1e8e072a9 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -23,7 +23,8 @@ TEST_PATH2="${SCRIPT_DIR}/../../../python/tests" export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}" export -sleep 1000000 +# sleep 1000000 + rm -r build mkdir -p build && cd build @@ -34,4 +35,4 @@ cmake .. cmake --build . -ctest -j1 --output-on-failure +ctest -j10 --output-on-failure From b5f2feb398cae8217d1dff39a5e7ef31afa0e02d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:59:28 +0800 Subject: [PATCH 33/58] tets (#43) * remove_tets --------- --- backends/metax_gpu/build.sh | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 042b779a05c..9ca589a7807 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -57,7 +57,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j60 echo "install whl" diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 410ef006514..08273782be6 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -81,8 +81,7 @@ list( ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reduce_op.py) + ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) list( REMOVE_ITEM From e20eca7e6f9846583293e988b7484380a25f314f Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:53:51 +0800 Subject: [PATCH 34/58] test (#44) * test --------- --- backends/metax_gpu/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 08273782be6..795a3c5b8ac 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -95,7 +95,7 @@ list( ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口适配问题 + # op_test.py 里 self._get_places()接口的适配问题 ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # device == "gpu" 适配问题 From e37f633a4d440a25126273ccddd7c3ff23288a02 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Fri, 19 Sep 2025 18:30:47 +0800 Subject: [PATCH 35/58] [metax] modify compile (#42) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas --- backends/metax_gpu/CMakeLists.txt | 40 +- backends/metax_gpu/compile.sh | 2 +- .../kernels/funcs/blas/blas_impl.cu.h | 1270 ++++++++--------- .../fused_adam_kernel_register.cu | 0 ...esidual_layer_norm_grad_kernel_register.cu | 0 ...out_residual_layer_norm_kernel_register.cu | 0 ...dding_eltwise_layernorm_kernel_register.cu | 0 .../fused_layernorm_kernel_register.cu | 0 
.../fused_seqpool_cvm_grad_kernel_register.cu | 0 .../fused_seqpool_cvm_kernel_register.cu | 0 ...fused_softmax_mask_grad_kernel_register.cu | 0 .../fused_softmax_mask_kernel_register.cu | 0 ...max_mask_upper_triangle_kernel_register.cu | 0 ...d_stack_transpose_quant_kernel_register.cu | 0 ...sed_swiglu_weighted_bwd_kernel_register.cu | 30 + .../fused_token_prune_kernel_register.cu | 0 ...d_transpose_split_quant_kernel_register.cu | 0 ...nspose_wlch_split_quant_kernel_register.cu | 0 .../kernels/metax_kernel/metax_context.cc | 35 - .../kernels/metax_kernel/metax_context.h | 2 - 20 files changed, 597 insertions(+), 782 deletions(-) mode change 100755 => 100644 backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_adam_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_bias_dropout_residual_layer_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_embedding_eltwise_layernorm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_layernorm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_seqpool_cvm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_seqpool_cvm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_upper_triangle_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_stack_transpose_quant_kernel_register.cu (100%) create mode 100644 backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_token_prune_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_transpose_split_quant_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_transpose_wlch_split_quant_kernel_register.cu (100%) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index f282a9fbf7c..7b8c52f1f31 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -70,7 +70,6 @@ include(eigen) include(xxhash) include(zlib) include(protobuf) -include(generate_pb) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) @@ -614,12 +613,9 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/backends/context_pool.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/binomial_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -642,29 +638,11 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gather_tree_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_reindex_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_act_dequant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/fp8_fp8_half_gemm.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/variable_length_memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_real_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_grad_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps @@ -697,7 +675,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/int_bincount_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu) file( @@ -707,6 +684,8 @@ file( passes/*.cc kernels/*.cc kernels/*.cu + kernels/fusion/*.cc + kernels/fusion/*.cu kernels/gpudnn/*.cc kernels/gpudnn/*.cu kernels/cuda_kernels/*.cc @@ -721,13 +700,7 @@ set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA) set(CMAKE_CUCC_COMPILER "cucc") set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/") -set_source_files_properties( - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu - PROPERTIES LANGUAGE CUDA) -add_library( - 
${TARGET_NAME} SHARED - ${CUSTOM_DEVICE_SRCS} - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu) +add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS}) target_include_directories( ${TARGET_NAME} @@ -753,9 +726,6 @@ target_link_libraries( ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh index e9860ccb7d0..eba45a9ced2 100644 --- a/backends/metax_gpu/compile.sh +++ b/backends/metax_gpu/compile.sh @@ -30,7 +30,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j10 diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h old mode 100755 new mode 100644 index 419387cc9c4..ae4baa52613 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h +++ b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h @@ -34,70 +34,6 @@ PHI_DECLARE_bool(gemm_use_half_precision_compute_type); namespace phi { namespace funcs { - -inline static cublasHandle_t blas_handle_ = nullptr; -inline static cublasHandle_t blas_tensor_core_handle_ = nullptr; -inline static cublasHandle_t blas_tf32_tensor_core_handle_ = nullptr; - -inline std::once_flag flag_sparse_; -inline std::once_flag flag_blas_; -inline std::once_flag flag_blaslt_; -inline std::once_flag flag_dnn_; -inline std::once_flag flag_solver_; -inline std::once_flag flag_cublas_; -inline std::once_flag flag_tensorcore_cublas_; -inline std::once_flag flag_eigen_device_; - -inline std::mutex blas_mtx_; -inline std::mutex blas_tensor_core_mtx_; -inline std::mutex blas_tf32_mtx_; -inline std::mutex sparse_mtx_; -inline std::mutex stream_call_back_mtx_; - -inline void InitBlasHandle(cublasHandle_t *blas_handle, gpuStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(*blas_handle, stream)); -} - -inline void CublasCall(const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - std::lock_guard guard(blas_mtx_); - callback(blas_handle_); -} - -inline bool MetaxTensorCoreAvailable() { - return blas_tensor_core_handle_ != nullptr; -} - -inline void TensorCoreCublasCallIfAvailable( - const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_tensorcore_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - if (blas_tensor_core_handle_ != nullptr) { - std::lock_guard guard(blas_tensor_core_mtx_); - callback(blas_tensor_core_handle_); - } else { - std::lock_guard guard(blas_mtx_); - 
callback(blas_handle_); - } -} - template struct CUBlas; @@ -174,28 +110,26 @@ struct CUBlas { // here. #if CUDA_VERSION >= 8000 VLOG(5) << "use_tensor_op_math: " - << (MetaxTensorCoreAvailable() ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); - }, - dev_ctx->stream()); + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasSgemmEx is not supported on cuda <= 7.5")); @@ -376,7 +310,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -386,31 +320,29 @@ struct CUBlas { thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A_ptr.data().get(), - Atype, - lda, - B_ptr.data().get(), - Btype, - ldb, - beta, - C_ptr.data().get(), - Ctype, - ldc, - batchCount, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A_ptr.data().get(), + Atype, + lda, + B_ptr.data().get(), + Btype, + ldb, + beta, + C_ptr.data().get(), + Ctype, + ldc, + batchCount, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmBatchedEx is not supported on cuda <= 7.5")); @@ -486,7 +418,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -494,29 +426,27 @@ struct CUBlas { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -696,7 +626,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -704,29 +634,27 @@ struct CUBlas> { << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1024,7 +952,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1032,29 +960,27 @@ struct CUBlas> { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1186,24 +1112,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N); + }); } #if CUDA_VERSION >= 8000 @@ -1271,24 +1195,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - h_B, - ldb, - h_A, - lda, - &h_beta, - h_C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + h_B, + ldb, + h_A, + lda, + &h_beta, + h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -1352,24 +1274,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - static_cast(ldb), - A, - static_cast(lda), - &t_beta, - C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &t_alpha, + B, + static_cast(ldb), + A, + static_cast(lda), + &t_beta, + C, + static_cast(N)); + }); } #if CUDA_VERSION >= 8000 @@ -1447,24 +1367,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, CUBLAS_COMPUTE_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &h_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &h_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1503,7 +1421,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = 
dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1519,30 +1437,27 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + N, + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -1621,24 +1536,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1713,24 +1626,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1769,7 +1680,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1784,30 +1695,28 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - A, - CUDA_R_16BF, - static_cast(lda), - &h_beta, - C, - CUDA_R_16BF, - static_cast(N), - CUDA_R_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + A, + CUDA_R_16BF, + static_cast(lda), + &h_beta, + C, + CUDA_R_16BF, + 
static_cast(N), + CUDA_R_32F, + algo)); + }); } #else // raise error @@ -1860,24 +1769,22 @@ void Blas::GEMM(bool transA, } else { #endif // CUDA_VERSION >= 8000 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); #if CUDA_VERSION >= 8000 } @@ -1904,24 +1811,22 @@ inline void Blas::GEMM(bool transA, cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); } template <> @@ -1957,36 +1862,33 @@ inline void Blas::GEMM(bool transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - ldc, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -1998,27 +1900,23 @@ inline void Blas::GEMM(bool transA, template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> template void Blas::SCAL(int n, const T alpha, T *x) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } template <> template void Blas::VCOPY(int n, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } template <> @@ -2033,12 +1931,9 @@ void Blas::GEMV(bool trans_a, T *C) const { cublasOperation_t cuTransA = !trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMV( - handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -2112,7 +2007,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2153,60 +2048,56 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + a, + B, + fp, + ldb, + strideB, + A, + fp, + lda, + strideA, + b, + C, + fp, + ldc, + strideC, + batchCount, + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &beta, - C, - ldc, - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &beta, + C, + ldc, + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2242,7 +2133,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2284,61 +2175,57 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - a, - B, - fp, - static_cast(ldb), - strideB, - A, - fp, - static_cast(lda), - strideA, - b, - C, - fp, - static_cast(ldc), - strideC, - static_cast(batchCount), - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( + handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + a, + B, + fp, + static_cast(ldb), 
+ strideB, + A, + fp, + static_cast(lda), + strideA, + b, + C, + fp, + static_cast(ldc), + strideC, + static_cast(batchCount), + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 T h_alpha = static_cast(alpha); T h_beta = static_cast(beta); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &h_beta, - C, - static_cast(ldc), - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &h_beta, + C, + static_cast(ldc), + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2377,7 +2264,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2392,34 +2279,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2460,7 +2345,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2475,34 +2360,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + 
PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2547,7 +2430,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // (std::is_same::value)) || // std::is_same::value) { // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx_.tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } @@ -2579,7 +2462,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // #endif // } -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2605,12 +2488,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // compute_type, // algo)); -// }, -// dev_ctx_.stream()); +// }); // } else { // #endif // CUDA_VERSION >= 9010 -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // CUBlas::GEMM_STRIDED_BATCH(handle, // cuTransB, @@ -2667,7 +2549,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // cublasOperation_t cuTransB = // (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; // const int64_t strideC = M * N; -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasDgemmStridedBatched(handle, @@ -2723,14 +2605,14 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // float h_beta = static_cast(beta); // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx->tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } // VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : // "False"); -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2756,8 +2638,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // CUBLAS_COMPUTE_32F, // algo)); -// }, -// dev_ctx_.stream()); +// }); // #else // // raise error // PADDLE_THROW(phi::errors::Unimplemented( @@ -2812,25 +2693,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2859,25 +2738,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2970,7 +2847,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float f_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2979,31 +2856,29 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B_ptr.data().get(), - CUDA_R_16BF, - ldb, - A_ptr.data().get(), - CUDA_R_16BF, - lda, - &f_beta, - C_ptr.data().get(), - CUDA_R_16BF, - ldc, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B_ptr.data().get(), + CUDA_R_16BF, + ldb, + A_ptr.data().get(), + CUDA_R_16BF, + lda, + &f_beta, + C_ptr.data().get(), + CUDA_R_16BF, + ldc, + batchCount, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -3038,33 +2913,19 @@ void Blas::TRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM( + handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); + }); } template <> template void Blas::BatchedGETRF( int n, T **a, int *ipiv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); } template <> @@ -3084,23 +2945,18 @@ void Blas::BatchedGETRI(int n, "overlap memory space of input matrix (address: %p).", a_inv, a)); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRI_BATCH( - handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); } template <> template void Blas::BatchedMatInv( int n, const T **a, T **a_inv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); } template <> @@ -3118,12 +2974,10 @@ void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, // use CUBLAS_OP_C (conjugate transpose) for complex cublasOperation_t cuTrans = (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH( + handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); + }); } template <> @@ -3152,23 +3006,21 @@ void Blas::BatchedTRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, + cuSide, + cuUplo, + cuTransA, + cuDiag, + N, + M, + &alpha, + A, + lda, + B, + ldb, + batch_size); + }); } } // namespace funcs diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu 
diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu new file mode 100644 index 00000000000..08876233bfb --- /dev/null +++ b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
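+// The registration below pulls in Paddle's CUDA FusedSwigluWeightedBwdKernel +// by including the upstream .cu source, and pins the output dtypes through +// OutputAt(): outputs 0 and 2 are forced to BFLOAT16 and output 1 to FLOAT32, +// independent of the registered input dtypes.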
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_swiglu_weighted_bwd, + metax_gpu, + ALL_LAYOUT, + phi::FusedSwigluWeightedBwdKernel, + float, + double, + int, + int64_t, + phi::bfloat16) { + kernel->OutputAt(0).SetDataType(phi::DataType::BFLOAT16); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::BFLOAT16); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 62aaa5fb2de..a388387de45 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,25 +15,6 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { -const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } - void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -87,20 +68,4 @@ static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { phi::dynload::hipblasLtCreate(blaslt_handle); #endif } - -blasLtHandle_t GetBlasLtHandle() { - std::call_once(flag_blaslt_, [&]() { - if (!blaslt_handle_) { - if (!blaslt_handle_creator_) - InitBlasLtHandle(&blaslt_handle_); - else - blaslt_handle_ = blaslt_handle_creator_(); - } - }); - PADDLE_ENFORCE_NOT_NULL( - blaslt_handle_, - common::errors::InvalidArgument( - "The GPU blasLt handle is nullptr. 
It must not be null.")); - return blaslt_handle_; -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index a6610c1dab2..2339e18a4a6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -128,8 +128,6 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } -bool AllowTF32Cublas(); -bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { From 1af5148d20ce28e202fb0ac672f266c807d98b17 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:31:14 +0800 Subject: [PATCH 36/58] [Metax] add log analysis script (#46) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script --- .../metax_gpu/tests/scripts/classify.json | 22 ++ .../metax_gpu/tests/scripts/log_analysis.py | 216 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 backends/metax_gpu/tests/scripts/classify.json create mode 100644 backends/metax_gpu/tests/scripts/log_analysis.py diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json new file mode 100644 index 00000000000..b97255adc3d --- /dev/null +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -0,0 +1,22 @@ +{ + "OK":{ + "skipped":{ + "rule":["skipped="] + } + }, + + "FAILED":{ + "precision":{ + "rule":["Mismatched elements"] + }, + "api":{ + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + }, + "missing":{ + "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + }, + "file_not_found":{ + "rule":["FileNotFoundError:"] + } + } +} diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py new file mode 100644 index 00000000000..c0716f5b6f5 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -0,0 +1,216 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
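+# Buckets unit-test logs using the keyword rules from classify.json: each +# file matching the pattern is scanned line by line against the FAILED/OK +# rules and the test name is appended to the matching category. Minimal +# usage sketch (paths are illustrative): +#   analyzer = LogAnalyzer("./classify.json", "./logs", pattern="test_*.log") +#   analyzer.run(); analyzer.show_result(); analyzer.save_result("./output")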
+ +import json +import os +import fnmatch +import shutil +from enum import Enum + + +class TestResult(Enum): + OK = "OK" + FAILURE = "FAILED" + + +class LogAnalyzer: + def __init__( + self, + classify_file: str, + search_path: str, + pattern: str = None, + encoding: str = "utf-8", + ): + self.__pattern = pattern + self.__search_path = search_path + self.__encoding = encoding + self.__statistical_data = {} + + self.__classify_data = self.__read_json_file(classify_file) + for key, value in self.__classify_data.items(): + self.__statistical_data[key] = {} + for sub_key in list(value.keys()): + self.__statistical_data[key][sub_key] = [] + + self.__statistical_data[TestResult.OK.value]["noskip"] = [] + self.__statistical_data[TestResult.FAILURE.value]["other"] = [] + + def __read_json_file(self, path: str) -> dict: + with open(path, "r", encoding=self.__encoding) as f: + data = json.load(f) + f.close() + return data + + def __check_path(self, path: str) -> None: + """ + Prepare the given path: + - directory path: create it if missing, otherwise clear its contents + - file path: create it if missing, otherwise clear its contents + """ + try: + # Check whether the path exists + if os.path.exists(path): + # The path exists; decide whether it is a file or a directory + if os.path.isfile(path): + # A file: truncate its contents + with open(path, "w", encoding="utf-8") as f: + f.write("")  # write empty content to clear the file + # print(f"File already exists, contents cleared: {path}") + + elif os.path.isdir(path): + # A directory: remove everything inside it + for item in os.listdir(path): + item_path = os.path.join(path, item) + if os.path.isfile(item_path) or os.path.islink(item_path): + os.remove(item_path)  # remove the file or symlink + elif os.path.isdir(item_path): + shutil.rmtree(item_path)  # recursively remove the subdirectory + # print(f"Directory already exists, contents cleared: {path}") + else: + # The path does not exist; infer the target type + # (from whether the last component has an extension) + last_part = os.path.basename(path) + + # Treat it as a file path if it contains an extension + if "." in last_part and not last_part.endswith("."): + # Create the file (including parent directories) + parent_dir = os.path.dirname(path) + if parent_dir and not os.path.exists(parent_dir): + os.makedirs(parent_dir, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + pass  # create an empty file + # print(f"File did not exist, created: {path}") + + else: + # Create the directory (multi-level paths supported) + os.makedirs(path, exist_ok=True) + # print(f"Directory did not exist, created: {path}") + + except PermissionError: + print(f"Permission error: cannot operate on path {path}") + except Exception as e: + print(f"Error while handling path: {str(e)}") + + def save_result(self, dir_path: str = "./") -> None: + """ + Write each category's op names into a per-category text file under + dir_path, creating the target directory if missing and clearing it + if it already exists. + """ + + for key, value in self.__statistical_data.items(): + sub_dir = os.path.join(dir_path, key) + self.__check_path(sub_dir) + + for sub_key, sub_value in value.items(): + # print(f"{sub_key}: {len(value[sub_key])} - ({sub_value})") + try: + with open( + os.path.join(sub_dir, sub_key) + ".txt", "w", encoding="utf-8" + ) as f: + for op_name in sub_value: + if not op_name.endswith("\n"): + op_name += "\n" + f.write(op_name) + # print(f"Content successfully written to {file_path}") + except Exception as e: + print(f"Failed to write file: {e}") + + def show_result(self) -> None: + test_counts = 0 + for key, value in self.__statistical_data.items(): + print(f"\n---------- {key} ----------") + for sub_key, sub_value in value.items(): + test_counts = test_counts + len(value[sub_key]) + print(f"{sub_key}: {len(value[sub_key])}\n\t{sub_value}\n") + print( + f"\n******************* Total log num: {test_counts} *******************\n\n" + ) + + def run(self): + """ + Read the files under the search path whose names match the pattern + and walk through each one line by line. + + Args: + search_path: root directory to search + pattern: filename pattern (wildcards supported, e.g. '*.txt', 'file_*.log') + """ + for dirpath, dirnames, filenames in os.walk(self.__search_path): + for filename in fnmatch.filter(filenames, self.__pattern): + file_path = os.path.join(dirpath,
filename) + # print(f"\n===== Processing file: {file_path} =====") + + cur_res_type = TestResult.FAILURE + cur_sub_type = "other" + pre_line = None + finish_early = False + + try: + with open(file_path, "r", encoding=self.__encoding) as f: + for line in f: + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for keyword in sub_type_params["rule"]: + if keyword in line: + cur_sub_type = sub_type + if sub_type == "missing": + finish_early = True + break + + if finish_early: + break + + pre_line = line + if finish_early: + break + + if "OK" in pre_line: + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + + op_name = filename.split(".") + if cur_sub_type is None: + self.__statistical_data[cur_res_type.value][ + "noskip" + ].append(op_name[0]) + else: + self.__statistical_data[cur_res_type.value][ + cur_sub_type + ].append(op_name[0]) + # print(f"Result: {cur_res_type.value}, type: {cur_sub_type}") + f.close() + except UnicodeDecodeError: + print(f"Warning: file {file_path} is not utf-8 encoded, skipping it") + except Exception as e: + print(f"Error while processing file {file_path}: {str(e)}") + + +if __name__ == "__main__": + + analyzer = LogAnalyzer( + classify_file="./classify.json", + search_path="./NPU_logs/20250918_065326", + pattern="test_*.log", + ) + + analyzer.run() + analyzer.show_result() + analyzer.save_result("./output") From 518bee8382cdb7879f38e8b81e719aa8853b825e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 19 Sep 2025 19:07:47 +0800 Subject: [PATCH 37/58] add_generate_pb (#47) * add_generate_pb --------- --- backends/metax_gpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 7b8c52f1f31..78b4c9c566b 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -70,6 +70,7 @@ include(eigen) include(xxhash) include(zlib) include(protobuf) +include(generate_pb) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) From bc02549e7450cffb6b6925ef199b6f6fcbd63259 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 22 Sep 2025 16:44:28 +0800 Subject: [PATCH 38/58] modify blas (#51) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas --- backends/metax_gpu/CMakeLists.txt | 1 + .../metax_gpu/kernels/metax_kernel/metax_context.cc | 12 ------------ .../metax_gpu/kernels/metax_kernel/metax_context.h | 4 +--- backends/metax_gpu/patch/paddle.patch | 1 - 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 78b4c9c566b..b98f2bcc919 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -733,6 +733,7 @@ target_compile_definitions( ${TARGET_NAME} PUBLIC PADDLE_WITH_CUDA=1 PADDLE_WITH_CUSTOM_DEVICE=1 + mcblasContext=cublasContext GPUContext=CustomContext KPSContext=CustomContext STREAM_TYPE=cudaStream_t diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index
a388387de45..6d86c81041f 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -56,16 +56,4 @@ void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { allocation_.reset(); allocation_ = allocator_->Allocate(required_workspace_bytes); } - -static std::function blaslt_handle_creator_{nullptr}; -static blasLtHandle_t blaslt_handle_{nullptr}; -static std::once_flag flag_blaslt_; - -static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - mcblasLtCreate(blaslt_handle); -#elif defined(PADDLE_WITH_HIP) - phi::dynload::hipblasLtCreate(blaslt_handle); -#endif -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 2339e18a4a6..376981f27a4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -27,9 +27,7 @@ #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" -using blasLtHandle_t = struct mcblasLtContext*; - -blasLtHandle_t GetBlasLtHandle(); +cublasLtHandle_t GetBlasLtHandle(); namespace phi { class DnnWorkspaceHandle { diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index b7bdb953077..beefb730bf7 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -488,7 +488,6 @@ index 4eae698648..5c047723ea 100644 #endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; } - diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h From 1977ca87be51518f59506d37c08790938e4c1345 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 22 Sep 2025 17:31:21 +0800 Subject: [PATCH 39/58] [metax] modify tf32 (#52) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context --- .../kernels/metax_kernel/metax_context.cc | 18 ++++++++++++++++++ .../kernels/metax_kernel/metax_context.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 6d86c81041f..efddba5f00b 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,6 +15,24 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return true; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 376981f27a4..2d761439089 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -30,6 +30,8 @@ cublasLtHandle_t GetBlasLtHandle(); namespace phi { +bool AllowTF32Cublas(); +bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) From 1ae2618ac81e21e41b05797e08f1330eb504c4d5 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:46:50 +0800 Subject: [PATCH 40/58] [Metax] update metax backend CI test (#53) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test --- backends/metax_gpu/tests/CMakeLists.txt | 192 +++++++++++------------- backends/metax_gpu/tests/default.txt | 67 +++++++++ backends/metax_gpu/tests/run_test.sh | 56 ++++++- 3 files changed, 202 insertions(+), 113 deletions(-) create mode 100644 backends/metax_gpu/tests/default.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 795a3c5b8ac..ded54233f24 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -11,117 +11,95 @@ set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") -list( - APPEND - PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - 
${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) - -list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # precision issues - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # adaptation issue with the self._get_places() interface in op_test.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" adaptation issue - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # same errors as paddle-gpu - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace issue - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +if(NOT TEST_LIST_FILE) + message( + STATUS + "<TEST_LIST_FILE> is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used."
+ ) + file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) + +else() + if(NOT EXISTS ${TEST_LIST_FILE}) + message(FATAL_ERROR "<TEST_LIST_FILE> does not exist, please check it again.") + endif() + + file(STRINGS ${TEST_LIST_FILE} TEST_PROGRAMS) + + if(NOT TEST_PROGRAMS) + message(FATAL_ERROR "<TEST_LIST_FILE> is empty.") + endif() + + set(PYTHON_TEST_SCRIPTS "") +endif() + +foreach(test_name ${TEST_PROGRAMS}) + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) + message(WARNING "${CURRENT_TEST_PROGRAM} does not exist, skipping it.") + else() + list(APPEND PYTHON_TEST_SCRIPTS ${CURRENT_TEST_PROGRAM}) + endif() +endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) + +if(NOT TEST_LIST_FILE) + list( + REMOVE_ITEM + PYTHON_TEST_SCRIPTS + # precision issues + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # adaptation issue with the self._get_places() interface in op_test.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" adaptation issue + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # same errors as paddle-gpu + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace issue + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +endif() + +if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) + file(MAKE_DIRECTORY ${LOG_OUTPUT_DIR}) + message(WARNING "${LOG_OUTPUT_DIR} does not exist, creating it now.") +endif() + foreach(test_script ${PYTHON_TEST_SCRIPTS}) get_filename_component(test_name ${test_script} NAME_WE) - add_test( - NAME "python_${test_name}" - COMMAND ${Python_EXECUTABLE} ${test_script} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + if(LOG_OUTPUT_DIR) + set(test_log_file "${LOG_OUTPUT_DIR}/${test_name}.log") + + add_test( + NAME "python_${test_name}" + COMMAND sh -c + "${Python_EXECUTABLE} ${test_script} > ${test_log_file} 2>&1" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + + else() + add_test( + NAME "python_${test_name}" + COMMAND ${Python_EXECUTABLE} ${test_script} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) endforeach() diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt new file mode 100644 index 00000000000..8e2c3bcdd7e --- /dev/null +++ b/backends/metax_gpu/tests/default.txt @@ -0,0 +1,67 @@ +test_accuracy_op +test_tril_triu_op +test_where_op +test_split_op +test_fill_constant_op +test_empty_op +test_sign_op +test_cast_op +test_index_add_op +test_unbind_op +test_put_along_axis_op +test_layer_norm_op +test_maximum_op +test_accuracy_op +test_strided_slice_op +test_sum_op +test_set_value_op +test_flatten_contiguous_range_op +test_top_k_op +test_subtract_op +test_softmax_op +test_cumsum_op
+test_greater_equal_op +test_elementwise_div_op +test_top_k_v2_op +test_stack_op +test_one_hot_v2_op +test_fill_any_op +test_gather_op +test_reshape_op +test_index_put_op +test_bitwise_op +test_max_op +test_pad_op +test_elementwise_pow_op +test_uniform_random_op +test_scatter_op +test_cast_op +test_zeros_like_op +test_compare_op +test_shape_op +test_tril_triu_op +test_slice_op +test_elementwise_add_op +test_index_put_op +test_bincount_op +test_assign_op +test_logical_op +test_squared_l2_norm_op +test_mean_op +test_fused_bias_act_op +test_expand_v2_op +test_adamw_op +test_gather_nd_op +test_concat_op +test_scatter_nd_op +test_elementwise_floordiv_op +test_elementwise_mul_op +test_transpose_op +test_einsum_op +test_randint_op +test_c_embedding_op +test_numel_op +test_scale_op +test_softmax_with_cross_entropy_op +test_full_op +test_scatter_op diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index 7d1e8e072a9..b9e8ec5b5cc 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,10 +29,54 @@ export rm -r build mkdir -p build && cd build -cmake .. +TEST_LOG_LEVEL=0 +TEST_LIST_FILE="" +TEST_LOG_OUTPUT_DIR="" +TEST_PARALLEL_NUM=10 -cmake --build . +while getopts "i:o:v:j:h" opt; do + case "$opt" in + i) + TEST_LIST_FILE="$OPTARG" + ;; + o) + TEST_LOG_OUTPUT_DIR="$OPTARG" + echo "Set log output dir [ $TEST_LOG_OUTPUT_DIR ]" + ;; + v) + TEST_LOG_LEVEL=$OPTARG + ;; + j) + TEST_PARALLEL_NUM="$OPTARG" + ;; + h) + echo "Usage: $0 -i <test list file> -o <log output dir> ..." + echo "Options:" + echo "  -i  test program list file" + echo "  -o  log output directory" + echo "  -v  GLOG_v log level" + echo "  -j  number of parallel ctest jobs" + echo "  -h  show this help" + exit 0 + ;; + \?) + echo "error: unknown option '-$OPTARG'." + exit 1 + ;; + :) + echo "error: option '-$OPTARG' requires an argument." + exit 1 + ;; + esac +done + + +export GLOG_v=$TEST_LOG_LEVEL -ctest -j10 --output-on-failure +cmake .. -DTEST_LIST_FILE=$TEST_LIST_FILE -DLOG_OUTPUT_DIR=$TEST_LOG_OUTPUT_DIR + +cmake --build .
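+ +# Run the selected tests through ctest; parallelism comes from the -j flag +# (e.g. "bash run_test.sh -i default.txt -o ./logs -j 8" -- an illustrative +# invocation, not one taken from the CI config).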
+ +ctest -j$TEST_PARALLEL_NUM --output-on-failure From 76d5eb0245904cc209e52dd9fa92dea990db1ad7 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:43:37 +0800 Subject: [PATCH 41/58] [Metax] fix log_analysis.py bug (#54) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug --- .../metax_gpu/tests/scripts/log_analysis.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py index c0716f5b6f5..963d50751f7 100644 --- a/backends/metax_gpu/tests/scripts/log_analysis.py +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -153,7 +153,6 @@ def run(self): cur_res_type = TestResult.FAILURE cur_sub_type = "other" - pre_line = None finish_early = False @@ -172,19 +171,19 @@ def run(self): if finish_early: break - pre_line = line if finish_early: break - if "OK" in pre_line: - cur_res_type = TestResult.OK - cur_sub_type = None - for sub_type, sub_type_params in self.__classify_data[ - cur_res_type.value - ].items(): - for rule in sub_type_params["rule"]: - if rule in line: - cur_sub_type = sub_type + if len(line) >= 2 and line[:2] == "OK": + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + break op_name = filename.split(".") From 9c17b6e0867119ea51c1c4230603f2a34137ac68 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:09:44 +0800 Subject: [PATCH 42/58] [Metax] update metax CI CMakeLists & scripts (#56) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug * [Metax] update metax CI CMakeLists & scripts --- .github/workflows/metax_work.yaml | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 4 ++- backends/metax_gpu/tests/run_test.sh | 2 +- .../metax_gpu/tests/scripts/classify.json | 31 +++++++++++++++++-- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 51c0c62cef6..aff530d475c 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -51,4 +51,4 @@ jobs: - name: run test run: | cd backends/metax_gpu/tests - bash run_test.sh + bash run_test.sh -j 16 diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index ded54233f24..5b7be15e4f9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -47,6 +47,8 @@ if(NOT TEST_LIST_FILE) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS + # Metax unit test + ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py # precision issues ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py @@ -101,5 +103,5 @@ foreach(test_script ${PYTHON_TEST_SCRIPTS}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() - set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 600)
endforeach() diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index b9e8ec5b5cc..7f2277fe4fb 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -33,7 +33,7 @@ mkdir -p build && cd build TEST_LOG_LEVEL=0 TEST_LIST_FILE="" TEST_LOG_OUTPUT_DIR="" -TEST_PARALLEL_NUM=10 +TEST_PARALLEL_NUM=1 while getopts "i:o:v:j:h" opt; do case "$opt" in diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json index b97255adc3d..ca92ad4a0a4 100644 --- a/backends/metax_gpu/tests/scripts/classify.json +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -7,13 +7,38 @@ "FAILED":{ "precision":{ - "rule":["Mismatched elements"] + "rule":["Mismatched elements", + "RuntimeError: Jacobian mismatch for output 0 in y with respect to input 0 in x on Place(metax_gpu:0),", + "AssertionError: np.float64("] }, "api":{ - "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", + "ValueError: The API paddle.device.cuda.get_device_properties", + "TypeError: paddle.index_add api", + "RuntimeError: (Unavailable) Paddle is not compiled with CUDA.", + "ValueError: invalid literal for int() with base", + "AttributeError: module 'paddle.base.libpaddle' has no attribute 'cudnn_version'", + "RuntimeError: Pinning memory is not supported for Place(metax_gpu:0)", + "PreconditionNotMetError: Context place error, excepted GPUPlace, but actually Place(metax_gpu:0).", + "AttributeError: module 'paddle.base.libpaddle.eager.ops.legacy' has no attribute 'fused_gemm_epilogue'", + "ValueError: The device should not be 'gpu', since PaddlePaddle is not compiled with CUDA"] }, "missing":{ - "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + "rule":["missing metax_gpu kernel", + "missing ONEDNN kernel", + "UnimplementedError: There are no kernels which are registered", + "symbol lookup error:", + "RuntimeError: (NotFound) The kernel"] + }, + "core_dumped":{ + "rule":["Segmentation fault"] + }, + "input_dim":{ + "rule":["ValueError: (InvalidArgument) The Input(", + "Test range of input is out of bound"] + }, + "array_dim":{ + "rule":["Arrays are not equal"] }, "file_not_found":{ "rule":["FileNotFoundError:"] From 51c98a20020ba61b2bfab54abf11668a9f40e0b6 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:11:49 +0800 Subject: [PATCH 43/58] [Metax] fix MatmulKernel problem (#57) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug * [Metax] update metax CI CMakeLists & scripts * [Metax] fix MatmulKernel problem * [Metax] update metax CI program --- .../kernels/impl/matmul_kernel_impl.h | 19 +- backends/metax_gpu/tests/CMakeLists.txt | 2 +- backends/metax_gpu/tests/default.txt | 258 ++++++++++++ ...r_equal.py => test_greater_equal_metax.py} | 0 ...ild_src_rank_and_local_expert_id_metax.py} | 0 ...cubate_expand_modality_expert_id_metax.py} | 0 ....py => test_incubate_moe_combine_metax.py} | 0 ...e_dispatch_partial_nosoftmaxtopk_metax.py} | 0 ..._moe_gate_dispatch_w_permute_bwd_metax.py} | 0 ...bate_moe_gate_dispatch_w_permute_metax.py} | 0 ...layer_norm.py => 
test_layer_norm_metax.py} | 0 ...l_op__metax.py => test_matmul_op_metax.py} | 0 ...mpling.py => test_top_p_sampling_metax.py} | 0 .../tests/unittest/test_matmul_op__metax.py | 395 ------------------ 14 files changed, 272 insertions(+), 402 deletions(-) rename backends/metax_gpu/tests/unit_test/{test_greater_equal.py => test_greater_equal_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_build_src_rank_and_local_expert_id.py => test_incubate_build_src_rank_and_local_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_expand_modality_expert_id.py => test_incubate_expand_modality_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_combine.py => test_incubate_moe_combine_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py => test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute_bwd.py => test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute.py => test_incubate_moe_gate_dispatch_w_permute_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_layer_norm.py => test_layer_norm_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_matmul_op__metax.py => test_matmul_op_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_top_p_sampling.py => test_top_p_sampling_metax.py} (100%) delete mode 100644 backends/metax_gpu/tests/unittest/test_matmul_op__metax.py diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h index bf228c81291..5221bd93ba9 100755 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h @@ -40,6 +40,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 #include "paddle/phi/kernels/autotune/auto_tune_base.h" #endif +#include "paddle/phi/kernels/full_kernel.h" // clang-format on namespace phi { @@ -1485,16 +1486,22 @@ void MatmulKernel(const Context& ctx, bool transpose_x, bool transpose_y, DenseTensor* out) { - PADDLE_ENFORCE_NE( + if (x.numel() == 0 || y.numel() == 0) { + // input shape [1, 1, 5, 0], [1, 1, 0, 5], result shape is [1, 1, 5, 5] + phi::Full( + ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); + return; + } + PADDLE_ENFORCE_GE( common::product(x.dims()), 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( + common::errors::InvalidArgument( + "The dims of Input(X) should be greater than or equal to 0.")); + PADDLE_ENFORCE_GE( common::product(y.dims()), 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. 
")); + common::errors::InvalidArgument( + "The dims of Input(Y) should be greater than or equal to 0.")); const std::vector x_dims = common::vectorize(x.dims()); const std::vector y_dims = common::vectorize(y.dims()); MatmulJudgeDtypeKernel( diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 5b7be15e4f9..e8b11d347d9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -48,7 +48,7 @@ if(NOT TEST_LIST_FILE) REMOVE_ITEM PYTHON_TEST_SCRIPTS # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py + ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 8e2c3bcdd7e..9f073d7e92f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -65,3 +65,261 @@ test_scale_op test_softmax_with_cross_entropy_op test_full_op test_scatter_op +test_assign_pos_op +test_index_select_compatible +test_dequantize_abs_max_op +test_fill_any_op +test_fractional_max_pool3d_api +test_nll_loss +test_is_empty_op +test_norm_nn_grad +test_index_fill +test_floor +test_slice_scatter +test_nn_matmul_v2_grad +test_matmul_op_with_head +test_broadcast_shape +test_fill_constant_op +test_decayed_adagrad_op +test_count_nonzero_api +test_tensor_fill_ +test_minimum_op +test_sigmoid_focal_loss +test_dynamic_rnn_stop_gradient +test_ops_roi_align +test_split_op +test_sum_decorator +test_share_data_op +test_assert_op +test_masked_select_op +test_tensor_fill_diagonal_tensor_ +test_unfold_op +test_scatter_add_op +test_flatten_contiguous_range_op +test_empty_like_op +test_logsumexp +test_multiply +test_ceil_op +test_nearest_interp_v2_op +test_incubate_expand_modality_expert_id +test_bmm_op +test_prelu_op +test_batch_fc_op +test_masked_fill +test_overlap_add_op +test_update_loss_scaling_op +test_floor_divide_op +test_increment +test_complex_abs +test_gather_compatible +test_functional_conv2d +test_group_norm_op_v2 +test_conv2d_transpose_op_depthwise_conv +test_diagonal_op +test_maximum_op +test_erfinv_op +test_interp_recompute_scale_factor +test_embedding_scale_grad_by_freq +test_diagonal_scatter +test_higher_dim_scatter +test_infer_shape +test_flip +test_fused_bias_dropout_residual_layer_norm_op +test_greater_equal_op +test_add_op +test_cartesian_prod +test_uniform_random_inplace_op +test_feed_fetch_method +test_pow_op +test_conv3d_transpose_op +test_add_position_encoding_op +test_imperative_data_loader_base +test_rnn_cell_api +test_linspace +test_adaptive_log_softmax_with_loss +test_cross_entropy2_op +test_complex_reshape +test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk +test_gaussian_nll_loss +test_log_normal +test_unstack_op +test_expand_as_v2_op +test_dequantize_log_op +test_complex_sum_layer +test_slice_var +test_scale_op +test_hinge_embedding_loss +test_set_value_op +test_merged_adam_op +test_index_sample_op +test_cuda_empty_cache +test_add_n_op +test_randint_like +test_unique_consecutive_op +test_fill_diagonal_tensor_op +test_log_loss_op +test_linalg_cholesky_inverse +test_numel_op +test_tril_triu_op +test_adaptive_max_pool2d +test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad +test_complex_cast +test_poisson_nll_loss +test_empty_op +test_functional_conv1d_transpose +test_clip_by_norm_op +test_box_clip_op +test_clip_op +test_grad_clip_minimize +test_less_than_op +test_adamw_op +test_data_feeder 
+test_top_p_sampling +test_subtract_op +test_batch_norm_op_v2 +test_cosine_embedding_loss +test_imperative_data_parallel +test_sigmoid +test_adaptive_max_pool3d +test_roll_op +test_index_put_op +test_assign_op +test_amp_check_finite_and_scale_op +test_strided_slice_op +test_label_smooth_functional +test_c_softmax_with_cross_entropy_op +test_sync_batch_norm_op_convert +test_tensor_fill_diagonal_tensor +test_bfloat16_embedding +test_gelu_op +test_full_ +test_concat_op +test_imperative_data_loader_process +test_tensor_fill_diagonal_ +test_clip_grad_norm_ +test_eager_deletion_padding_rnn +test_pool2d_api +test_clip_grad_value_ +test_isfinite_v2_op +test_nn_sigmoid_op +test_adaptive_avg_pool2d +test_size +test_sigmoid_cross_entropy_with_logits_op +test_scatter_reduce_op +test_rsqrt +test_conv2d_transpose_layer +test_scatter_compatible +test_scatter_nd_op +test_add_op_fluid +test_unique +test_compat_split_static +test_stack_op +test_tile_op +test_adam_optimizer_fp32_fp64 +test_batch_norm_op +test_gather_nd_op +test_pow +test_executor_check_fetch_list +test_inplace_softmax_with_cross_entropy +test_cos +test_imperative_parallel_coalesce_split +test_grid_sample_function +test_rnn_decode_api +test_triu_indices_op +test_binary_cross_entropy_with_logits_op +test_mean_op_v1 +test_round_op +test_assign_pos_op_dygraph +test_nn_functional_embedding_static +test_norm_op +test_unbind_op +test_bilinear_interp_v2_op +test_tensor_data_ptr +test_norm_all +test_conv1d_transpose_layer +test_arange +test_compat_unfold +test_fetch_var +test_index_select_op +test_sign_op +test_functional_conv3d_transpose +test_uniform_random_bf16_op +test_gather_tree_op +test_histogram_bin_edges_op +test_fractional_max_pool2d_api +test_fill_any_like_op +test_alpha_dropout +test_conv3d_layer +test_compat_pad +test_box_coder_op +test_full_op +test_repeat_interleave_op +test_reshape_op +test_embedding_renorm +test_log_softmax +test_pad3d_op +test_diag_v2 +test_complex_transpose +test_prior_box_op +test_square_error_cost +test_fused_rotary_position_embedding +test_gru_rnn_op +test_restrict_nonzero +test_dygraph_weight_norm +test_conv_transpose_nn_grad +test_incubate_build_src_rank_and_local_expert_id +test_elementwise_nn_grad +test_fused_bias_dropout_residual_layer_norm_op_api +test_simple_rnn_op +test_data_generator +test_compat_split +test_scatter_add_inplace_op +test_c_softmax_with_multi_label_cross_entropy_op +test_conv3d_transpose_layer +test_less_equal_op +test_gumbel_softmax_op +test_assign_value_op +test_cast_op +test_fused_bias_act_op +test_conv3d_transpose_part2_op +test_log +test_data +test_incubate_moe_combine +test_masked_scatter +test_silu_op +test_select_scatter_op +test_adagrad_op_v2 +test_functional_conv3d +test_bce_with_logits_loss +test_argsort_op +test_layer_norm_op_v2 +test_adaptive_max_pool1d +test_shard_index_op +test_cuda_max_memory_allocated +test_roi_align_op +test_sin +test_take +test_take_along_dim +test_complex_matmul +test_reduce_as_op +test_log_normal_inplace +test_repeat +test_fetch_lod_tensor_array +test_partial_concat_op +test_accuracy_op +test_l1_norm_op +test_bce_loss +test_fused_conv2d_add_act_op +test_tril_indices_op +test_cross_entropy_op +test_blha_get_max_len_op +test_softmax_mask_fuse_op +test_diag_embed +test_one_hot_v2_op +test_selu_op +test_huber_loss_op +test_einsum_op +test_dygraph_spectral_norm +test_block_diag +test_index_elementwise +test_matmul_out diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py 
similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_greater_equal.py rename to backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_layer_norm.py rename to backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py rename to backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_top_p_sampling.py rename to backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py diff --git a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py b/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py 
deleted file mode 100644 index 7545e16d14d..00000000000 --- a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py +++ /dev/null @@ -1,395 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -from tests.op_test import OpTest -import paddle - -paddle.enable_static() -SEED = 2022 - - -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. - if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size,)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size,)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if abs(scale - 1.0) > 1e-09: - Out = Out * scale - return Out - - -class TestBmmOp(OpTest): - """ - case 0 - """ - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (10, 2, 5) - self.y_shape = (10, 5, 8) - - def init_kernel_type(self): - self.dtype = "float32" - - def setUp(self): - self.set_metax_gpu() - self.init_kernel_type() - self.config() - self.op_type = "bmm" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y - result = reference_matmul(x, y) - result = result.astype(self.dtype) - self.inputs = { - "X": x, - "Y": y, - } - self.outputs = {"Out": result} - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp1(TestBmmOp): - """ - case 1 - """ - - def config(self): - self.x_shape = (40, 10, 10) - self.y_shape = (40, 10, 10) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp2(TestBmmOp): - """ - case 2 - """ - - def config(self): - self.x_shape = (4, 10, 80) - self.y_shape = (4, 80, 1) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, - ["X", "Y"], - "Out", - max_relative_error=1e-2, - ) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - -class TestMatMulOp(OpTest): - """ - basic case - """ - - def setUp(self): - self.set_metax_gpu() - self.op_type = "matmul_v2" - self.init_dtype() - self.init_alpha() - self.config() - - X = np.random.random(self.x_shape).astype(self.dtype) - Y = 
np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - X = -0.1 + 0.2 * X - Y = -0.1 + 0.2 * Y - Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) - Out = Out.astype(self.dtype) - self.inputs = {"X": X, "Y": Y} - self.attrs = { - "trans_x": self.transpose_X, - "trans_y": self.transpose_Y, - "alpha": self.alpha, - } - self.outputs = {"Out": Out} - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (100,) - self.y_shape = (100,) - self.transpose_X = False - self.transpose_Y = False - - def init_alpha(self): - self.alpha = 1.0 - - def init_dtype(self): - self.dtype = "float32" - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-7) - - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestMatMulOp1(TestMatMulOp): - """ - case x_ndim == 1, y_ndim != 1 - """ - - def config(self): - self.x_shape = (100,) - self.y_shape = (1, 3, 2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp2(TestMatMulOp): - """ - case x_ndim != 1, y_ndim == 1 - """ - - def config(self): - self.x_shape = (1, 2, 100, 1) - self.y_shape = (100,) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp3(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp4(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp5(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (100, 2) - self.y_shape = (100, 2) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp6(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 2, 25) - self.y_shape = (25, 4) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp7(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 4, 25) - self.y_shape = (4, 25) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp8(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 25, 4) - self.y_shape = (25, 4) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp9(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 10, 5) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp10(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 10, 5) - self.y_shape = (2, 10, 5) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp11(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 5, 10) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp12(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = 100 - self.y_shape = (1, 2, 2, 100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class 
TestMatMulOp13(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = (2, 1, 100) - self.y_shape = 100 - self.transpose_X = False - self.transpose_Y = False - - -# TODO(metax_gpu): alpha will be supported in next version -# --------------------test matmul alpha-------------------- -# def create_test_alpha_class(parent): -# class TestMatMulOpAlphaCase(parent): -# def init_alpha(self): -# self.alpha = 0.125 - -# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") -# TestMatMulOpAlphaCase.__name__ = cls_name -# globals()[cls_name] = TestMatMulOpAlphaCase - -# create_test_alpha_class(TestMatMulOp) -# create_test_alpha_class(TestMatMulOp1) -# create_test_alpha_class(TestMatMulOp2) -# create_test_alpha_class(TestMatMulOp3) -# create_test_alpha_class(TestMatMulOp4) -# create_test_alpha_class(TestMatMulOp5) -# create_test_alpha_class(TestMatMulOp6) -# create_test_alpha_class(TestMatMulOp9) -# create_test_alpha_class(TestMatMulOp10) -# create_test_alpha_class(TestMatMulOp11) -# create_test_alpha_class(TestMatMulOp12) -# create_test_alpha_class(TestMatMulOp13) - - -# --------------------test matmul fp16-------------------- -def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): - class TestMatMulOpFp16Case(parent): - def init_kernel_type(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output_with_place(self.place, atol=atol) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error - ) - - cls_name = "{0}_{1}".format(parent.__name__, "Fp16") - TestMatMulOpFp16Case.__name__ = cls_name - globals()[cls_name] = TestMatMulOpFp16Case - - -create_test_fp16_class(TestMatMulOp) -create_test_fp16_class(TestMatMulOp1) -create_test_fp16_class(TestMatMulOp2) -create_test_fp16_class(TestMatMulOp3) -create_test_fp16_class(TestMatMulOp4) -create_test_fp16_class(TestMatMulOp5) -create_test_fp16_class(TestMatMulOp6) -create_test_fp16_class(TestMatMulOp9) -create_test_fp16_class(TestMatMulOp10) -create_test_fp16_class(TestMatMulOp11) -create_test_fp16_class(TestMatMulOp12) -create_test_fp16_class(TestMatMulOp13) - -if __name__ == "__main__": - unittest.main() From d113018e9befab1540aa21ee5d6f8261831e245d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:12:06 +0800 Subject: [PATCH 44/58] [metax] fix paddle bug (#58) * [metax] fix paddle bug --- backends/metax_gpu/CMakeLists.txt | 2 - .../grid_sample_grad_kernel_register.cu | 23 - .../grid_sample_kernel_register.cu | 19 - .../grid_sample_grad_kernel_register.cu | 839 ++++++++++++++++++ .../grid_sample_kernel_register.cu | 527 +++++++++++ .../metax_kernel/weight_only_linear_kernel.cu | 3 +- 6 files changed, 1368 insertions(+), 45 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index b98f2bcc919..bca1ce7aad4 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -310,8 +310,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/hinge_loss_grad_kernel.cu
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/hinge_loss_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gru_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/generate_proposals_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gammaln_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu deleted file mode 100644 index 83c47dc86db..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/grid_sample_grad_kernel.h" - -PD_CUSTOM_KERNEL_REGISTER(grid_sample_grad, - metax_gpu, - ALL_LAYOUT, - phi::GridSampleGradKernel, - float, - double) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu deleted file mode 100644 index a0447405971..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/grid_sample_kernel.h" - -PD_CUSTOM_KERNEL_REGISTER( - grid_sample, metax_gpu, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu new file mode 100644 index 00000000000..8aae95bdb22 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu @@ -0,0 +1,839 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernels/metax_kernel/metax_context.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +namespace phi { + +template +static __forceinline__ __device__ void AtomicAdd(T* data, + IndexT h, + IndexT w, + IndexT sH, + IndexT sW, + IndexT H, + IndexT W, + T delta) { + if (InBounds(h, w, H, W)) { + phi::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ void AtomicAdd3D(T* data, + IndexT d, + IndexT h, + IndexT w, + IndexT sD, + IndexT sH, + IndexT sW, + IndexT D, + IndexT H, + IndexT W, + T delta) { + if (InBounds3D(d, h, w, D, H, W)) { + phi::CudaAtomicAdd(data + d * sD + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T +UnnormalizeWithMask(T coord, IndexT size, bool align_corners, T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexesWithMask(T in, + IndexT clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +ReflectIndexesWithMask(T in, IndexT twice_low, IndexT twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + IndexT grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + IndexT flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +ComputePositionsWithMask(T coord, + IndexT size, + PaddingMode padding_mode, + bool align_corners, + T* grad_in) { + T grad_clip, grad_refl; + coord = UnnormalizeWithMask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + coord = align_corners ? 
ReflectIndexesWithMask( + coord, 0, 2 * (size - 1), &grad_refl) + : ReflectIndexesWithMask( + coord, -1, 2 * size - 1, &grad_refl); + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + return SafeDownGradeToIntRange(coord); +} + +template +__global__ void GridSamplerCudaBackwardKernel(const IndexT nthreads, + const T* grad_output, + const T* input, + const T* grid, + IndexT n, + IndexT out_c, + IndexT out_h, + IndexT out_w, + IndexT in_h, + IndexT in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sN = out_c * in_h * in_w; + IndexT inp_sC = in_h * in_w; + IndexT inp_sH = in_w; + IndexT inp_sW = 1; + IndexT grid_sN = out_h * out_w * 2; + IndexT grid_sH = out_w * 2; + IndexT grid_sW = 2; + IndexT grid_sCoor = 1; + + IndexT gOut_sN = out_c * out_h * out_w; + IndexT gOut_sC = out_h * out_w; + IndexT gOut_sH = out_w; + IndexT gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT n = index / (out_h * out_w); + const IndexT grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + + if (mode == Mode::bilinear) { + IndexT ix_nw = static_cast(floor(ix)); + IndexT iy_nw = static_cast(floor(iy)); + IndexT ix_ne = ix_nw + 1; + IndexT iy_ne = iy_nw; + IndexT ix_sw = ix_nw; + IndexT iy_sw = iy_nw + 1; + IndexT ix_se = ix_nw + 1; + IndexT iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + IndexT gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + IndexT inp_offset_NC = n * inp_sN; + for (IndexT c = 0; c < out_c; ++c, + inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, + gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd( + gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut); + AtomicAdd( + gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut); + + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } + } else if 
(mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::nearbyint(ix)); + IndexT iy_nearest = static_cast(std::nearbyint(iy)); + + IndexT gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + AtomicAdd(gInp_ptr_NC, + iy_nearest, + ix_nearest, + inp_sH, + inp_sW, + in_h, + in_w, + grad_output[gOut_offset]); + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } + } +} + +template +__global__ void GridSampler3DCudaBackwardKernel(const IndexT nthreads, + const T* grad_output, + const T* input, + const T* grid, + IndexT out_c, + IndexT out_d, + IndexT out_h, + IndexT out_w, + IndexT in_d, + IndexT in_h, + IndexT in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sW = 1; + IndexT inp_sH = in_w; + IndexT inp_sD = in_h * in_w; + IndexT inp_sC = in_d * inp_sD; + IndexT inp_sN = out_c * inp_sC; + + IndexT grid_sCoor = 1; + IndexT grid_sW = 3; + IndexT grid_sH = out_w * grid_sW; + IndexT grid_sD = out_h * grid_sH; + IndexT grid_sN = out_d * grid_sD; + + IndexT gOut_sW = 1; + IndexT gOut_sH = out_w; + IndexT gOut_sD = out_h * out_w; + IndexT gOut_sC = out_d * gOut_sD; + IndexT gOut_sN = out_c * gOut_sC; + + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT d = (index / (out_h * out_w)) % out_d; + const IndexT n = index / (out_d * out_h * out_w); + const auto grid_offset = + n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z coordinates from grid + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + T iz = grid[grid_offset + 2 * grid_sCoor]; + + // multipliers for gradients on ix, iy, and iz + T gix_mult, giy_mult, giz_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + iz = ComputePositionsWithMask( + iz, in_d, padding_mode, align_corners, &giz_mult); + + if (mode == Mode::bilinear) { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + IndexT ix_tnw = static_cast(std::floor(ix)); + IndexT iy_tnw = static_cast(std::floor(iy)); + IndexT iz_tnw = static_cast(std::floor(iz)); + + IndexT ix_tne = ix_tnw + 1; + IndexT iy_tne = iy_tnw; + IndexT iz_tne = iz_tnw; + + IndexT ix_tsw = ix_tnw; + IndexT iy_tsw = iy_tnw + 1; + IndexT iz_tsw = iz_tnw; + + IndexT ix_tse = ix_tnw + 1; + IndexT iy_tse = iy_tnw + 1; + IndexT iz_tse = iz_tnw; + + IndexT ix_bnw = ix_tnw; + IndexT iy_bnw = iy_tnw; + IndexT iz_bnw = iz_tnw + 1; + + IndexT ix_bne = ix_tnw + 1; + IndexT iy_bne = iy_tnw; + IndexT iz_bne = iz_tnw + 1; + + IndexT ix_bsw = ix_tnw; + IndexT iy_bsw = iy_tnw + 1; + IndexT iz_bsw = iz_tnw + 1; + + IndexT ix_bse = ix_tnw + 1; + IndexT iy_bse = iy_tnw + 1; + IndexT iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + T bne = (ix - ix_tsw) * (iy_tsw - iy) * 
(iz - iz_tsw); + T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + T gix = static_cast(0), giy = static_cast(0), + giz = static_cast(0); + IndexT gOut_offset = + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + IndexT inp_offset_NC = n * inp_sN; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; ++c, + gOut_offset += gOut_sC, + gInp_ptr_NC += inp_sC, + inp_offset_NC += inp_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd3D(gInp_ptr_NC, + iz_tnw, + iy_tnw, + ix_tnw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tnw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tne, + iy_tne, + ix_tne, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tne * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tsw, + iy_tsw, + ix_tsw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tsw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tse, + iy_tse, + ix_tse, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tse * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bnw, + iy_bnw, + ix_bnw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bnw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bne, + iy_bne, + ix_bne, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bne * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bsw, + iy_bsw, + ix_bsw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bsw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bse, + iy_bse, + ix_bse, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bse * gOut); + + // calculate grad_grid + if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) { + T tnw_val = input[inp_offset_NC + iz_tnw * inp_sD + iy_tnw * inp_sH + + ix_tnw * inp_sW]; + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; + } + if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) { + T tne_val = input[inp_offset_NC + iz_tne * inp_sD + iy_tne * inp_sH + + ix_tne * inp_sW]; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; + } + if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) { + T tsw_val = input[inp_offset_NC + iz_tsw * inp_sD + iy_tsw * inp_sH + + ix_tsw * inp_sW]; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; + } + if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) { + T tse_val = input[inp_offset_NC + iz_tse * inp_sD + iy_tse * inp_sH + + ix_tse * inp_sW]; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; + } + if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) { + T bnw_val = input[inp_offset_NC + iz_bnw * inp_sD + iy_bnw * inp_sH + + ix_bnw * inp_sW]; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; + } + if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) { + T bne_val = input[inp_offset_NC + iz_bne * inp_sD + iy_bne * inp_sH + + ix_bne * inp_sW]; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; + } + if (InBounds3D(iz_bsw, 
iy_bsw, ix_bsw, in_d, in_h, in_w)) { + T bsw_val = input[inp_offset_NC + iz_bsw * inp_sD + iy_bsw * inp_sH + + ix_bsw * inp_sW]; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; + } + if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) { + T bse_val = input[inp_offset_NC + iz_bse * inp_sD + iy_bse * inp_sH + + ix_bse * inp_sW]; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; + } + } + if (grad_grid != nullptr) { + T* gGrid_ptr_NDHW = grad_grid + index * grid_sW; + gGrid_ptr_NDHW[0] = gix_mult * gix; + gGrid_ptr_NDHW[1] = giy_mult * giy; + gGrid_ptr_NDHW[2] = giz_mult * giz; + } + } else if (mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::round(ix)); + IndexT iy_nearest = static_cast(std::round(iy)); + IndexT iz_nearest = static_cast(std::round(iz)); + + // assign nearest neighbor pixel value to output pixel + IndexT gOut_offset = + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; + ++c, gOut_offset += gOut_sC, gInp_ptr_NC += inp_sC) { + AtomicAdd3D(gInp_ptr_NC, + iz_nearest, + iy_nearest, + ix_nearest, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + grad_output[gOut_offset]); + } + if (grad_grid != nullptr) { + T* gGrid_ptr_NDHW = grad_grid + index * grid_sW; + gGrid_ptr_NDHW[0] = static_cast(0); + gGrid_ptr_NDHW[1] = static_cast(0); + gGrid_ptr_NDHW[2] = static_cast(0); + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + if (out_grad.numel() == 0) { + if (x_grad) { + phi::Full( + dev_ctx, phi::IntArray(common::vectorize(x_grad->dims())), 0, x_grad); + } + if (grid_grad) { + phi::Full(dev_ctx, + phi::IntArray(common::vectorize(grid_grad->dims())), + 0, + grid_grad); + } + return; + } + + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + +#ifndef PADDLE_WITH_HIP + if (condCudnnGridSampler(x, grid) && + enum_padding_mode == PaddingMode::zeros && enum_mode == Mode::bilinear && + align_corners) { + const int64_t N = x.dims()[0]; + const int64_t C = x.dims()[1]; + const int64_t H_in = x.dims()[2]; + const int64_t W_in = x.dims()[3]; + const int64_t H_out = grid.dims()[1]; + const int64_t W_out = grid.dims()[2]; + + // cuDNN handle + cudnnHandle_t handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + // Create and set Tensor descriptors (NCHW) for x/y + cudnnTensorDescriptor_t x_desc, dx_desc, y_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&dx_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&y_desc)); + + const cudnnDataType_t cudnn_dtype = + std::is_same::value ? 
CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(x_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + // The shape of dx is consistent with that of x + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(dx_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + // The shape of y is consistent with out_grad + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(y_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out))); + + // Spatial Transformer descriptor: specifies sampler type and output + // dimension (N, C, H_out, W_out) + cudnnSpatialTransformerDescriptor_t st_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateSpatialTransformerDescriptor(&st_desc)); + int st_dims[4] = {static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out)}; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + st_desc, CUDNN_SAMPLER_BILINEAR, cudnn_dtype, 4, st_dims)); + + // data pointer + const T* x_data = x.data(); + const T* grid_data = grid.data(); + const T* dy_data = out_grad.data(); + + T* dx_data = dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* dgrid_data = nullptr; + if (grid_grad) { + dgrid_data = dev_ctx.template Alloc(grid_grad); + } + + // alpha/beta + using AlphaBetaT = typename std:: + conditional::value, float, double>::type; + const AlphaBetaT one = static_cast(1.0); + const AlphaBetaT zero = static_cast(0.0); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfSamplerBackward( + handle, + st_desc, + static_cast(&one), // alpha (for dx) + x_desc, + static_cast(x_data), + static_cast(&zero), // beta (for dx) + dx_desc, + static_cast(dx_data), + static_cast(&one), // alpha (for dgrid) + y_desc, + static_cast(dy_data), + static_cast(grid_data), + static_cast(&zero), // beta (for dgrid) + static_cast(dgrid_data))); + + // resource release + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroySpatialTransformerDescriptor(st_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(dx_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(y_desc)); + return; + } +#endif + + bool use_int32_index = x.numel() <= std::numeric_limits::max() && + grid.numel() <= std::numeric_limits::max() && + out_grad.numel() <= std::numeric_limits::max(); + + if (x.dims().size() == 4) { + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int64_t count = n * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSamplerCudaBackwardKernel \ + <<>>( \ + count, \ + out_grad.data(), \ + x.data(), \ + grid.data(), \ + n, \ + c, \ + 
out_h, \ + out_w, \ + in_h, \ + in_w, \ + x_grad->data(), \ + grid_grad_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners); + if (use_int32_index) { + LAUNCH_KERNEL(int32_t) + } else { + LAUNCH_KERNEL(int64_t) + } +#undef LAUNCH_KERNEL + } else { + const int64_t out_d = grid.dims()[1]; + const int64_t out_h = grid.dims()[2]; + const int64_t out_w = grid.dims()[3]; + const int64_t n = x.dims()[0]; + const int64_t c = x.dims()[1]; + const int64_t in_d = x.dims()[2]; + const int64_t in_h = x.dims()[3]; + const int64_t in_w = x.dims()[4]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int64_t count = static_cast(n * out_d * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSampler3DCudaBackwardKernel \ + <<>>( \ + count, \ + out_grad.data(), \ + x.data(), \ + grid.data(), \ + c, \ + out_d, \ + out_h, \ + out_w, \ + in_d, \ + in_h, \ + in_w, \ + x_grad->data(), \ + grid_grad_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners); + if (use_int32_index) { + LAUNCH_KERNEL(int32_t) + } else { + LAUNCH_KERNEL(int64_t) + } +#undef LAUNCH_KERNEL + } +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(grid_sample_grad, + metax_gpu, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu new file mode 100644 index 00000000000..71050c264c6 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu @@ -0,0 +1,527 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "kernels/metax_kernel/metax_context.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" +#include "paddle/phi/kernels/grid_sample_kernel.h" + +namespace phi { + +template +static __forceinline__ __device__ T Unnormalize(T coord, + IndexT size, + bool align_corners) { + return align_corners ?
((coord + 1.f) / 2) * (size - 1) + : ((coord + 1.f) * size - 1) / 2; +} + +template +static __forceinline__ __device__ T ClipIndexes(T in, IndexT max_value) { + return min(static_cast(max_value - 1), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T ReflectIndexes(T in, + IndexT twice_low, + IndexT twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + IndexT flips = floor(in / span); + return (flips & 1) ? span - extra + min : extra + min; // cond ? odd : even +} + +template +static __forceinline__ __device__ T ComputePositions(T coord, + IndexT size, + PaddingMode padding_mode, + bool align_corners) { + coord = Unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexes(coord, size); + } else if (padding_mode == PaddingMode::reflect) { + coord = align_corners ? ReflectIndexes(coord, 0, 2 * (size - 1)) + : ReflectIndexes(coord, -1, 2 * size - 1); + coord = ClipIndexes(coord, size); + } + return SafeDownGradeToIntRange(coord); +} + +template +__global__ void GridSampleCudaKernel(IndexT n, + IndexT out_c, + IndexT out_hw, + IndexT in_h, + IndexT in_w, + const T* __restrict__ input, + const T* __restrict__ grid, + T* __restrict__ output, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT nthreads = n * out_hw; + IndexT inp_sN = out_c * (in_h * in_w); + IndexT inp_sC = in_h * in_w; + IndexT inp_sH = in_w; + IndexT inp_sW = 1; + IndexT grid_sNHW = 2; + IndexT grid_sCoor = 1; + IndexT out_sN = out_c * out_hw; + IndexT out_sC = out_hw; + IndexT out_sHW = 1; + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT hw = index % out_hw; + const IndexT n = index / out_hw; + const IndexT grid_offset = index * grid_sNHW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + if (mode == Mode::bilinear) { + IndexT ix_nw = floor(ix); + IndexT iy_nw = floor(iy); + IndexT ix_ne = ix_nw + 1; + IndexT iy_ne = iy_nw; + IndexT ix_sw = ix_nw; + IndexT iy_sw = iy_nw + 1; + IndexT ix_se = ix_nw + 1; + IndexT iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + IndexT inp_offset_NC = n * inp_sN; + T* out_ptr_NCHW = output + (n * out_sN + hw * out_sHW); + + for (IndexT c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + T value{0}; + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + value += input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + value += input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + value += input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + value += input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + *out_ptr_NCHW = value; + } + } else if (mode == Mode::nearest) { + IndexT ix_nearest = std::nearbyint(ix); + IndexT iy_nearest = std::nearbyint(iy); + IndexT inp_offset_NC = n * inp_sN; + T* out_ptr_NCHW = output + (n * out_sN + hw * out_sHW); + for (IndexT c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if 
(InBounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +__global__ void GridSample3DCudaKernel(const IndexT nthreads, + IndexT out_c, + IndexT out_d, + IndexT out_h, + IndexT out_w, + IndexT in_d, + IndexT in_h, + IndexT in_w, + const T* input, + const T* grid, + T* output, + const Mode interpolation_mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sW = 1; + IndexT inp_sH = in_w; + IndexT inp_sD = in_h * in_w; + IndexT inp_sC = in_d * inp_sD; + IndexT inp_sN = out_c * inp_sC; + + IndexT grid_sCoor = 1; + IndexT grid_sW = 3; + IndexT grid_sH = out_w * grid_sW; + IndexT grid_sD = out_h * grid_sH; + IndexT grid_sN = out_d * grid_sD; + + IndexT out_sW = 1; + IndexT out_sH = out_w; + IndexT out_sD = out_h * out_w; + IndexT out_sC = out_d * out_sD; + IndexT out_sN = out_c * out_sC; + + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT d = (index / (out_h * out_w)) % out_d; + const IndexT n = index / (out_d * out_h * out_w); + const IndexT grid_offset = + n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + // get the corresponding input x, y, z coordinates from grid + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + T iz = grid[grid_offset + 2 * grid_sCoor]; + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + iz = ComputePositions(iz, in_d, padding_mode, align_corners); + if (interpolation_mode == Mode::bilinear) { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + IndexT ix_tnw = static_cast(std::floor(ix)); + IndexT iy_tnw = static_cast(std::floor(iy)); + IndexT iz_tnw = static_cast(std::floor(iz)); + + IndexT ix_tne = ix_tnw + 1; + IndexT iy_tne = iy_tnw; + IndexT iz_tne = iz_tnw; + + IndexT ix_tsw = ix_tnw; + IndexT iy_tsw = iy_tnw + 1; + IndexT iz_tsw = iz_tnw; + + IndexT ix_tse = ix_tnw + 1; + IndexT iy_tse = iy_tnw + 1; + IndexT iz_tse = iz_tnw; + + IndexT ix_bnw = ix_tnw; + IndexT iy_bnw = iy_tnw; + IndexT iz_bnw = iz_tnw + 1; + + IndexT ix_bne = ix_tnw + 1; + IndexT iy_bne = iy_tnw; + IndexT iz_bne = iz_tnw + 1; + + IndexT ix_bsw = ix_tnw; + IndexT iy_bsw = iy_tnw + 1; + IndexT iz_bsw = iz_tnw + 1; + + IndexT ix_bse = ix_tnw + 1; + IndexT iy_bse = iy_tnw + 1; + IndexT iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + const T* inp_ptr_NC = input + n * inp_sN; + T* out_ptr_NCDHW = + output + (n * out_sN + d * out_sD + h * out_sH + w * out_sW); + for (IndexT c = 0; c < out_c; + ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + *out_ptr_NCDHW = static_cast(0); + if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * + tnw; + } + if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) { + 
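+            // guarded accumulation: the tne corner adds its input value scaled by the trilinear weight tne computed above; each of the eight corners below follows the same pattern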
*out_ptr_NCDHW += + inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * + tne; + } + if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * + tsw; + } + if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * + tse; + } + if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * + bnw; + } + if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * + bne; + } + if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * + bsw; + } + if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * + bse; + } + } + } else if (interpolation_mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::nearbyint(ix)); + IndexT iy_nearest = static_cast(std::nearbyint(iy)); + IndexT iz_nearest = static_cast(std::nearbyint(iz)); + + // assign nearest neighbor pixel value to output pixel + const T* inp_ptr_NC = input + n * inp_sN; + T* out_ptr_NCDHW = + output + (n * out_sN + d * out_sD + h * out_sH + w * out_sW); + for (IndexT c = 0; c < out_c; + ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + if (InBounds3D(iz_nearest, iy_nearest, ix_nearest, in_d, in_h, in_w)) { + *out_ptr_NCDHW = + inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + + ix_nearest * inp_sW]; + } else { + *out_ptr_NCDHW = static_cast(0); + } + } + } + } +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + if (out && out->numel() == 0) { + dev_ctx.template Alloc(out); + return; + } + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + +#ifndef PADDLE_WITH_HIP + if (condCudnnGridSampler(x, grid) && + enum_padding_mode == PaddingMode::zeros && enum_mode == Mode::bilinear && + align_corners) { + const int64_t N = x.dims()[0]; + const int64_t C = x.dims()[1]; + const int64_t H_in = x.dims()[2]; + const int64_t W_in = x.dims()[3]; + const int64_t H_out = grid.dims()[1]; + const int64_t W_out = grid.dims()[2]; + + out->Resize({N, C, H_out, W_out}); + auto* out_data = dev_ctx.template Alloc(out); + + cudnnHandle_t handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + // Create and set Tensor descriptors (NCHW) for x and out + cudnnTensorDescriptor_t x_desc, y_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&y_desc)); + + const cudnnDataType_t cudnn_dtype = + std::is_same::value ? 
CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(x_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(y_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out))); + + // Spatial Transformer descriptor: specifies sampler type and output + // dimension (N, C, H_out, W_out) + cudnnSpatialTransformerDescriptor_t st_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateSpatialTransformerDescriptor(&st_desc)); + int st_dims[4] = {static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out)}; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + st_desc, CUDNN_SAMPLER_BILINEAR, cudnn_dtype, 4, st_dims)); + + const T* x_data = x.data(); + const T* grid_data = grid.data(); + using AlphaBetaT = typename std:: + conditional::value, float, double>::type; + const AlphaBetaT alpha = static_cast(1.0); + const AlphaBetaT beta = static_cast(0.0); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfSamplerForward( + handle, + st_desc, + static_cast(&alpha), + x_desc, + static_cast(x_data), + static_cast(grid_data), + static_cast(&beta), + y_desc, + static_cast(out_data))); + + // resource release + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroySpatialTransformerDescriptor(st_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(y_desc)); + return; + } +#endif + + bool use_int32_index = x.numel() <= std::numeric_limits::max() && + grid.numel() <= std::numeric_limits::max() && + out->numel() <= std::numeric_limits::max(); + + if (x.dims().size() == 4) { + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3]; + + int64_t count = n * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSampleCudaKernel \ + <<>>( \ + n, \ + c, \ + out_h * out_w, \ + in_h, \ + in_w, \ + x.data(), \ + grid.data(), \ + output_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners) + if (use_int32_index) { + LAUNCH_KERNEL(int); + } else { + LAUNCH_KERNEL(int64_t); + } +#undef LAUNCH_KERNEL + } else { + const int64_t n = grid.dims()[0]; + const int64_t out_d = grid.dims()[1]; + const int64_t out_h = grid.dims()[2]; + const int64_t out_w = grid.dims()[3]; + const int64_t c = x.dims()[1]; + const int64_t in_d = x.dims()[2]; + const int64_t in_h = x.dims()[3]; + const int64_t in_w = x.dims()[4]; + + VLOG(3) << "n: " << n << "; c: " << c << "; out_d: " << out_d + << "; out_h: " << out_h << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3] << "; " + << 
out->dims()[4]; + + int64_t count = n * out_d * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSample3DCudaKernel \ + <<>>( \ + count, \ + c, \ + out_d, \ + out_h, \ + out_w, \ + in_d, \ + in_h, \ + in_w, \ + x.data(), \ + grid.data(), \ + output_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners) + if (use_int32_index) { + LAUNCH_KERNEL(int); + } else { + LAUNCH_KERNEL(int64_t); + } +#undef LAUNCH_KERNEL + } +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL( + grid_sample, metax_gpu, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu index eae8c8c0301..d2f39ccf751 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu @@ -35,6 +35,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, const int32_t group_size, DenseTensor* out) { dev_ctx.template Alloc(out); + auto stream = dev_ctx.stream(); const T* x_data = x.data(); const int8_t* weight_data = weight.data(); const T* bias_data = bias ? bias.get().data() : nullptr; @@ -128,7 +129,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, k, n, n}; - mctlass_op(arguments); + mctlass_op(arguments, NULL, stream); } else { mctlassGemmScaleOp_w8a16_bias mctlass_op; typename mctlassGemmScaleOp_w8a16_bias::Arguments arguments{ From 89912995a39f939a582aeb953f761a588c89663d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:02:41 +0800 Subject: [PATCH 45/58] =?UTF-8?q?change=E2=80=94ut=20(#59)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * change_ut --- backends/metax_gpu/tests/default.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 9f073d7e92f..9c989161fed 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -42,7 +42,6 @@ test_shape_op test_tril_triu_op test_slice_op test_elementwise_add_op -test_index_put_op test_bincount_op test_assign_op test_logical_op @@ -73,7 +72,6 @@ test_fractional_max_pool3d_api test_nll_loss test_is_empty_op test_norm_nn_grad -test_index_fill test_floor test_slice_scatter test_nn_matmul_v2_grad @@ -127,10 +125,8 @@ test_flip test_fused_bias_dropout_residual_layer_norm_op test_greater_equal_op test_add_op -test_cartesian_prod test_uniform_random_inplace_op test_feed_fetch_method -test_pow_op test_conv3d_transpose_op test_add_position_encoding_op test_imperative_data_loader_base @@ -223,12 +219,9 @@ test_executor_check_fetch_list test_inplace_softmax_with_cross_entropy test_cos test_imperative_parallel_coalesce_split -test_grid_sample_function -test_rnn_decode_api test_triu_indices_op test_binary_cross_entropy_with_logits_op test_mean_op_v1 -test_round_op test_assign_pos_op_dygraph test_nn_functional_embedding_static test_norm_op @@ -262,7 +255,6 @@ test_diag_v2 test_complex_transpose test_prior_box_op test_square_error_cost -test_fused_rotary_position_embedding test_gru_rnn_op test_restrict_nonzero test_dygraph_weight_norm @@ -295,7 +287,6 @@ test_argsort_op test_layer_norm_op_v2 test_adaptive_max_pool1d test_shard_index_op -test_cuda_max_memory_allocated 
test_roi_align_op test_sin test_take From a770e6f197e8c519712a4a7d2359110d34dc0431 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:50:24 +0800 Subject: [PATCH 46/58] change_ut (#60) * change_ut --------- --- backends/metax_gpu/tests/default.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 9c989161fed..21adad68f5b 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -28,7 +28,6 @@ test_one_hot_v2_op test_fill_any_op test_gather_op test_reshape_op -test_index_put_op test_bitwise_op test_max_op test_pad_op @@ -214,7 +213,6 @@ test_tile_op test_adam_optimizer_fp32_fp64 test_batch_norm_op test_gather_nd_op -test_pow test_executor_check_fetch_list test_inplace_softmax_with_cross_entropy test_cos From 902112bb8707edebefa747e4994384df27c3f356 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 24 Sep 2025 10:05:05 +0800 Subject: [PATCH 47/58] change_ut (#63) * change_ut * change_ut --------- --- backends/metax_gpu/tests/default.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 21adad68f5b..54f0b7c008f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -177,7 +177,6 @@ test_imperative_data_parallel test_sigmoid test_adaptive_max_pool3d test_roll_op -test_index_put_op test_assign_op test_amp_check_finite_and_scale_op test_strided_slice_op From cfe44ce24e2e67c595057e0568b7c34f55c08b0a Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:04:11 +0800 Subject: [PATCH 48/58] [Metax] add keyword filter in CI CMakeLists.txt (#64) * [Metax] add keyword filter in CI CMakeLists.txt * [Metax] add ignore case list --- backends/metax_gpu/tests/CMakeLists.txt | 62 ++++++++++++------------- backends/metax_gpu/tests/ignore.txt | 21 +++++++++ 2 files changed, 50 insertions(+), 33 deletions(-) create mode 100644 backends/metax_gpu/tests/ignore.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index e8b11d347d9..0c84ada4b65 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -9,6 +9,8 @@ set(PADDLE_LEGACY_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) +set(NEED_REMOVE_KEYWORDS "attention") + file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") if(NOT TEST_LIST_FILE) @@ -33,6 +35,20 @@ else() endif() foreach(test_name ${TEST_PROGRAMS}) + set(IS_REMOVE FALSE) + + foreach(keyword ${NEED_REMOVE_KEYWORDS}) + string(FIND "${test_name}" "${keyword}" RES) + if(NOT RES EQUAL -1) + set(IS_REMOVE TRUE) + break() + endif() + endforeach() + + if(IS_REMOVE) + continue() + endif() + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") @@ -44,39 +60,19 @@ endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) if(NOT TEST_LIST_FILE) - list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # 
core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) + set(NEED_IGNORE_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) + if(EXISTS ${NEED_IGNORE_FILE}) + file(STRINGS ${NEED_IGNORE_FILE} NEED_IGNORE_TEST_PROGRAMS) + foreach(test_name ${NEED_IGNORE_TEST_PROGRAMS}) + if(EXISTS ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + else() + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${METAX_UNIT_TEST_PATH}/${test_name}.py) + endif() + endforeach() + endif() endif() if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt new file mode 100644 index 00000000000..b4f1afbe5b0 --- /dev/null +++ b/backends/metax_gpu/tests/ignore.txt @@ -0,0 +1,21 @@ +test_matmul_op_metax +test_sum_op +test_max_op +test_cumsum_op +test_softmax_with_cross_entropy_op +test_softmax_op +test_elementwise_add_op +test_gather_op +test_elementwise_pow_op +test_layer_norm_op +test_index_add_op +test_elementwise_div_op +test_stack_op +test_logical_op +test_mean_op +test_transpose_op +test_randint_op +test_uniform_random_op +test_c_embedding_op +test_slice_op +test_compare_op From 78946fd334dacbdb3f8ba9b07d9273a8462e8512 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Fri, 26 Sep 2025 15:48:08 +0800 Subject: [PATCH 49/58] [metax] modify kernels (#67) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context * modify kernels --- .../fused_conv2d_add_act_kernel_register.cu | 0 .../fused_rope_grad_kernel_register.cu | 0 .../fused_rope_kernel_register.cu | 0 .../kernels/metax_kernel/metax_context.cc | 26 ------------------- .../kernels/metax_kernel/metax_context.h | 3 +-- 5 files changed, 1 insertion(+), 28 deletions(-) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_conv2d_add_act_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_rope_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_rope_kernel_register.cu (100%) diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_conv2d_add_act_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_conv2d_add_act_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index efddba5f00b..0712fb75bbe 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,24 +15,6 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { -const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return true; -}(); - -const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -42,19 +24,11 @@ void DnnWorkspaceHandle::RunFuncSync( void* workspace_ptr = nullptr; size_t size = ((required_workspace_bytes + 255) >> 8) << 8; std::lock_guard guard(*mtx_); -#ifdef PADDLE_WITH_HIP - auto status = hipMalloc(&workspace_ptr, size); -#else auto status = cudaMalloc(&workspace_ptr, size); -#endif if (status == gpuSuccess) { cudnn_func(workspace_ptr); phi::backends::gpu::GpuStreamSync(stream_); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); -#else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); -#endif return; } } diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 2d761439089..7386811a236 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -18,6 +18,7 @@ #include #include "kernels/funcs/blas/cublasLt.h" +#include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -30,8 +31,6 @@ cublasLtHandle_t GetBlasLtHandle(); namespace phi { -bool AllowTF32Cublas(); -bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) From ac78af20874e28a7d5c3f1beed40762c716213bb Mon Sep 17 00:00:00 2001 From: Theendlessofhell <148317258+Theendlessofhell@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:48:59 +0800 Subject: [PATCH 50/58] Fix part of the missing kernel issues (#66) Co-authored-by: root --- .../kernels/cuda_kernels/multinomial_kernel_register.cu | 3 ++- .../kernels/cuda_kernels/take_along_axis_kernel_register.cu | 5 ++++- 
.../metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu | 1 +
 .../kernels/metax_kernel/layer_norm_grad_kernel_register.cu | 1 +
 4 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu
index 622e70728f1..1325fa339b0 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu
@@ -21,6 +21,7 @@ PD_CUSTOM_KERNEL_REGISTER(multinomial,
                           phi::MultinomialKernel,
                           phi::dtype::float16,
                           phi::dtype::bfloat16,
-                          float) {
+                          float,
+                          double) {
   kernel->OutputAt(0).SetDataType(phi::DataType::INT64);
 }
diff --git a/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu
index 4b23b0820fc..b628552aaaf 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu
@@ -25,4 +25,7 @@ PD_CUSTOM_KERNEL_REGISTER(take_along_axis,
                           int64_t,
                           int,
                           phi::dtype::float16,
-                          phi::dtype::bfloat16) {}
+                          phi::dtype::bfloat16,
+                          uint8_t,  // support uint8
+                          int16_t   // support int16
+) {}
diff --git a/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
index 287fa8de41a..ead21b1eb7e 100644
--- a/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
@@ -22,5 +22,6 @@ PD_REGISTER_PLUGIN_KERNEL(addmm,
                           ALL_LAYOUT,
                           phi::AddmmKernel,
                           float,
+                          double,
                           phi::dtype::float16,
                           phi::dtype::bfloat16) {}
diff --git a/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu
index 87c06dab2a4..857dcb6d522 100644
--- a/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu
@@ -115,6 +115,7 @@ PD_REGISTER_PLUGIN_KERNEL(layer_norm_grad,
                           ALL_LAYOUT,
                           phi::LayerNormGradKernel,
                           float,
+                          double,
                           phi::dtype::float16,
                           phi::dtype::bfloat16) {
   if (kernel_key.dtype() == phi::DataType::FLOAT16) {

From 4ce9fe6de10402f04917cae8bd0f83bf499bdf1e Mon Sep 17 00:00:00 2001
From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Date: Fri, 26 Sep 2025 18:18:36 +0800
Subject: [PATCH 51/58] [Metax] fix index_elementwise_get kernel (#68)

* [Metax] add keyword filter in CI CMakeLists.txt

* [Metax] add ignore case list

* [Metax] fix phi::backends::gpu::DnnVersion() symbol not found

* Revert "[Metax] fix phi::backends::gpu::DnnVersion() symbol not found"

This reverts commit 087a9c1240f024210d536e543a2fc55db1175529.
* [Metax] fix index_elementwise_get kernel --- backends/metax_gpu/CMakeLists.txt | 2 +- .../index_elementwise_get_kernel_register.cu | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index bca1ce7aad4..3b74ae39c18 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,7 +326,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu index 5ab3d2a3170..a45a740fc61 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/index_elementwise_get_kernel.h" +#include "paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu" // NOLINT PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, metax_gpu, @@ -27,7 +27,7 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, int64_t, int16_t, uint8_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} From 3c8d0173075d49bef48a909a39f12d325e276f00 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 29 Sep 2025 10:42:05 +0800 Subject: [PATCH 52/58] [metax]fix patch and fix missing kernel (#72) * [metax]fix patch and fix missing kernel --- backends/metax_gpu/CMakeLists.txt | 3 + .../cuda_kernels/adam_kernel_selected_rows.cu | 41 ++++++++++++ .../cuda_kernels/einsum_kernel_register.cu | 16 ++--- .../lars_momentum_kernel_register.cu | 29 +++++++++ .../cuda_kernels/nonzero_kernel_register.cu | 8 ++- .../put_along_axis_kernel_register.cu | 6 +- backends/metax_gpu/patch/paddle.patch | 65 ------------------- 7 files changed, 90 insertions(+), 78 deletions(-) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 3b74ae39c18..5930eaaebd2 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -535,6 +535,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/uniform_random_batch_size_like_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/get_tensor_from_selected_rows_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/batch_norm_kernel.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/empty_kernel.cc @@ -642,6 +643,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/lars_momentum_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_sum_kernel.cu # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps diff --git a/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu b/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu new file mode 100644 index 00000000000..df4105efbd2 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu @@ -0,0 +1,41 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/selected_rows_functor.h" +#include "paddle/phi/kernels/selected_rows/adam_kernel.h" + +PD_CUSTOM_KERNEL_REGISTER(adam_dense_param_sparse_grad, + metax_gpu, + ALL_LAYOUT, + phi::sr::AdamDenseParamSparseGradKernel, + float, + double, + phi::float16) { + // Skip beta1_pow, beta2_pow, skip_update data transform + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(7).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(9).SetBackend(phi::Backend::ALL_BACKEND); + + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32); + } + kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); + kernel->OutputAt(5).SetBackend(phi::Backend::UNDEFINED); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu index 444928af78f..0f613b55e9e 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu @@ -23,10 +23,10 @@ PD_CUSTOM_KERNEL_REGISTER(einsum, phi::EinsumKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(einsum_infer, metax_gpu, @@ -34,7 +34,7 @@ PD_CUSTOM_KERNEL_REGISTER(einsum_infer, phi::EinsumInferKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + 
phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu new file mode 100644 index 00000000000..5647c806bfd --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/lars_momentum_kernel.h" + +PD_CUSTOM_KERNEL_REGISTER(lars_momentum, + metax_gpu, + ALL_LAYOUT, + phi::LarsMomentumKernel, + float, + double, + phi::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu index 1f84b628e84..dc92b2c6d69 100755 --- a/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu @@ -23,11 +23,13 @@ PD_CUSTOM_KERNEL_REGISTER(nonzero, int64_t, int, int16_t, - phi::dtype::float16, - phi::dtype::bfloat16, + phi::float16, + phi::bfloat16, bool, float, - double) { + double, + phi::complex64, + phi::complex128) { kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu index 8ff1f5959ab..ca93a8ca079 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu @@ -23,6 +23,8 @@ PD_CUSTOM_KERNEL_REGISTER(put_along_axis, float, double, int64_t, + uint8_t, + int16_t, int, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index beefb730bf7..4c06609338c 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -869,19 +869,6 @@ index e838778952..83e805e75a 100644 namespace phi { namespace fusion { -diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu -index 4c93778bde..c7bdf8a2cc 100644 ---- a/paddle/phi/kernels/gpu/correlation_kernel.cu -+++ b/paddle/phi/kernels/gpu/correlation_kernel.cu -@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, - int stride2, - int corr_type_multiply, - DenseTensor *out) { -- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; -+ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; - 
PADDLE_ENFORCE_EQ( - is_gpu_place, - true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -897,19 +884,6 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. -diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu -index c2ddfa1347..c6adf5a6de 100644 ---- a/paddle/phi/kernels/gpu/dgc_kernel.cu -+++ b/paddle/phi/kernels/gpu/dgc_kernel.cu -@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, - int buf_size = paddle::communication::dgc::get_buffer_size(k); - phi::Allocator::AllocationPtr tmp_ious_data; - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - tmp_ious_data = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -974,19 +948,6 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" -diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -index 05a977828f..5136608c41 100644 ---- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -+++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, - int64_t seed_int = 0; - if (seed.initialized()) { - const auto& seed_place = seed.place().GetType(); -- bool is_gpu_place = seed_place == phi::AllocationType::GPU; -+ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; - if (is_gpu_place) { - // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would - // not be CUDAPlace in practice. 
This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -1144,32 +1105,6 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" -diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h -index 7b85903776..3f4b298807 100644 ---- a/paddle/phi/kernels/impl/merged_momentum_impl.h -+++ b/paddle/phi/kernels/impl/merged_momentum_impl.h -@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( - params_out[idx], - velocities_out[idx]); - VLOG(10) << "Launch MergedMomentum cpu kernel."; -- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - phi::funcs::ForRange for_range( - static_cast(dev_ctx), params[idx]->numel()); - const auto grad_type = grads[idx]->dtype(); -diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h -index de5bcfc30b..eb2a9714f5 100644 ---- a/paddle/phi/kernels/impl/momentum_kernel_impl.h -+++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h -@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, - regularization_coeff, - param_out, - velocity_out); -- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - funcs::ForRange for_range(dev_ctx, param.numel()); - const auto grad_type = grad.dtype(); - #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h From 7303ae2c86253711559c2fe2f0abbc770541fe5e Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 29 Sep 2025 17:08:34 +0800 Subject: [PATCH 53/58] [metax] modify kernels (#73) * modify kernels --- .../kernels/impl/addmm_kernel_impl.h | 1 + backends/metax_gpu/patch/paddle.patch | 60 ++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h index fb1368b069c..b517b719d49 100644 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h @@ -98,6 +98,7 @@ void AddmmKernel(const Context& dev_ctx, y_dims[0])); dev_ctx.template Alloc(out); + if (out->numel() == 0) return; auto blas = funcs::GetBlas(dev_ctx); // calc broadcast dim diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 4c06609338c..69d714ef6e0 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -438,6 +438,21 @@ index d69eb67d6f..1d8b6e9375 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" +diff --git a/paddle/phi/kernels/funcs/embedding_grad.h b/paddle/phi/kernels/funcs/embedding_grad.h +index 461e6e2474..48a64ae9ce 100644 +--- a/paddle/phi/kernels/funcs/embedding_grad.h ++++ 
b/paddle/phi/kernels/funcs/embedding_grad.h +@@ -143,8 +143,8 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& dev_ctx, + constexpr int kWarpSize = 64; + constexpr int kBlockDimY = 16; + #else +- constexpr int kWarpSize = 32; +- constexpr int kBlockDimY = 32; ++ constexpr int kWarpSize = 64; ++ constexpr int kBlockDimY = 16; + #endif + dim3 threads(kWarpSize, kBlockDimY); + dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu @@ -501,6 +516,49 @@ index 15e1a4a3c3..e4780538d7 100644 #include "paddle/phi/kernels/funcs/im2col.h" namespace phi { +diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h +index e5361b836e..5ad238df08 100644 +--- a/paddle/phi/kernels/funcs/math_cuda_utils.h ++++ b/paddle/phi/kernels/funcs/math_cuda_utils.h +@@ -175,12 +175,12 @@ struct KeyValuePair { + #define WARP_SIZE_WIDTH_MASK 0x3f + typedef u_int64_t warp_mask_t; + #else +-#define FINAL_MASK 0xffffffff +-#define HALF_WARP 16 +-#define WARP_SIZE 32 +-#define WARP_SIZE_WIDTH 5 +-#define WARP_SIZE_WIDTH_MASK 0x1f +-typedef unsigned warp_mask_t; ++#define FINAL_MASK 0xffffffffffffffffUL ++#define HALF_WARP 32 ++#define WARP_SIZE 64 ++#define WARP_SIZE_WIDTH 6 ++#define WARP_SIZE_WIDTH_MASK 0x3f ++typedef u_int64_t warp_mask_t; + #endif + + template +@@ -200,19 +200,13 @@ __inline__ __device__ T BlockReduceSum(T val, warp_mask_t mask) { + static __shared__ T shared[WARP_SIZE]; + int lane = threadIdx.x & WARP_SIZE_WIDTH_MASK; + int wid = threadIdx.x >> WARP_SIZE_WIDTH; +- + val = WarpReduceSum(val, mask); +- +- __syncthreads(); + if (lane == 0) shared[wid] = val; +- + __syncthreads(); +- + // align block_span to warpSize + int block_span = (blockDim.x + warpSize - 1) >> WARP_SIZE_WIDTH; + val = (lane < block_span) ? 
shared[lane] : static_cast(0.0f); + val = WarpReduceSum(val, mask); +- + return val; + } + diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -534,7 +592,7 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 8b0baf5f5f..260482f124 100644 +index 047f52bd91..a05b34d3ba 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; From 8b184a32bd9e02c0d8b405d670a8e888a4522f42 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 29 Sep 2025 18:11:03 +0800 Subject: [PATCH 54/58] [metax] modify kernels (#74) * modify kernels --- .../gpudnn/conv_grad_kernel_register.cu | 37 ++++++++----------- .../kernels/gpudnn/conv_kernel_register.cu | 19 +++++----- .../kernels/gpudnn/conv_transpose_kernel.cu | 15 ++++---- .../depthwise_conv_grad_kernel.cu | 14 +++---- .../metax_kernel/depthwise_conv_kernel.cu | 14 +++---- 5 files changed, 45 insertions(+), 54 deletions(-) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index e4acb2f95b6..2da42c7ff8c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -437,26 +437,22 @@ void ConvCudnnGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(filter_grad); } - // bool has_use_addto = dev_ctx.HasDnnAttr("use_addto"); - bool has_use_addto = "true"; + bool has_use_addto = dev_ctx.HasDnnAttr("use_addto"); VLOG(4) << "GPUContext contains `use_addto`: " << has_use_addto; - // bool use_addto = has_use_addto - // ? PADDLE_GET_CONST(bool, "true") - // : false; - bool use_addto = "true"; + bool use_addto = has_use_addto + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("use_addto")) + : false; std::vector dilations = dilations_t; std::vector strides = strides_t; std::vector paddings = paddings_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - bool has_exhaustive_search = "true"; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); VLOG(4) << "GPUContext contains `exhaustive_search`: " << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, "true") - // : false; - bool exhaustive_search_attr = "true"; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; @@ -835,14 +831,13 @@ void ConvCudnnGradGradKernel( T* transformed_dx = nullptr; std::vector dilations = dilations_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // VLOG(4) << "GPUContext contains `exhaustive_search`: " - // << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - bool exhaustive_search_attr = "true"; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index 0a83b504c76..d6b243c956c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -228,15 +228,16 @@ void ConvCudnnKernel(const Context& dev_ctx, std::vector paddings = paddings_t; std::vector dilations = dilations_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // VLOG(4) << "GPUContext contains `exhaustive_search`: " - // << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - - bool exhaustive_search = FLAGS_cudnn_exhaustive_search; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 532b7af0db4..4049d2f3130 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -260,14 +260,13 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, return; } - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - // bool exhaustive_search = - // FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, diff --git a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu index f2475298963..4e5f881385a 100644 --- a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu @@ -54,14 +54,12 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, return; } - // bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); - // bool fuse_relu = - // has_fuse_relu - // ? 
PADDLE_GET_CONST( - // bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) - // : false; - bool has_fuse_relu = false; - bool fuse_relu = false; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; std::vector strides = strides_t; std::vector paddings = paddings_t; diff --git a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu index 517f26b1c02..d3d6c4a4edd 100644 --- a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu @@ -48,14 +48,12 @@ void DepthwiseConvKernel(const Context& dev_ctx, const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - // bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); - // bool fuse_relu = - // has_fuse_relu - // ? PADDLE_GET_CONST( - // bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) - // : false; - bool has_fuse_relu = false; - bool fuse_relu = false; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; if (channel_last) { PADDLE_ENFORCE_EQ( From 60f0ed637f73305e8f0fbd03917e3c8e2978d1ef Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 11:33:54 +0800 Subject: [PATCH 55/58] [metax] link mccl and fix missing kernel (#76) * [metax] link mccl and fix missing kernel --- backends/metax_gpu/CMakeLists.txt | 7 + .../cross_entropy_bwd_w_downcast.cu | 291 ++++++++++++ .../embedding_grad_add_to_kernel.cu | 27 ++ .../cuda_kernels/gammaln_grad_kernel.cu | 28 ++ .../moe_combine_no_weight_grad_kernel.cu | 25 + .../cuda_kernels/multihead_matmul_kernel.cu | 433 ++++++++++++++++++ backends/metax_gpu/kernels/funcs/generator.cc | 287 ++++++++++++ .../kernels/impl/gammaln_grad_kernel_impl.h | 112 +++++ .../metax_kernel/cudnn_lstm_grad_kernel.cu | 362 +++++++++++++++ .../kernels/metax_kernel/cudnn_lstm_kernel.cu | 428 +++++++++++++++++ backends/metax_gpu/tests/ignore.txt | 4 + 11 files changed, 2004 insertions(+) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu create mode 100644 backends/metax_gpu/kernels/funcs/generator.cc create mode 100644 backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h create mode 100644 backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 5930eaaebd2..2bb282cf54f 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,6 +326,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu + 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/cross_entropy_bwd_w_downcast.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu @@ -728,6 +730,11 @@ target_link_libraries( ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) + +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) + include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( diff --git a/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu b/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu new file mode 100644 index 00000000000..a0d5dfd7a5a --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu @@ -0,0 +1,291 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/cross_entropy_grad_kernel.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "kernels/gpudnn/softmax_gpudnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/softmax.h" + +namespace phi { + +/* + Vectorized wrapper of softmax with cross entropy grad hard label. + Optimized with float4 vectorization for memory coalescing and improved + throughput. 
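+  The aligned fast path loads one float4 (four consecutive elements) per
+  thread, so LaunchOptimizedCrossEntropyGradKernel launches total_elements / 4
+  threads for it and falls back to the warp-level kernel whenever the
+  pointers are not 16-byte aligned or the element count is not a multiple
+  of 4.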
+*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabelVectorized( + LogitT* __restrict__ logits_grad, + const T* __restrict__ loss_grad, + const T* __restrict__ softmax, + const LabelT* __restrict__ labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + // Vectorized load/store with float4 for 128-bit memory transactions + constexpr int VEC_SIZE = 4; + using VecT = typename phi::AlignedVector; + using SoftmaxVecT = typename phi::AlignedVector; + + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t vec_id = tid * VEC_SIZE; + + // Ensure we don't exceed bounds + if (vec_id >= n * dim * d) return; + + // Compute indices for vectorized access + int64_t idx_n = vec_id / (d * dim); + int64_t idx_dim_start = (vec_id / d) % dim; + int64_t idx_d = vec_id % d; + int64_t ids = idx_n * d + idx_d; + + // Load label once per thread + auto lbl = static_cast(labels[ids]); + + if (lbl == ignore_index) { + // Vectorized zero fill for ignore_index + VecT* vec_grad = reinterpret_cast(&logits_grad[vec_id]); + VecT zero_vec; +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + zero_vec.val[i] = static_cast(0.0f); + } + *vec_grad = zero_vec; + return; + } + + // Vectorized load of softmax values + SoftmaxVecT softmax_vec; + const SoftmaxVecT* softmax_ptr = + reinterpret_cast(&softmax[vec_id]); + softmax_vec = *softmax_ptr; + + // Load loss gradient (broadcast across vector elements) + T loss_grad_val = loss_grad[ids]; + + // Vectorized computation + VecT grad_vec; +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + int64_t current_dim = idx_dim_start + i; + if (current_dim < dim) { // Bounds check for partial vectors + float softmax_val = static_cast(softmax_vec.val[i]); + float grad_val; + + if (lbl == current_dim) { + grad_val = (softmax_val - 1.0f) * static_cast(loss_grad_val); + } else { + grad_val = softmax_val * static_cast(loss_grad_val); + } + + grad_vec.val[i] = static_cast(grad_val); + } else { + grad_vec.val[i] = static_cast(0.0f); + } + } + + // Vectorized store + VecT* grad_ptr = reinterpret_cast(&logits_grad[vec_id]); + *grad_ptr = grad_vec; +} + +/* + Specialized kernel for dimensions not divisible by vector size + Uses warp-level primitives for better performance on irregular sizes +*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabelWarp( + LogitT* __restrict__ logits_grad, + const T* __restrict__ loss_grad, + const T* __restrict__ softmax, + const LabelT* __restrict__ labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + const int warps_per_block = 4; + const int threads_per_warp = 32; + const int threads_per_block = warps_per_block * threads_per_warp; + + int tid = blockIdx.x * threads_per_block + threadIdx.x; + int warp_id = threadIdx.x / threads_per_warp; + int lane_id = threadIdx.x % threads_per_warp; + + // Process multiple elements per thread using warp-level parallelism + int64_t elements_per_thread = + (n * dim * d + gridDim.x * threads_per_block - 1) / + (gridDim.x * threads_per_block); + + for (int e = 0; e < elements_per_thread; ++e) { + int64_t idx = tid + e * gridDim.x * threads_per_block; + if (idx >= n * dim * d) break; + + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + auto lbl = static_cast(labels[ids]); + + if (lbl == ignore_index) { + logits_grad[idx] = static_cast(0.0f); + } else if (lbl == idx_dim) { + logits_grad[idx] = + 
static_cast((static_cast(softmax[idx]) - 1.0f) * + static_cast(loss_grad[ids])); + } else { + logits_grad[idx] = + static_cast(static_cast(softmax[idx]) * + static_cast(loss_grad[ids])); + } + } +} + +/* + Optimized kernel selector based on problem size and alignment +*/ +template +void LaunchOptimizedCrossEntropyGradKernel(const GPUContext& dev_ctx, + LogitT* logits_grad, + const T* loss_grad, + const T* softmax, + const LabelT* labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + const int64_t total_elements = n * dim * d; + auto stream = dev_ctx.stream(); + + // Check alignment for vectorized kernel + bool is_aligned = (reinterpret_cast(logits_grad) % 16 == 0) && + (reinterpret_cast(softmax) % 16 == 0) && + (total_elements % 4 == 0); + + if (is_aligned && total_elements >= 1024) { + // Use vectorized kernel for aligned, large problems + constexpr int VEC_SIZE = 4; + const int threads_per_block = 256; + const int vec_elements = total_elements / VEC_SIZE; + const int blocks = + (vec_elements + threads_per_block - 1) / threads_per_block; + + SoftmaxWithCrossEntropyGradHardLabelVectorized + <<>>( + logits_grad, loss_grad, softmax, labels, n, dim, d, ignore_index); + } else { + // Use warp-specialized kernel for irregular sizes + const int warps_per_block = 4; + const int threads_per_block = warps_per_block * 32; + const int blocks = + std::min(1024, + static_cast((total_elements + threads_per_block - 1) / + threads_per_block)); + + SoftmaxWithCrossEntropyGradHardLabelWarp + <<>>( + logits_grad, loss_grad, softmax, labels, n, dim, d, ignore_index); + } +} + +template +void CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel( + const GPUContext& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + int axis, + DenseTensor* logits_grad) { + // PADDLE_ENFORCE_EQ( + // dev_ctx.GetPlace().GetType(), + // phi::AllocationType::GPU, + // common::errors::Unavailable("softmax_with_cross_entropy operator's " + // "CUDA kernel only runs on GPU device.")); + + using LogitT = phi::bfloat16; + const T* loss_grad_data = loss_grad.data(); + DenseTensor* logit_grad = logits_grad; + + LogitT* logit_grad_data = nullptr; + logit_grad_data = dev_ctx.template Alloc(logit_grad); + + const int rank = logit_grad->dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = logit_grad->dims()[axis_v]; + + const int64_t n = phi::funcs::SizeToAxis(axis_v, logit_grad->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims()); + const int64_t remain = d / axis_dim; + + const T* softmax_data = softmax.data(); + const auto* label_data = label.data(); + + // Launch optimized kernel with automatic selection + LaunchOptimizedCrossEntropyGradKernel(dev_ctx, + logit_grad_data, + loss_grad_data, + softmax_data, + label_data, + n, + axis_dim, + remain, + -100); +} + +template +void CrossEntropyWithSoftmaxBwdWithDowncastKernel(const Context& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + DenseTensor* logits_grad) { + constexpr int axis = -1; + if (logits_grad->numel() == 0) { + dev_ctx.template Alloc(logits_grad); + return; + } + auto dtype = label.dtype(); + PD_VISIT_INTEGRAL_TYPES( + dtype, "CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel", ([&] { + CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel( + dev_ctx, label, softmax, loss_grad, axis, logits_grad); + })); +} + +} // namespace phi + 
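+// Registration note: the dtypes listed below are the softmax / loss_grad
+// input dtypes. The logits_grad output is always written as bfloat16,
+// because CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel fixes LogitT to
+// phi::bfloat16; that is the "downcast" the op name refers to.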
+PD_REGISTER_PLUGIN_KERNEL(cross_entropy_with_softmax_bwd_w_downcast, + metax_gpu, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxBwdWithDowncastKernel, + float, + double, + phi::float16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu new file mode 100644 index 00000000000..6b20feee0fd --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu @@ -0,0 +1,27 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/embedding_grad_kernel.h" +#include "paddle/phi/kernels/funcs/embedding_grad.h" +#include "paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(embedding_grad_add_to, + metax_gpu, + ALL_LAYOUT, + phi::EmbeddingGradAddToAddToKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu new file mode 100644 index 00000000000..c6bd53f007f --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernels/impl/gammaln_grad_kernel_impl.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gammaln_grad_kernel.h" + +PD_REGISTER_PLUGIN_KERNEL(gammaln_grad, + metax_gpu, + ALL_LAYOUT, + phi::GammalnGradKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu new file mode 100644 index 00000000000..e6984cf86d2 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_grad_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(moe_combine_no_weight_grad, + metax_gpu, + ALL_LAYOUT, + phi::MoeCombineNoWeightGradKernel, + float, + double, + phi::bfloat16, + phi::float16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu new file mode 100644 index 00000000000..151c929e41c --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu @@ -0,0 +1,433 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "kernels/funcs/blas/blas.h" +#include "paddle/common/errors.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" + +namespace phi { +namespace fusion { + +template +__global__ void transpose(T *src, + T *dst, + const int batch_size, + const int seq_len, + const int head_num, + const int size_per_head) { + int batch_id = blockIdx.x / (head_num * seq_len); + int seq_id = blockIdx.x % seq_len; + int head_id = (blockIdx.x % (head_num * seq_len)) / seq_len; + dst[batch_id * (head_num * seq_len * size_per_head) + + seq_id * head_num * size_per_head + head_id * size_per_head + + threadIdx.x] = src[blockIdx.x * size_per_head + threadIdx.x]; +} + +template +inline __device__ T add_func(T a, T b); + +template <> +__device__ float add_func(float a, float b) { + return a + b; +} + +template <> +__device__ float2 add_func(float2 a, float2 b) { + float2 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; +} + +template <> +__device__ float4 add_func(float4 a, float4 b) { + float4 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; + c.w = a.w + b.w; + return c; +} +#if defined(PADDLE_WITH_CUDA) +template <> +__device__ half2 add_func(half2 a, half2 b) { +#if __CUDA_ARCH__ >= 530 + return __hadd2(a, b); +#else + return half2(__float2half(__half2float(a.x) + __half2float(b.x)), + __float2half(__half2float(b.x) + __half2float(b.y))); +#endif +} + +template <> +__device__ half add_func(half a, half b) { +#if __CUDA_ARCH__ >= 530 + return __hadd(a, b); +#else + return __float2half(__half2float(a) + __half2float(b)); +#endif +} +#endif + +template +__global__ void TransposeQkvKernel(const int H, + const T *input, + const T *bias, + T *output) { + // Input: BxSx3xNxH + // Bias: 3xNxH + 
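+    // (h == head_size / 2: each thread moves one half2, i.e. two fp16
+    // values, so the per-block element budget in the check below is
+    // effectively 1024 * 2.)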
+  // Output: 3xBxNxSxH
+  int n = threadIdx.y;
+  int s = blockIdx.x;
+  int b = blockIdx.y;
+  int m = blockIdx.z;
+
+  const int N = blockDim.y;
+  const int S = gridDim.x;
+  const int B = gridDim.y;
+
+  const int NH = N * H;
+  const int NHS = NH * S;
+  const int in_offset = n * H + m * NH + s * 3 * NH + b * NHS * 3;
+  const int bias_offset = m * NH + n * H;
+  const int out_offset = s * H + n * S * H + b * NHS + m * NHS * B;
+
+  const int i = threadIdx.x;
+  output[out_offset + i] =
+      add_func(input[in_offset + i], bias[bias_offset + i]);
+}
+
+template <typename T>
+void TransQKVWithBias(const int batch,
+                      const int seq_len,
+                      const int head_size,
+                      const int head_num,
+                      const T *input,
+                      const T *bias,
+                      T *output,
+                      gpuStream_t stream);
+
+template <>
+void TransQKVWithBias(const int batch,
+                      const int seq_len,
+                      const int head_size,
+                      const int head_num,
+                      const float *input,
+                      const float *bias,
+                      float *output,
+                      gpuStream_t stream) {
+  // BxSx3xNxH + 3xNxH -> 3xBxNxSxH
+  int scratch_size = batch * head_num * seq_len * seq_len;
+  const dim3 grid(seq_len, batch, 3);
+  // scratch % 4 == 0 to ensure the alignment
+  if (head_size % 4 == 0 && scratch_size % 4 == 0) {
+    const int h = head_size / 4;
+    const float4 *input4 = reinterpret_cast<const float4 *>(input);
+    const float4 *bias4 = reinterpret_cast<const float4 *>(bias);
+    float4 *output4 = reinterpret_cast<float4 *>(output);
+    const dim3 block(h, head_num, 1);
+
+    // limit h * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(h * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024 * 4));
+    TransposeQkvKernel<float4>
+        <<<grid, block, 0, stream>>>(h, input4, bias4, output4);
+  } else if (head_size % 2 == 0 && scratch_size % 2 == 0) {
+    const int h = head_size / 2;
+    const float2 *input2 = reinterpret_cast<const float2 *>(input);
+    const float2 *bias2 = reinterpret_cast<const float2 *>(bias);
+    float2 *output2 = reinterpret_cast<float2 *>(output);
+    const dim3 block(h, head_num, 1);
+    // limit h * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(h * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024 * 2));
+    TransposeQkvKernel<float2>
+        <<<grid, block, 0, stream>>>(h, input2, bias2, output2);
+  } else {
+    const dim3 block(head_size, head_num, 1);
+    // limit head_size * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(head_size * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024));
+    TransposeQkvKernel<float>
+        <<<grid, block, 0, stream>>>(head_size, input, bias, output);
+  }
+}
+
+#if defined(PADDLE_WITH_CUDA)
+template <>
+void TransQKVWithBias(const int batch,
+                      const int seq_len,
+                      const int head_size,
+                      const int head_num,
+                      const phi::float16 *input,
+                      const phi::float16 *bias,
+                      phi::float16 *output,
+                      gpuStream_t stream) {
+  // BxSx3xNxH + 3xNxH -> 3xBxNxSxH
+  int scratch_size = batch * head_num * seq_len * seq_len;
+  const dim3 grid(seq_len, batch, 3);
+  if (head_size % 2 == 0 && scratch_size % 2 == 0) {
+    const int h = head_size / 2;
+    const half2 *input2 = reinterpret_cast<const half2 *>(input);
+    const half2 *bias2 = reinterpret_cast<const half2 *>(bias);
+    half2 *output2 = reinterpret_cast<half2 *>(output);
+    const dim3 block(h, head_num, 1);
+    // limit h * head_num to max block size(1024).
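+    // With half2 vectorization each thread handles two fp16 values, so the
+    // block x-dimension is h = head_size / 2 rather than head_size.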
+    PADDLE_ENFORCE_LE(h * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024 * 2));
+    TransposeQkvKernel<half2>
+        <<<grid, block, 0, stream>>>(h, input2, bias2, output2);
+  } else {
+    const dim3 block(head_size, head_num, 1);
+    const half *input_half = reinterpret_cast<const half *>(input);
+    const half *bias_half = reinterpret_cast<const half *>(bias);
+    half *output_half = reinterpret_cast<half *>(output);
+
+    // limit head_size * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(head_size * head_num,
+                      1024,
+                      common::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num,
+                          head_size,
+                          1024));
+    TransposeQkvKernel<half><<<grid, block, 0, stream>>>(
+        head_size, input_half, bias_half, output_half);
+  }
+}
+#endif
+
+inline int round_up(int seq_len, int multiple = 32) {
+  PADDLE_ENFORCE_GT(
+      multiple,
+      0,
+      common::errors::InvalidArgument(
+          "multiple should be a positive number, but it's (%d)", multiple));
+  return ((seq_len + multiple - 1) / multiple) * multiple;
+}
+
+template <typename T>
+__global__ void broadcast(const T *src,
+                          T *dst,
+                          const int seq_len,
+                          const int head_num) {
+  int batch_id = blockIdx.x / (head_num * seq_len);
+  int dst_offset = blockIdx.x * seq_len;
+  if (threadIdx.x < seq_len) {
+    dst[threadIdx.x + dst_offset] = src[threadIdx.x + batch_id * seq_len];
+  }
+}
+
+template <typename T>
+__global__ void broadcast_batch_head_number(const T *src,
+                                            T *dst,
+                                            const int batch_size,
+                                            const int seq_len,
+                                            const int head_num) {
+  int src_seq_id = blockIdx.x % seq_len;
+  int dst_offset = blockIdx.x * seq_len;
+  if (threadIdx.x < seq_len) {
+    dst[threadIdx.x + dst_offset] = src[threadIdx.x + src_seq_id * seq_len];
+  }
+}
+
+template <typename T, typename Context>
+void MultiheadMatmulKernel(const Context &dev_ctx,
+                           const DenseTensor &input,
+                           const DenseTensor &w,
+                           const DenseTensor &bias,
+                           const paddle::optional<DenseTensor> &bias_qk,
+                           const bool transpose_q,
+                           const bool transpose_k,
+                           const bool transpose_v,
+                           const float alpha,
+                           const int head_number,
+                           DenseTensor *out) {
+  auto *input_d = input.data<T>();
+  auto *w_d = w.data<T>();
+  auto *bias_d = bias.data<T>();
+  auto *bias_qk_d = bias_qk ? bias_qk->data<T>() : nullptr;
+  T scale = static_cast<T>(alpha);
+
+  // compute q*k with eltadd
+  auto stream = dev_ctx.stream();
+  // should be (B * S * hidden)
+  auto input_dims = input.dims();
+  // should be (hidden * 3 * all_head_size)
+  auto w_dims = w.dims();
+  int batch = input_dims[0];
+  int seq_len = input_dims[1];
+  int hidden = input_dims[2];
+  phi::DenseTensor temp_bias_tensor;
+  // if bias_qk is [batch, 1, 1, seq_len], bias_qk_d needs to be broadcast
+  if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
+    VLOG(4) << "Broadcast bias_qk from [batch, 1, 1, seq_len]";
+    temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
+    auto *temp_qk_bias = dev_ctx.template Alloc<T>(
+        &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
+    int grid = batch * head_number * seq_len;
+    int block = round_up(seq_len);
+    broadcast<<<grid, block, 0, stream>>>(
+        bias_qk_d, temp_qk_bias, seq_len, head_number);
+    bias_qk_d = static_cast<const T *>(temp_qk_bias);
+  }
+  // if bias_qk is [1, 1, seq_len, seq_len], bias_qk_d needs to be
+  // broadcast
+  if (bias_qk && bias_qk->numel() == (1 * seq_len * seq_len)) {
+    VLOG(4) << "Broadcast bias_qk from [1, 1, seq_len, seq_len]";
+    temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
+    auto *temp_qk_bias = dev_ctx.template Alloc<T>(
+        &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
+    int grid = batch * head_number * seq_len;
+    int block = round_up(seq_len);
+    broadcast_batch_head_number<<<grid, block, 0, stream>>>(
+        bias_qk_d, temp_qk_bias, batch, seq_len, head_number);
+    bias_qk_d = static_cast<const T *>(temp_qk_bias);
+  }
+  if (!bias_qk) {
+    int size = batch * head_number * seq_len * seq_len;
+    temp_bias_tensor.Resize({size});
+    auto *temp_qk_bias = dev_ctx.template Alloc<T>(
+        &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
+#ifdef PADDLE_WITH_HIP
+    hipMemset(temp_qk_bias, 0, sizeof(float) * size);
+#else
+    cudaMemset(temp_qk_bias, 0, sizeof(float) * size);
+#endif
+    bias_qk_d = static_cast<const T *>(temp_qk_bias);
+  }
+  int all_head_size = w_dims[2];
+  int head_size = all_head_size / head_number;
+
+  out->Resize({batch, seq_len, all_head_size});
+  auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
+  // (B*S, hidden)
+  const phi::DenseTensor input_matrix =
+      phi::ReshapeToMatrix(input, 2 /*x_num_col_dims*/);
+  // (hidden, 3 * all_head_size)
+  const phi::DenseTensor w_matrix =
+      phi::ReshapeToMatrix(w, 1 /*y_num_col_dims*/);
+
+  phi::DenseTensor temp_out_tensor;
+  auto temp_out_dims =
+      common::make_ddim({batch, seq_len, 3, head_number, head_size});
+  temp_out_tensor.Resize(
+      {batch * seq_len, common::product(temp_out_dims) / (batch * seq_len)});
+  auto *temp_out_data = dev_ctx.template Alloc<T>(
+      &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
+
+  // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+  blas.MatMul(input_matrix, w_matrix, &temp_out_tensor);
+  VLOG(2) << "(B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)";
+  // temp_out_tensor.Resize(temp_out_dims);
+
+  phi::DenseTensor multihead_temp_tensor;
+  // B * head_number * S * S * 1 + B * S * 3 * N * H
+  int scratch_size = batch * head_number * seq_len * seq_len * 1;
+  multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
+  auto *multihead_temp_data = dev_ctx.template Alloc<T>(
+      &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
+
+  auto *qkptr = multihead_temp_data;
+  auto *tptr = multihead_temp_data + scratch_size;
+
+  // Do the transpose with bias.
+  // BxSx3xNxH => tptr: 3xBxNxSxH.
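+  // For example, with B=2, S=4, N=8, H=64 the fused GEMM output is viewed as
+  // [2, 4, 3, 8, 64] and rearranged to [3, 2, 8, 4, 64], so each of Q, K and
+  // V becomes a contiguous [B, N, S, H] block for the batched attention.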
+  TransQKVWithBias(batch,
+                   seq_len,
+                   head_size,
+                   head_number,
+                   temp_out_data,
+                   bias_d,
+                   tptr,
+                   stream);
+  if (std::is_same<T, phi::float16>::value) {
+    phi::funcs::MultiheadGPUComputeFunctor<half> multihead_compute_func;
+    multihead_compute_func(dev_ctx,
+                           batch,
+                           seq_len,
+                           head_number,
+                           head_size,
+                           reinterpret_cast<half *>(qkptr),
+                           reinterpret_cast<const half *>(bias_qk_d),
+                           false,
+                           reinterpret_cast<half *>(tptr),
+                           __float2half(static_cast<float>(scale)),
+                           __float2half(0.0));
+  } else {
+    phi::funcs::MultiheadGPUComputeFunctor<T> multihead_compute_func;
+    multihead_compute_func(dev_ctx,
+                           batch,
+                           seq_len,
+                           head_number,
+                           head_size,
+                           qkptr,
+                           bias_qk_d,
+                           false,
+                           tptr,
+                           scale,
+                           T(0.0));
+  }
+
+  int grid = batch * head_number * seq_len;
+  int block = head_size;
+  transpose<T><<<grid, block, 0, stream>>>(
+      tptr, output_d, batch, seq_len, head_number, head_size);
+}
+
+}  // namespace fusion
+}  // namespace phi
+
+#if defined(PADDLE_WITH_CUDA)
+PD_REGISTER_PLUGIN_KERNEL(multihead_matmul,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::fusion::MultiheadMatmulKernel,
+                          float,
+                          phi::float16) {}
+#else
+PD_REGISTER_PLUGIN_KERNEL(multihead_matmul,
+                          metax_gpu,
+                          ALL_LAYOUT,
+                          phi::fusion::MultiheadMatmulKernel,
+                          float) {}
+#endif
diff --git a/backends/metax_gpu/kernels/funcs/generator.cc b/backends/metax_gpu/kernels/funcs/generator.cc
new file mode 100644
index 00000000000..8fcbf474b07
--- /dev/null
+++ b/backends/metax_gpu/kernels/funcs/generator.cc
@@ -0,0 +1,287 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#include "paddle/phi/core/generator.h"
+
+#include <glog/logging.h>
+
+#include <deque>
+#include <memory>
+#include <utility>
+
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/backends/xpu/xpu_info.h"
+#include "paddle/phi/core/enforce.h"
+
+static uint64_t GetRandomSeed() {
+  std::random_device rd;
+  // double has a 53-bit significand, so limit the uint64 seed to 53 bits
+  return ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF;
+}
+
+namespace phi {
+
+const std::shared_ptr<Generator>& DefaultXPUGenerator(int64_t device_id) {
+#if defined(PADDLE_WITH_XPU)
+
+  static int64_t num_xpu_devices = -1;
+  static std::once_flag num_devices_init_flag;
+  static std::deque<std::once_flag> xpu_device_flags;
+  static std::vector<std::shared_ptr<Generator>> default_xpu_generators;
+
+  std::call_once(num_devices_init_flag, []() {
+    num_xpu_devices = phi::backends::xpu::GetXPUDeviceCount();
+    xpu_device_flags.resize(num_xpu_devices);
+    default_xpu_generators.resize(num_xpu_devices);
+  });
+  if (device_id < 0) {
+    PADDLE_THROW(common::errors::InvalidArgument(
+        "xpu device id should be greater than or equal to 0"));
+  }
+
+  std::call_once(xpu_device_flags[device_id], [device_id]() {
+    default_xpu_generators[device_id] =
+        std::make_shared<Generator>(GetRandomSeed(), device_id);
+    VLOG(4) << "initial seed: "
+            << default_xpu_generators[device_id]->GetCurrentSeed();
+  });
+  return default_xpu_generators[device_id];
+#else
+  PADDLE_THROW(common::errors::PermissionDenied(
+      "getDefaultXPUGenerator is only supported in XPU place"));
+#endif
+}
+
+const std::shared_ptr<Generator>& DefaultCUDAGenerator(int64_t device_id) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+  static int64_t num_cuda_devices = -1;
+  static std::once_flag num_devices_init_flag;
+  static std::deque<std::once_flag> cuda_device_flags;
+  static std::vector<std::shared_ptr<Generator>> default_cuda_generators;
+
+  std::call_once(num_devices_init_flag, []() {
+    num_cuda_devices = phi::backends::gpu::GetGPUDeviceCount();
+    cuda_device_flags.resize(num_cuda_devices);
+    default_cuda_generators.resize(num_cuda_devices);
+  });
+  if (device_id < 0) {
+    PADDLE_THROW(common::errors::InvalidArgument(
+        "cuda device id should be greater than or equal to 0"));
+  }
+
+  std::call_once(cuda_device_flags[device_id], [device_id]() {
+    default_cuda_generators[device_id] =
+        std::make_shared<Generator>(GetRandomSeed(), device_id);
+    VLOG(7) << "initial seed: "
+            << default_cuda_generators[device_id]->GetCurrentSeed();
+  });
+  return default_cuda_generators[device_id];
+#else
+  PADDLE_THROW(common::errors::PermissionDenied(
+      "getDefaultCUDAGenerator is only supported in CUDA place"));
+#endif
+}
+
+const std::shared_ptr<Generator>& DefaultCPUGenerator() {
+  static auto default_cpu_generator =
+      std::make_shared<Generator>(GetRandomSeed());
+  return default_cpu_generator;
+}
+
+const std::shared_ptr<Generator>& DefaultCustomDeviceGenerator(
+    const phi::CustomPlace& place) {
+  static std::
+      unordered_map<phi::Place, std::shared_ptr<Generator>, phi::Place::Hash>
+          generators;
+  if (generators.find(place) == generators.end()) {
+    generators.insert({place, std::make_shared<Generator>(GetRandomSeed())});
+  }
+  return generators[place];
+}
+
+using RNGMap = std::unordered_map<std::string, std::shared_ptr<Generator>>;
+
+static RNGMap& GetRandomSeedGeneratorMap() {
+  static auto random_seed_generator_map = RNGMap();
+  return random_seed_generator_map;
+}
+
+const std::shared_ptr<Generator>& SetRandomSeedGenerator(
+    const std::string& name, uint64_t seed) {
+  auto& rng_map = GetRandomSeedGeneratorMap();
+  auto iter = rng_map.find(name);
+  PADDLE_ENFORCE_EQ(iter == rng_map.end(),
+                    true,
+                    common::errors::AlreadyExists(
+                        "%s RandomSeedGenerator already exists", name));
+
+  auto generator = std::make_shared<Generator>(seed);
+  bool emplace_success = rng_map.emplace(name, generator).second;
+  PADDLE_ENFORCE_EQ(
+      emplace_success,
+      true,
+      common::errors::PermissionDenied(
+          "SetRandomSeedGenerator cannot emplace %s RandomSeedGenerator",
+          name));
+  return rng_map[name];
+}
+
+const std::shared_ptr<Generator>& GetRandomSeedGenerator(
+    const std::string& name) {
+  auto& rng_map = GetRandomSeedGeneratorMap();
+  auto iter = rng_map.find(name);
+  PADDLE_ENFORCE_EQ(iter != rng_map.end(),
+                    true,
+                    common::errors::NotFound(
+                        "%s RandomSeedGenerator is not found, please "
+                        "use `set_random_seed_generator` to set rng first",
+                        name));
+  return iter->second;
+}
+
+// There are 3 conditions:
+// (1) op seed is set, use op seed.
+// (2) op seed is not set, global seed is set, use global seed.
+// (3) op seed is not set, global seed is not set too, use random seed from
+// RandomGenerator.
+std::shared_ptr<std::mt19937_64> GetCPURandomEngine(uint64_t seed) {
+  if (seed == 0) {
+    VLOG(4) << "Use random cpu_engine from generator";
+    return DefaultCPUGenerator()->GetCPUEngine();
+  } else {
+    // NOTE(zhiqiu): creating a cpu_engine instance every time instead of
+    // using OpDefaultCPUEngine(); this is the legacy behavior of random
+    // operators. The benefit is that when running PE with a fixed seed in
+    // multiple threads, each thread has its own cpu_engine and they do not
+    // affect each other.
+    //
+    // And we need to measure the determinacy of Generator in PE.
+    auto cpu_engine = std::make_shared<std::mt19937_64>();
+    static std::mutex mu_;
+    {
+      std::lock_guard<std::mutex> lock(mu_);
+      cpu_engine->seed(seed);
+    }
+    return cpu_engine;
+  }
+}
+
+inline void Generator::print_state_info() {
+  VLOG(7) << "Generator Random state "
+          << "device id: " << state().device << ", seed: " << state().seed
+          << ", offset: " << state().offset << ", cpu_engine: " << cpu_engine();
+}
+
+Generator::Generator() {
+  auto seed = GetRandomSeed();
+  current_index = states_.size();
+  states_.emplace_back(-1, seed);
+  print_state_info();
+}
+
+Generator::Generator(uint64_t seed) {
+  current_index = states_.size();
+  states_.emplace_back(-1, seed);
+  print_state_info();
+}
+
+Generator::Generator(uint64_t seed, int64_t device_id) {
+  current_index = states_.size();
+  // device id first, then seed
+  states_.emplace_back(device_id, seed);
+  print_state_info();
+}
+
+phi::Generator::GeneratorState Generator::GetState() { return state(); }
+
+void Generator::SetState(const phi::Generator::GeneratorState& state) {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (current_index < states_.size())
+    states_[current_index] = state;
+  else
+    PADDLE_THROW(common::errors::NotFound("Generator index is not found"));
+  print_state_info();
+}
+
+uint64_t Generator::GetStateIndex() { return current_index; }
+
+void Generator::SetStateIndex(uint64_t StateIndex) {
+  std::lock_guard<std::mutex> lock(mu_);
+  if (current_index < states_.size())
+    current_index = StateIndex;
+  else
+    PADDLE_THROW(common::errors::NotFound("Generator index is not found"));
+}
+
+uint64_t Generator::RegisterStateIndex(const GeneratorState& state) {
+  std::lock_guard<std::mutex> lock(mu_);
+  auto new_index = states_.size();
+  states_.push_back(state);
+  current_index = new_index;
+  return new_index;
+}
+
+inline Generator::GeneratorState& Generator::state() {
+  if (current_index < states_.size())
+    return states_[current_index];
+  else
+    PADDLE_THROW(common::errors::NotFound("Generator index is not found"));
+}
+
+inline std::shared_ptr<std::mt19937_64> Generator::cpu_engine() {
+  return state().cpu_engine;
+}
+
+uint64_t Generator::GetCurrentSeed() {
+  std::lock_guard<std::mutex> lock(mu_);
+  return state().seed;
+}
+
+uint64_t Generator::Seed() {
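+  // Draws a fresh random seed, reseeds the current state, and returns it.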
+  std::lock_guard<std::mutex> lock(mu_);
+  uint64_t seed = GetRandomSeed();
+  state().reset(seed);
+  return seed;
+}
+
+void Generator::SetCurrentSeed(uint64_t seed) {
+  std::lock_guard<std::mutex> lock(mu_);
+  state().reset(seed);
+}
+
+std::shared_ptr<std::mt19937_64> Generator::GetCPUEngine() {
+  return cpu_engine();
+}
+
+uint64_t Generator::Random64() {
+  std::lock_guard<std::mutex> lock(mu_);
+  auto current_engine = cpu_engine();
+  return (*current_engine)();
+}
+
+std::pair<uint64_t, uint64_t> Generator::IncrementOffset(uint64_t increment) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU)
+  std::lock_guard<std::mutex> lock(mu_);
+  uint64_t offset = state().offset;
+  state().offset = offset + increment;
+  print_state_info();
+  return std::make_pair(state().seed, offset);
+#else
+  PADDLE_THROW(common::errors::PermissionDenied(
+      "Increment Offset is only supported in CUDA place"));
+#endif
+}
+
+}  // namespace phi
diff --git a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h
new file mode 100644
index 00000000000..2b222ba3b2c
--- /dev/null
+++ b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/amp_type_traits.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+
+namespace phi {
+template <typename T>
+HOSTDEVICE T digamma_positive_domain(T x) {
+  constexpr T c = T{8.5};
+  constexpr T euler_mascheroni = T{0.57721566490153286060};
+  T r;
+  T value;
+  T x2;
+
+  if (x <= T{0.000001}) {
+    value = -euler_mascheroni - T{1.0} / x + T{1.6449340668482264365} * x;
+    return value;
+  }
+
+  value = T{0.0};
+  x2 = x;
+  while (x2 < c) {
+    value = value - T{1.0} / x2;  // NOLINT
+    x2 = x2 + T{1.0};
+  }
+
+  r = T{1.0} / x2;
+  value = value + std::log(x2) - T{0.5} * r;
+
+  r = r * r;
+
+  value = value -
+          r * (T{1.0} / T{12.0} -
+               r * (T{1.0} / T{120.0} -
+                    r * (T{1.0} / T{252.0} -
+                         r * (T{1.0} / T{240.0} - r * (T{1.0} / T{132.0})))));
+
+  return value;
+}
+
+template <typename T>
+HOSTDEVICE T digamma(T x) {
+  const static T pi = T{3.14159265358979323846};  // NOLINT
+
+  if (x == T{0.0}) {
+    T inf = std::numeric_limits<T>::infinity();
+    return std::signbit(x) ? inf : -inf;
+  } else if (x < T{0.0}) {
+    if (x == std::trunc(x)) {
+      return std::numeric_limits<T>::quiet_NaN();
+    } else {
+      T iptr;
+      T frac_part = std::modf(x, &iptr);
+      return digamma_positive_domain(T{1.0} - x) -
+             pi / std::tan(pi * frac_part);
+    }
+  } else {
+    return digamma_positive_domain(x);
+  }
+}
+
+template <typename T>
+struct GammalnGradFunctor {
+  GammalnGradFunctor(const T* dout, const T* x, T* output, int64_t numel)
+      : dout_(dout), x_(x), output_(output), numel_(numel) {}
+
+  HOSTDEVICE void operator()(int64_t idx) const {
+    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+    const MT mp_dout = static_cast<MT>(dout_[idx]);
+    const MT mp_x = static_cast<MT>(x_[idx]);
+    output_[idx] = static_cast<T>(mp_dout * digamma<MT>(mp_x));
+  }
+
+ private:
+  const T* dout_;
+  const T* x_;
+  T* output_;
+  int64_t numel_;
+};
+template <typename T, typename Context>
+void GammalnGradKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& d_out,
+                       DenseTensor* d_x) {
+  auto numel = d_out.numel();
+  if (d_x && d_x->numel() == 0) {
+    dev_ctx.template Alloc<T>(d_x);
+    return;
+  }
+  auto* dout_data = d_out.data<T>();
+  auto* x_data = x.data<T>();
+  auto* dx_data =
+      dev_ctx.template Alloc<T>(d_x, static_cast<size_t>(numel * sizeof(T)));
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  GammalnGradFunctor<T> functor(dout_data, x_data, dx_data, numel);
+  for_range(functor);
+}
+}  // namespace phi
diff --git a/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu
new file mode 100644
index 00000000000..766d984a25b
--- /dev/null
+++ b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu
@@ -0,0 +1,362 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
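+//
+// cuDNN/MIOpen LSTM backward kernel for the metax_gpu plugin. It computes
+// gradients w.r.t. the input, the initial hidden/cell states and the
+// (flattened) weights, dispatching to the fused v8 API on cuDNN >= 9.0 and
+// to the legacy padded/unpadded paths otherwise.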
+
+#include "kernels/metax_kernel/metax_context.h"  // NOLINT
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cudnn_lstm_grad_kernel.h"
+#include "paddle/phi/kernels/gpu/cudnn_lstm_utils.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void CudnnLSTMGradKernel(
+    const Context &dev_ctx,
+    const DenseTensor &x,
+    const DenseTensor &init_h,
+    const DenseTensor &init_c,
+    const paddle::optional<std::vector<const DenseTensor *>> &weight_list,
+    const paddle::optional<DenseTensor> &sequence_length,
+    const DenseTensor &out,
+    const DenseTensor &reserve,
+    const DenseTensor &state_out,
+    const DenseTensor &out_grad,
+    const DenseTensor &last_h_grad,
+    const DenseTensor &last_c_grad,
+    float dropout_prob,
+    bool is_bidirec,
+    int hidden_size,
+    int num_layers,
+    bool is_test,
+    int seed,
+    DenseTensor *x_grad,
+    DenseTensor *init_h_grad,
+    DenseTensor *init_c_grad,
+    std::vector<DenseTensor *> weight_grad_list) {
+  auto input_dims = x.dims();
+  auto init_h_dims = init_h.dims();
+  auto init_c_dims = init_c.dims();
+
+  auto *init_h_data = init_h.data<T>();
+  auto *init_c_data = init_c.data<T>();
+  auto *out_data = out.data<T>();
+  auto *out_grad_data = out_grad.data<T>();
+  auto *last_h_grad_data = last_h_grad.data<T>();
+  auto *last_c_grad_data = last_c_grad.data<T>();
+
+  auto running_weight_list = *weight_list.get_ptr();
+  int weight_numel = size_sum(running_weight_list);
+  bool continuous = is_continuous<T, std::vector<const DenseTensor *>>(
+      running_weight_list);
+
+  // auto handle = dev_ctx.cudnn_handle();
+  auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace());
+  auto place = dev_ctx.GetPlace();
+  auto stream = dev_ctx.stream();
+  phi::DenseTensor weight_whole;
+  T *weight_data = nullptr;
+
+  if (!continuous) {
+    weight_whole.Resize({weight_numel});
+    dev_ctx.template Alloc<T>(&weight_whole);
+    weight_to_tensor<T>(place, stream, running_weight_list, &weight_whole);
+    weight_data = weight_whole.data<T>();
+  } else {
+    weight_data = const_cast<T *>(running_weight_list[0]->data<T>());
+  }
+
+  phi::DenseTensor weight_grad;
+  phi::funcs::SetConstant<Context, T> zero;
+  weight_grad.Resize({weight_numel});
+  dev_ctx.template Alloc<T>(&weight_grad);
+  zero(dev_ctx, &weight_grad, static_cast<T>(0.0));
+  T *weight_grad_data = weight_grad.data<T>();
+
+  int offset = 0;
+  for (size_t i = 0; i < weight_grad_list.size(); ++i) {
+    size_t len = weight_grad_list[i]->numel();
+    auto dim = weight_grad_list[i]->dims();
+    weight_grad_list[i]
+        ->ShareDataWith(weight_grad.Slice(static_cast<int64_t>(offset),
+                                          static_cast<int64_t>(offset + len)))
+        .Resize(dim);
+    offset += len;
+  }
+
+  x_grad->Resize(input_dims);
+  dev_ctx.template Alloc<T>(x_grad);
+  auto *in_grad_data = x_grad->data<T>();
+
+  if (init_h_grad) {
+    init_h_grad->Resize(init_h_dims);
+    dev_ctx.template Alloc<T>(init_h_grad);
+  }
+  auto *init_h_grad_data = init_h_grad ? init_h_grad->data<T>() : nullptr;
+
+  if (init_c_grad) {
+    init_c_grad->Resize(init_c_dims);
+    dev_ctx.template Alloc<T>(init_c_grad);
+  }
+  auto *init_c_grad_data = init_c_grad ? init_c_grad->data<T>() : nullptr;
+
+  auto running_seq_length = sequence_length.get_ptr();
+  bool has_seq_length = running_seq_length != nullptr;
+  std::vector<int> SequenceLength;
+  if (has_seq_length) {
+    SequenceLength = phi::GetVectorFromTensor<int>(running_seq_length);
+  }
+
+  int seq_length = input_dims[0];
+  int batch_size = x.dims()[1];
+  int input_size = x.dims()[2];
+
+  size_t workspace_size;
+  size_t reserve_size;
+
+  ScopedRNNBase rnn(seq_length,
+                    batch_size,
+                    input_size,
+                    hidden_size,
+                    num_layers,
+                    dropout_prob,
+                    seed,
+                    weight_numel,
+                    true,
+                    is_bidirec);
+
+  rnn.Create<T>(handle,
+                dev_ctx.GetPlace(),
+                SequenceLength,
+                &workspace_size,
+                &reserve_size,
+                const_cast<phi::DenseTensor *>(&state_out));
+
+  phi::DenseTensor workspace_data_;
+  workspace_data_.Resize({static_cast<int64_t>(workspace_size)});
+  dev_ctx.template Alloc<uint8_t>(&workspace_data_);
+  const uint8_t *reserve_data = reserve.data<uint8_t>();
+
+#if CUDNN_VERSION >= 90000
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardData_v8(
+      handle,
+      rnn.rnn_desc(),
+      nullptr,
+      rnn.y_seq_desc(),
+      out_data,
+      out_grad_data,
+      rnn.x_seq_desc(),
+      in_grad_data,
+      rnn.init_h_desc(),
+      init_h_data,
+      last_h_grad_data,
+      init_h_grad_data,
+      rnn.init_c_desc(),
+      init_c_data,
+      last_c_grad_data,
+      init_c_grad_data,
+      rnn.weights_size(),
+      weight_data,
+      workspace_size,
+      workspace_data_.data<uint8_t>(),
+      reserve_size,
+      const_cast<uint8_t *>(reserve_data)));
+
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights_v8(
+      handle,
+      rnn.rnn_desc(),
+      CUDNN_WGRAD_MODE_ADD,
+      nullptr,
+      rnn.x_seq_desc(),
+      x.data<T>(),
+      rnn.init_h_desc(),
+      init_h.data<T>(),
+      rnn.y_seq_desc(),
+      out.data<T>(),
+      rnn.weights_size(),
+      weight_grad_data,
+      workspace_size,
+      workspace_data_.data<uint8_t>(),
+      reserve_size,
+      const_cast<uint8_t *>(reserve_data)));
+#else
+
+  if (!has_seq_length) {
+// This interface is used when the input/output is unpadded.
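+// The HIP branch below goes through MIOpen's RNN backward API, the CUDA
+// branch through the legacy cuDNN v7 API; both take per-timestep descriptor
+// arrays (x_descs/y_descs) built for the full seq_length.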
+#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenRNNBackwardData(handle, + rnn.rnn_desc(), + seq_length, + rnn.y_descs(), + out_data, + rnn.y_descs(), + out_grad_data, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_descs(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_descs(), + out.data(), + rnn.weight_desc(), + weight_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNBackwardData(handle, + rnn.rnn_desc(), + seq_length, + rnn.y_descs(), + out_data, + rnn.y_descs(), + out_grad_data, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_descs(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_descs(), + out.data(), + workspace_data_.data(), + workspace_size, + rnn.weight_desc(), + weight_grad_data, + const_cast(reserve_data), + reserve_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
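+    // Padded path: SequenceLength is set, so the *Ex variants that accept
+    // variable-length (padded) batches are required here.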
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx( + handle, + rnn.rnn_desc(), + rnn.y_seq_desc(), + out_data, + rnn.y_seq_desc(), + out_grad_data, + nullptr, + nullptr, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_seq_desc(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + nullptr, + nullptr, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx( + handle, + rnn.rnn_desc(), + rnn.x_seq_desc(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_seq_desc(), + out.data(), + workspace_data_.data(), + workspace_size, + rnn.weight_desc(), + weight_grad_data, + const_cast(reserve_data), + reserve_size)); +#else + PADDLE_THROW(common::errors::Unavailable( + "The padded input of rnn is supported by cudnnRNNBackwardDataEx, " + "cudnnRNNBackwardWeightsEx, but it only works when the version " + "of cudnn is larger than 7.2.1")); +#endif + } + +#endif // end CUDNN_VERSION >= 90000 +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL( + cudnn_lstm_grad, GPU, ALL_LAYOUT, phi::CudnnLSTMGradKernel, float) {} +#else +PD_REGISTER_PLUGIN_KERNEL(cudnn_lstm_grad, + metax_gpu, + ALL_LAYOUT, + phi::CudnnLSTMGradKernel, + float, + double) {} +#endif diff --git a/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu new file mode 100644 index 00000000000..6bb94c9281a --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu @@ -0,0 +1,428 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
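+//
+// cuDNN/MIOpen LSTM forward kernel for the metax_gpu plugin. On cuDNN >= 9.0
+// it uses the fused cudnnRNNForward entry point; otherwise it falls back to
+// the legacy train/inference paths for padded and unpadded input.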
+ +#include "glog/logging.h" +#include "kernels/metax_kernel/metax_context.h" //NOLINT +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cudnn_lstm_kernel.h" +#include "paddle/phi/kernels/gpu/cudnn_lstm_utils.h" + +namespace phi { + +template +#ifdef PADDLE_WITH_HIP +void LSTMInference(const bool &has_seq_length, + const miopenHandle_t &handle, +#else +void LSTMInference(const bool &has_seq_length, + const cudnnHandle_t &handle, +#endif + const int &seq_length, + ScopedRNNBase *rnn, + const T *x_data, + const T *init_h_data, + const T *init_c_data, + const T *w_data, + T *out_data, + T *last_h_data, + T *last_c_data, + phi::DenseTensor *workspace_data, + const size_t &workspace_size) { +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForward(handle, + rnn->rnn_desc(), + CUDNN_FWD_MODE_INFERENCE, + nullptr, + rnn->x_seq_desc(), + x_data, + rnn->y_seq_desc(), + out_data, + rnn->init_h_desc(), + init_h_data, + last_h_data, + rnn->init_c_desc(), + init_c_data, + last_c_data, + rnn->weights_size(), + w_data, + workspace_size, + workspace_data->data(), + 0, + nullptr)); + +#else + + if (!has_seq_length) { +// for inference +// This interface is used when the input/output is unpadded. +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for inference + // This interface is used when the input/output is padded. 
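+    // Padded inference path: cudnnRNNForwardInferenceEx accepts padded
+    // variable-length batches; the unused optional arguments are nullptr.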
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx( + handle, + rnn->rnn_desc(), + rnn->x_seq_desc(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_seq_desc(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + workspace_data->data(), + workspace_size)); +#else + // CUDNN VERSION has to >=7.2.1 + PADDLE_THROW(common::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardInferenceEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } + +#endif // end CUDNN_VERSION >= 90000 +} + +template +void CudnnLSTMKernel( + const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &init_h, + const DenseTensor &init_c, + const paddle::optional &w, + const paddle::optional> &weight_list, + const paddle::optional &sequence_length, + float dropout_prob, + bool is_bidirec, + int hidden_size, + int num_layers, + bool is_test, + int seed, + DenseTensor *out, + DenseTensor *last_h, + DenseTensor *last_c, + DenseTensor *reserve, + DenseTensor *state_out) { + const T *x_data = x.data(); + const T *init_h_data = init_h.data(); + const T *init_c_data = init_c.data(); + + T *out_data = dev_ctx.template Alloc(out); + T *last_h_data = dev_ctx.template Alloc(last_h); + T *last_c_data = dev_ctx.template Alloc(last_c); + + if (!is_test) { + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. + int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = phi::DefaultCUDAGenerator(device_id); + seed = static_cast(gen_cuda->Random64()); + } + } + + auto *running_sequence_length = sequence_length.get_ptr(); + bool has_seq_length = running_sequence_length != nullptr; + std::vector SequenceLength; + if (has_seq_length) { + SequenceLength = phi::GetVectorFromTensor(running_sequence_length); + } + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + int seq_length = x.dims()[0]; + int batch_size = x.dims()[1]; + int input_size = x.dims()[2]; + bool state_initialized = state_out->initialized() ? true : false; + + size_t workspace_size; + size_t reserve_size; + phi::DenseTensor weight_whole; + T *w_data = nullptr; + int weight_numel; + bool w_initialized = false; + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto *running_w = w.get_ptr(); + if (is_test && running_w != nullptr) { + w_initialized = running_w->initialized() ? true : false; + weight_numel = running_w->numel(); + } + if (!w_initialized) { + auto running_weight_list = *weight_list.get_ptr(); + bool continuous = is_continuous>( + running_weight_list); + weight_numel = size_sum(running_weight_list); + + if (!continuous) { + LOG_FIRST_N(WARNING, 2) + << "If the memory space of the Input WeightList is not continuous, " + "less efficient calculation will be called. 
Please call " + "flatten_parameters() to make the input memory continuous."; + weight_whole.Resize({weight_numel}); + dev_ctx.template Alloc(&weight_whole); + weight_to_tensor(place, stream, running_weight_list, &weight_whole); + w_data = weight_whole.data(); + if (is_test) { // maybe also reset small weights' ptr for training + int offset = 0; + for (size_t i = 0; i < running_weight_list.size(); ++i) { + size_t len = running_weight_list[i]->numel(); + auto dim = running_weight_list[i]->dims(); + const_cast(running_weight_list[i]) + ->ShareDataWith( + weight_whole.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + } + } else { + w_data = const_cast(running_weight_list[0]->data()); + } + } else { + w_data = const_cast(running_w->data()); + } + + ScopedRNNBase rnn(seq_length, + batch_size, + input_size, + hidden_size, + num_layers, + dropout_prob, + seed, + weight_numel, + state_initialized, + is_bidirec); + rnn.Create(handle, + dev_ctx.GetPlace(), + SequenceLength, + &workspace_size, + &reserve_size, + state_out); + + phi::DenseTensor workspace_data_; + workspace_data_.Resize({static_cast(workspace_size)}); + dev_ctx.template Alloc(&workspace_data_); + + reserve->Resize({static_cast(reserve_size)}); + auto *reserve_data = dev_ctx.template Alloc(reserve); + + if (is_test) { + LSTMInference(has_seq_length, + handle, + seq_length, + &rnn, + x_data, + init_h_data, + init_c_data, + w_data, + out_data, + last_h_data, + last_c_data, + &workspace_data_, + workspace_size); + } else { +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForward(handle, + rnn.rnn_desc(), + CUDNN_FWD_MODE_TRAINING, + nullptr, + rnn.x_seq_desc(), + x_data, + rnn.y_seq_desc(), + out_data, + rnn.init_h_desc(), + init_h_data, + last_h_data, + rnn.init_c_desc(), + init_c_data, + last_c_data, + rnn.weights_size(), + w_data, + workspace_size, + workspace_data_.data(), + reserve_size, + reserve_data)); +#else + + if (!has_seq_length) { +// for train +// This interface is used when the input/output is unpadded. +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForwardTraining(handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
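+      // Padded training path: unlike inference, training must also fill the
+      // reserve buffer (reserve_data/reserve_size) consumed later by backward.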
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx( + handle, + rnn.rnn_desc(), + rnn.x_seq_desc(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_seq_desc(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#else + PADDLE_THROW(common::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardTrainingEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } +#endif // end CUDNN_VERSION >= 90000 + } +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(cudnn_lstm, GPU, ALL_LAYOUT, phi::CudnnLSTMKernel, float) { + kernel->InputAt(5).SetDataType(phi::DataType::INT32); + kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); +} +#else +PD_REGISTER_PLUGIN_KERNEL( + cudnn_lstm, metax_gpu, ALL_LAYOUT, phi::CudnnLSTMKernel, float, double) { + kernel->InputAt(5).SetDataType(phi::DataType::INT32); + kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); +} +#endif diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index b4f1afbe5b0..4e54e17b3ef 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -19,3 +19,7 @@ test_uniform_random_op test_c_embedding_op test_slice_op test_compare_op +test_conv3d_transpose_op +test_conv3d_layer +test_conv3d_transpose_part2_op +test_fused_conv2d_add_act_op From cccf6b7e68cbaedd28c666773020d094556ab251 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:12:32 +0800 Subject: [PATCH 56/58] [metax] rename yaml file (#77) * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file --------- --- .github/workflows/metax_work.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index aff530d475c..f14023848c6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -1,4 +1,4 @@ -name: padlle metax gpu test +name: paddle metax gpu test on: workflow_dispatch: From 7a7a7a0590eb0b61be1bd7a911f37dfd521cc2ec Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:30:16 +0800 Subject: [PATCH 57/58] [metax] rm file (#78) * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file --------- --- .../cuda_kernels/gammaln_grad_kernel.cu | 28 ----- .../kernels/impl/gammaln_grad_kernel_impl.h | 112 ------------------ .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 + 3 files changed, 2 insertions(+), 140 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu delete mode 100644 backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu deleted file mode 100644 index c6bd53f007f..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 
2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "kernels/impl/gammaln_grad_kernel_impl.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gammaln_grad_kernel.h" - -PD_REGISTER_PLUGIN_KERNEL(gammaln_grad, - metax_gpu, - ALL_LAYOUT, - phi::GammalnGradKernel, - float, - double, - phi::float16, - phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h deleted file mode 100644 index 2b222ba3b2c..00000000000 --- a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/for_range.h" - -namespace phi { -template -HOSTDEVICE T digamma_positive_domain(T x) { - constexpr T c = T{8.5}; - constexpr T euler_mascheroni = T{0.57721566490153286060}; - T r; - T value; - T x2; - - if (x <= T{0.000001}) { - value = -euler_mascheroni - T{1.0} / x + T{1.6449340668482264365} * x; - return value; - } - - value = T{0.0}; - x2 = x; - while (x2 < c) { - value = value - T{1.0} / x2; // NOLINT - x2 = x2 + T{1.0}; - } - - r = T{1.0} / x2; - value = value + std::log(x2) - T{0.5} * r; - - r = r * r; - - value = value - - r * (T{1.0} / T{12.0} - - r * (T{1.0} / T{120.0} - - r * (T{1.0} / T{252.0} - - r * (T{1.0} / T{240.0} - r * (T{1.0} / T{132.0}))))); - - return value; -} - -template -HOSTDEVICE T digamma(T x) { - const static T pi = T{3.14159265358979323846}; // NOLINT - - if (x == T{0.0}) { - T inf = std::numeric_limits::infinity(); - return std::signbit(x) ? 
inf : -inf; - } else if (x < T{0.0}) { - if (x == std::trunc(x)) { - return std::numeric_limits::quiet_NaN(); - } else { - T iptr; - T frac_part = std::modf(x, &iptr); - return digamma_positive_domain(T{1.0} - x) - - pi / std::tan(pi * frac_part); - } - } else { - return digamma_positive_domain(x); - } -} - -template -struct GammalnGradFunctor { - GammalnGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - using MT = typename phi::dtype::MPTypeTrait::Type; - const MT mp_dout = static_cast(dout_[idx]); - const MT mp_x = static_cast(x_[idx]); - output_[idx] = static_cast(mp_dout * digamma(mp_x)); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; -template -void GammalnGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& d_out, - DenseTensor* d_x) { - auto numel = d_out.numel(); - if (d_x && d_x->numel() == 0) { - dev_ctx.template Alloc(d_x); - return; - } - auto* dout_data = d_out.data(); - auto* x_data = x.data(); - auto* dx_data = - dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); - phi::funcs::ForRange for_range(dev_ctx, numel); - GammalnGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index 2598ce093e6..fa2c9e6e8b7 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -181,6 +181,7 @@ void RnnKernel(const Context &dev_ctx, else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; #else + VLOG(0) << "Leave lstmKernel.11"; gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") rnn_mode = CUDNN_LSTM; @@ -228,6 +229,7 @@ void RnnKernel(const Context &dev_ctx, common::errors::InvalidArgument( "ROCm do not support SequenceLength yet.")); #endif + VLOG(0) << "Leave lstmKernel.12"; std::vector SequenceLength; if (has_seq_length) { SequenceLength = phi::GetVectorFromTensor(sequence_length.get_ptr()); From 5a76d35b53e1f7d970d6b388969ba56ae955dc0d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 17:18:00 +0800 Subject: [PATCH 58/58] metax_fix_ci (#79) * [metax] add Rules --------- --- .github/workflows/metax_work.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index f14023848c6..f73442b6fd5 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -7,6 +7,7 @@ on: branches: [develop, release/**] paths: - "**" + - "Paddle/**" - "!backends/**" - "backends/metax_gpu/**"