Commit adbbf16

[MLU] fuse adam & upgrade interface & scale

1 parent f5dec55 · commit adbbf16

File tree: 9 files changed, +191 −201 lines

backends/mlu/kernels/adam_kernel.cc

Lines changed: 30 additions & 19 deletions
@@ -115,9 +115,7 @@ void AdamKernel(const Context& dev_ctx,
                         "value is:%d.",
                         beta2_pow_out->numel()));
 
-  const phi::DenseTensor* beta1_tensor = nullptr;
-  const phi::DenseTensor* beta2_tensor = nullptr;
-  const phi::DenseTensor* epsilon_tensor = nullptr;
+  Tensor beta1_tensor;
 
   phi::DenseTensor beta1_tmp;
   phi::DenseTensor beta2_tmp;
@@ -128,19 +126,30 @@ void AdamKernel(const Context& dev_ctx,
   epsilon_tmp.Resize({1});
 
   MPDType beta1 = beta1_in.to<MPDType>();
-  dev_ctx.template Alloc<MPDType>(&beta1_tmp);
-  FillMLUTensorWithHostValue<MPDType>(dev_ctx, beta1, &beta1_tmp);
-  beta1_tensor = &beta1_tmp;
 
   MPDType beta2 = beta2_in.to<MPDType>();
-  dev_ctx.template Alloc<MPDType>(&beta2_tmp);
-  FillMLUTensorWithHostValue<MPDType>(dev_ctx, beta2, &beta2_tmp);
-  beta2_tensor = &beta2_tmp;
 
   MPDType epsilon = epsilon_in.to<MPDType>();
-  dev_ctx.template Alloc<MPDType>(&epsilon_tmp);
-  FillMLUTensorWithHostValue<MPDType>(dev_ctx, epsilon, &epsilon_tmp);
-  epsilon_tensor = &epsilon_tmp;
+
+  std::vector<MPDType> parameter_list;
+  parameter_list.push_back(beta1);
+  parameter_list.push_back(beta2);
+  parameter_list.push_back(epsilon);
+
+  Tensor dst;
+  dst.Resize({3});
+  auto dst_place = phi::CustomPlace();
+  C_Device_st device{dst_place.GetDeviceId()};
+  void* dst_ptr = dev_ctx.template Alloc<MPDType>(&dst);
+  auto src_ptr = static_cast<void*>(parameter_list.data());
+  MemCpyH2D(&device, dst_ptr, src_ptr, parameter_list.size() * sizeof(MPDType));
+
+  const void* beta1_tensor_ptr = nullptr;
+  const void* beta2_tensor_ptr = nullptr;
+  const void* epsilon_tensor_ptr = nullptr;
+  beta1_tensor_ptr = dst_ptr,
+  beta2_tensor_ptr = static_cast<char*>(dst_ptr) + sizeof(MPDType);
+  epsilon_tensor_ptr = static_cast<char*>(dst_ptr) + 2 * sizeof(MPDType);
 
   Tensor t_param_in_out, t_grad;
   t_param_in_out.Resize(param.dims());
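This hunk carries the fusion named in the commit title: instead of one device allocation plus one FillMLUTensorWithHostValue per scalar, the kernel packs beta1, beta2, and epsilon into a single host vector, uploads it with one MemCpyH2D into a {3}-element device tensor, and recovers the three device pointers as byte offsets into that buffer. A minimal host-only sketch of the same packing idea, with std::memcpy standing in for the H2D copy and PackAndUpload as a hypothetical helper name:

// Host-only sketch: pack the three Adam scalars contiguously, copy once,
// then address each scalar at a fixed byte offset. In the kernel the
// destination is a device Tensor and the copy is MemCpyH2D.
#include <cstring>
#include <vector>

template <typename T>
std::vector<unsigned char> PackAndUpload(const std::vector<T>& scalars) {
  std::vector<unsigned char> device_buf(scalars.size() * sizeof(T));
  // One contiguous copy instead of one alloc-and-fill per scalar.
  std::memcpy(device_buf.data(), scalars.data(), device_buf.size());
  return device_buf;
}

int main() {
  std::vector<float> parameter_list = {0.9f, 0.999f, 1e-8f};  // beta1, beta2, epsilon
  auto buf = PackAndUpload(parameter_list);
  const void* beta1_ptr = buf.data();
  const void* beta2_ptr = buf.data() + sizeof(float);
  const void* epsilon_ptr = buf.data() + 2 * sizeof(float);
  (void)beta1_ptr;
  (void)beta2_ptr;
  (void)epsilon_ptr;
  return 0;
}

The payoff is one host-to-device transfer per kernel invocation instead of three separate allocation-and-fill round trips.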
@@ -198,11 +207,11 @@ void AdamKernel(const Context& dev_ctx,
                     grad_desc.get(),
                     GetBasePtr(&t_grad),
                     GetBasePtr(&learning_rate),
-                    GetBasePtr(beta1_tensor),
-                    GetBasePtr(beta2_tensor),
+                    beta1_tensor_ptr,
+                    beta2_tensor_ptr,
                     GetBasePtr(beta1_pow),
                     GetBasePtr(beta2_pow),
-                    GetBasePtr(epsilon_tensor),
+                    epsilon_tensor_ptr,
                     /*use_nesterov*/ false);
 
   if (param.dtype() != phi::DataType::FLOAT32) {
@@ -221,7 +230,6 @@ void AdamKernel(const Context& dev_ctx,
                   param_out_desc.get(),
                   GetBasePtr(param_out));
   }
-
   if (!use_global_beta_pow) {
     if (beta1_pow->place().GetType() == phi::AllocationType::CPU &&
         beta2_pow->place().GetType() == phi::AllocationType::CPU) {
@@ -235,7 +243,10 @@ void AdamKernel(const Context& dev_ctx,
       dev_ctx.template Alloc<MPDType>(beta1_pow_out);
       dev_ctx.template Alloc<MPDType>(beta2_pow_out);
 
-      MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
+      beta1_tensor.Resize({1});
+      MLUCnnlTensorDesc beta1_desc(
+          beta1_tensor, CNNL_LAYOUT_ARRAY, ToCnnlDataType<MPDType>());
+
       MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL,
                                       ToCnnlDataType<MPDType>(),
                                       CNNL_NOT_PROPAGATE_NAN);
@@ -245,7 +256,7 @@ void AdamKernel(const Context& dev_ctx,
                         beta1_desc.get(),
                         GetBasePtr(beta1_pow),
                         beta1_desc.get(),
-                        GetBasePtr(beta1_tensor),
+                        beta1_tensor_ptr,
                         beta1_desc.get(),
                         GetBasePtr(beta1_pow_out),
                         ToCnnlDataType<MPDType>());
@@ -255,7 +266,7 @@ void AdamKernel(const Context& dev_ctx,
                         beta1_desc.get(),
                         GetBasePtr(beta2_pow),
                         beta1_desc.get(),
-                        GetBasePtr(beta2_tensor),
+                        beta2_tensor_ptr,
                         beta1_desc.get(),
                         GetBasePtr(beta2_pow_out),
                         ToCnnlDataType<MPDType>());
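The trailing hunks feed the packed pointers into the beta_pow updates: each OpTensor(CNNL_OP_TENSOR_MUL) call maintains the running power of its beta, and the single {1}-shaped beta1_desc is reused for every operand since they all share shape and dtype (beta1_tensor itself is never allocated; it appears to serve only as shape metadata for the descriptor). In scalar form the recurrence the two multiplies implement is just Adam's bias-correction bookkeeping:

// Scalar view of what the two OpTensor(MUL) calls maintain across steps:
// the accumulated powers beta1^t and beta2^t used for bias correction.
#include <cstdio>

int main() {
  const float beta1 = 0.9f, beta2 = 0.999f;
  float beta1_pow = 1.0f, beta2_pow = 1.0f;
  for (int t = 1; t <= 3; ++t) {
    beta1_pow *= beta1;  // beta1_pow_out = beta1_pow * beta1
    beta2_pow *= beta2;  // beta2_pow_out = beta2_pow * beta2
    std::printf("t=%d beta1^t=%.6f beta2^t=%.6f\n", t, beta1_pow, beta2_pow);
  }
  return 0;
}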

backends/mlu/kernels/funcs/mlu_baseop.cc

Lines changed: 54 additions & 26 deletions
@@ -1788,8 +1788,18 @@ NormalizeDesc::~NormalizeDesc() {
     beta_ptr = static_cast<const void*>(&beta_int64);
   }
 
-  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetOpTensorWorkspaceSize(
-      handle, a_desc, b_desc, output_desc, &workspace_size));
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetOpTensorWorkspaceSize_v2(handle,
+                                                             op_tensor_desc,
+                                                             alpha1_ptr,
+                                                             a_desc,
+                                                             a,
+                                                             alpha2_ptr,
+                                                             b_desc,
+                                                             b,
+                                                             beta_ptr,
+                                                             output_desc,
+                                                             output,
+                                                             &workspace_size));
 
   Tensor workspace;
   workspace.Resize({static_cast<int64_t>(workspace_size)});
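The v1 query sized the workspace from the tensor descriptors alone; the _v2 variant also receives the op descriptor, the alpha/beta scaling pointers, and the actual operand pointers, so the returned size can account for the concrete call. A compilable stub sketch of the two shapes; Desc, OpDesc, and both function names are hypothetical stand-ins for the cnnl types and calls, not real API:

#include <cstddef>

struct Desc {};    // stand-in for cnnlTensorDescriptor_t
struct OpDesc {};  // stand-in for cnnlOpTensorDescriptor_t

// v1 shape: descriptors only.
size_t GetWorkspaceSizeV1(const Desc*, const Desc*, const Desc*) { return 0; }

// v2 shape: op descriptor, scaling pointers, and operand pointers too.
size_t GetWorkspaceSizeV2(const OpDesc*,
                          const void* /*alpha1*/, const Desc*, const void* /*a*/,
                          const void* /*alpha2*/, const Desc*, const void* /*b*/,
                          const void* /*beta*/, const Desc*, const void* /*output*/) {
  return 0;
}

int main() { return 0; }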
@@ -1931,16 +1941,16 @@ NormalizeDesc::~NormalizeDesc() {
 
 /* static */ void MLUCnnl::StridedSlice(
     const Context& ctx,
-    const int begin[],
-    const int end[],
-    const int strides[],
+    const int64_t begin[],
+    const int64_t end[],
+    const int64_t strides[],
     const cnnlTensorDescriptor_t input_desc,
     const void* input,
     const cnnlTensorDescriptor_t output_desc,
     void* output) {
   cnnlHandle_t handle = GetHandleFromCTX(ctx);
 
-  PADDLE_ENFORCE_MLU_SUCCESS(cnnlStridedSlice(
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlStridedSlice_v2(
       handle, input_desc, input, begin, end, strides, output_desc, output));
 }
 
@@ -2312,14 +2322,23 @@ NormalizeDesc::~NormalizeDesc() {
     void* index) {
   cnnlHandle_t handle = GetHandleFromCTX(ctx);
 
-  PADDLE_ENFORCE_MLU_SUCCESS(cnnlAdaptivePoolingForward(handle,
-                                                        input_desc,
-                                                        input,
-                                                        pool_mode,
-                                                        output_desc,
-                                                        output,
-                                                        index_desc,
-                                                        index));
+  size_t workspace_size = 0;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetAdaptivePoolingForwardWorkspaceSize(
+      handle, input_desc, pool_mode, output_desc, &workspace_size));
+  Tensor workspace;
+  workspace.Resize({static_cast<int64_t>(workspace_size)});
+  void* workspace_ptr = ctx.Alloc(&workspace, DataType::INT8, workspace_size);
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlAdaptivePoolingForward_v2(handle,
+                                                           input_desc,
+                                                           input,
+                                                           pool_mode,
+                                                           workspace_ptr,
+                                                           workspace_size,
+                                                           output_desc,
+                                                           output,
+                                                           index_desc,
+                                                           index));
 }
 
 /* static */ void MLUCnnl::Pool3D(const Context& ctx,
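Both upgraded call sites follow the same CNNL convention: query the required workspace size, allocate that many bytes as an INT8 tensor, then hand the pointer and size to the _v2 compute call. A generic sketch of the pattern; QueryWorkspaceSize and ComputeV2 are hypothetical stand-ins for the paired cnnl calls:

#include <cstddef>
#include <vector>

// e.g. cnnlGetAdaptivePoolingForwardWorkspaceSize
size_t QueryWorkspaceSize() { return 256; }

// e.g. cnnlAdaptivePoolingForward_v2: uses the scratch buffer internally.
void ComputeV2(void* workspace, size_t workspace_size) {
  (void)workspace;
  (void)workspace_size;
}

int main() {
  size_t workspace_size = QueryWorkspaceSize();
  std::vector<unsigned char> workspace(workspace_size);  // INT8 tensor in the kernel
  ComputeV2(workspace.data(), workspace_size);
  return 0;
}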
@@ -3280,7 +3299,6 @@ NormalizeDesc::~NormalizeDesc() {
     const cnnlTensorDescriptor_t output_desc,
     void* output) {
   cnnlHandle_t handle = GetHandleFromCTX(ctx);
-
   PADDLE_ENFORCE_MLU_SUCCESS(cnnlInterp_v2(handle,
                                            align_corners,
                                            half_pixel_centers,
@@ -3295,7 +3313,7 @@ NormalizeDesc::~NormalizeDesc() {
 
 /* static */ void MLUCnnl::InterpBackward(
     const Context& ctx,
-    const cnnlInterpBackwardMode_t mode,
+    const cnnlInterpMode_t mode,
     const bool align_corners,
     const bool half_pixel_centers,
     const cnnlTensorDescriptor_t input_desc,
@@ -3304,16 +3322,26 @@ NormalizeDesc::~NormalizeDesc() {
     void* output) {
   cnnlHandle_t handle = GetHandleFromCTX(ctx);
 
-  PADDLE_ENFORCE_MLU_SUCCESS(cnnlInterpBackward_v2(handle,
-                                                   align_corners,
-                                                   half_pixel_centers,
-                                                   mode,
-                                                   NULL,
-                                                   true,
-                                                   input_desc,
-                                                   input,
-                                                   output_desc,
-                                                   output));
+  cnnlInterpDescriptor_t interp_desc;
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateInterpDescriptor(&interp_desc));
+
+  cnnlInterpAlgo_t algo;
+  if (align_corners == false && half_pixel_centers == false) {
+    algo = CNNL_INTERP_ALGO_0;
+  } else if (align_corners == false && half_pixel_centers == true) {
+    algo = CNNL_INTERP_ALGO_1;
+  } else if (align_corners == true && half_pixel_centers == false) {
+    algo = CNNL_INTERP_ALGO_3;
+  } else if (align_corners == true && half_pixel_centers == true) {
+    algo = CNNL_INTERP_ALGO_4;
+  }
+  PADDLE_ENFORCE_MLU_SUCCESS(
+      cnnlSetInterpDescriptor_v2(interp_desc, input_desc, mode, algo, NULL));
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlInterpBackward_v3(
+      handle, interp_desc, input_desc, input, output_desc, output));
+
+  PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroyInterpDescriptor(interp_desc));
 }
 
 /* static */ void MLUCnnl::Cast(const Context& ctx,
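The rewritten backward path introduces an explicit create / set / use / destroy descriptor lifecycle, with the (align_corners, half_pixel_centers) pair selecting among CNNL_INTERP_ALGO_0/1/3/4. The kernel destroys the descriptor manually; a hedged RAII sketch of the same lifecycle, where ScopedDesc and the stub functions are hypothetical rather than CNNL API:

struct RawDesc {};
RawDesc* CreateDesc() { return new RawDesc; }  // cf. cnnlCreateInterpDescriptor
void DestroyDesc(RawDesc* d) { delete d; }     // cf. cnnlDestroyInterpDescriptor

class ScopedDesc {
 public:
  ScopedDesc() : desc_(CreateDesc()) {}
  ~ScopedDesc() { DestroyDesc(desc_); }  // runs even on early return or throw
  ScopedDesc(const ScopedDesc&) = delete;
  ScopedDesc& operator=(const ScopedDesc&) = delete;
  RawDesc* get() const { return desc_; }

 private:
  RawDesc* desc_;
};

int main() {
  ScopedDesc desc;
  // set the descriptor, then run the backward interp with desc.get() ...
  return 0;
}

If a PADDLE_ENFORCE_MLU_SUCCESS between the create and destroy calls ever threw, the manual version would leak the descriptor; a wrapper of this shape guarantees cleanup.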

backends/mlu/kernels/funcs/mlu_baseop.h

Lines changed: 4 additions & 4 deletions
@@ -1120,9 +1120,9 @@ class MLUCnnl {
                      void* indices_out);
 
   static void StridedSlice(const Context& ctx,
-                           const int begin[],
-                           const int end[],
-                           const int strides[],
+                           const int64_t begin[],
+                           const int64_t end[],
+                           const int64_t strides[],
                            const cnnlTensorDescriptor_t input_desc,
                            const void* input,
                            const cnnlTensorDescriptor_t output_desc,
@@ -1807,7 +1807,7 @@ class MLUCnnl {
                    void* output);
 
   static void InterpBackward(const Context& ctx,
-                             const cnnlInterpBackwardMode_t mode,
+                             const cnnlInterpMode_t mode,
                              const bool align_corners,
                              const bool half_pixel_centers,
                              const cnnlTensorDescriptor_t input_desc,

backends/mlu/kernels/gather_nd_kernel.cc

Lines changed: 1 addition & 0 deletions
@@ -141,6 +141,7 @@ PD_REGISTER_PLUGIN_KERNEL(gather_nd,
                           mlu,
                           ALL_LAYOUT,
                           custom_kernel::GatherNdKernel,
+                          int,
                           int64_t,
                           float,
                           phi::dtype::float16) {}

backends/mlu/kernels/interpolate_kernel.cc

Lines changed: 1 addition & 1 deletion
@@ -508,7 +508,7 @@ void InterpolateGradKernel(
                                 CNNL_LAYOUT_NHWC,
                                 ToCnnlDataType(transformed_input_grad.dtype()));
   MLUCnnl::InterpBackward(dev_ctx,
-                          GetMLUCnnlInterpBackwardMode(interp_method),
+                          GetMLUCnnlInterpMode(interp_method),
                           align_corners,
                           align_center,
                           input_desc.get(),
