[GCU] update to paddlepaddle develop (#1423)

EnflameGCU · web-flow · commit 77090b642bfe · 2024-10-18T16:08:21.000+08:00
diff --git a/backends/gcu/README.md b/backends/gcu/README.md
@@ -24,12 +24,12 @@ cd backends/gcu
 
 # 2) Before compiling, you need to ensure that the PaddlePaddle installation package is installed in the environment.
 #    Just install the PaddlePaddle CPU version directly.
-python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html
+python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
 
 # 3) Start compiling, and submodules will be downloaded on demand during compilation.
 mkdir -p build && cd build
 export PADDLE_CUSTOM_PATH=`python -c "import re, paddle; print(re.compile('/__init__.py.*').sub('',paddle.__file__))"`
-cmake .. -DWITH_TESTING=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPY_VERSION=3.9
+cmake .. -DWITH_TESTING=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPY_VERSION=3.10
 make -j $(nproc)
 
 # 4) The compiled product is in the build/dist path and installed using pip.
diff --git a/backends/gcu/README_cn.md b/backends/gcu/README_cn.md
@@ -23,12 +23,12 @@ cd PaddleCustomDevice
 cd backends/gcu
 
 # 2) 编译之前需确保环境下装有飞桨安装包，直接安装飞桨CPU版本即可
-python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html
+python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
 
 # 3) 编译，编译时会按需下载submodule
 mkdir -p build && cd build
 export PADDLE_CUSTOM_PATH=`python -c "import re, paddle; print(re.compile('/__init__.py.*').sub('',paddle.__file__))"`
-cmake .. -DWITH_TESTING=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPY_VERSION=3.9
+cmake .. -DWITH_TESTING=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPY_VERSION=3.10
 make -j $(nproc)
 
 # 4) 编译产出在build/dist路径下，使用pip安装
diff --git a/backends/gcu/kernels/activation_kernels.cc b/backends/gcu/kernels/activation_kernels.cc
@@ -791,6 +791,21 @@ void EluKernel(const Context& dev_ctx,
   }
 }
 
+template <typename T, typename Context>
+void RoundKernel(const Context& dev_ctx,
+                 const phi::DenseTensor& x,
+                 const int decimals,  // forwarded unchanged to topsatenRound
+                 phi::DenseTensor* out) {
+  PADDLE_GCU_KERNEL_TRACE("round");
+  if (LaunchAOTKernel()) {
+    dev_ctx.template Alloc<T>(out);  // allocate `out` as T before the launch
+    LAUNCH_TOPSATENOP(topsatenRound, dev_ctx, *out, x, decimals);
+
+  } else {  // kernel impl based on JIT
+    THROW_JIT_UNIMPLEMENTED();  // this kernel provides no JIT path
+  }
+}
+
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(logsigmoid, LogSigmoid)
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(rsqrt, Rsqrt)
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(log2, Log2)
@@ -805,7 +820,6 @@ DEFINE_UNARY_AOT_ACTIVATION_KERNEL(asinh, Asinh)
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(atanh, Atanh)
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(cosh, Cosh)
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(sinh, Sinh)
-DEFINE_UNARY_AOT_ACTIVATION_KERNEL(round, Round)
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(tan, Tan)
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(erf, Erf)
 DEFINE_UNARY_AOT_ACTIVATION_KERNEL(expm1, Expm1)
diff --git a/backends/gcu/kernels/concat_kernel.cc b/backends/gcu/kernels/concat_kernel.cc
@@ -23,6 +23,19 @@ void ConcatKernel(const Context& dev_ctx,
                   const phi::Scalar& axis_scalar,
                   phi::DenseTensor* out) {
   PADDLE_GCU_KERNEL_TRACE("concat");
+  int64_t dim = axis_scalar.to<int64_t>();
+  if (common::contain_unknown_dim(out->dims())) {
+    std::vector<phi::MetaTensor> x_meta_vec;
+    x_meta_vec.reserve(ins.size());
+    std::vector<const phi::MetaTensor*> x_metas(ins.size(), nullptr);
+    for (size_t i = 0; i < ins.size(); ++i) {
+      x_meta_vec.emplace_back(*ins[i]);
+      x_metas[i] = &x_meta_vec[i];
+    }
+    phi::MetaTensor meta_out(*out);
+    phi::ConcatInferMeta(x_metas, dim, &meta_out);
+  }
+
   dev_ctx.template Alloc<T>(out);
 
   if (LaunchAOTKernel()) {
@@ -54,7 +67,7 @@ void ConcatKernel(const Context& dev_ctx,
       }
       in_tensors.emplace_back(CreateTopsatenTensor(tensor));
     }
-    int64_t dim = axis_scalar.to<int64_t>();
+
     if (dim < 0 && !ins.empty()) {
       dim += ins[0]->dims().size();
     }
diff --git a/backends/gcu/kernels/flatten_kernel.cc b/backends/gcu/kernels/flatten_kernel.cc
@@ -17,14 +17,14 @@
 
 namespace custom_kernel {
 template <typename T, typename Context>
-void FlattenInferKernel(const Context& dev_ctx,
-                        const phi::DenseTensor& x,
-                        int start_axis UNUSED,
-                        int stop_axis UNUSED,
-                        phi::DenseTensor* out) {
-  PADDLE_GCU_KERNEL_TRACE("flatten_infer");
+void FlattenKernel(const Context& dev_ctx,
+                   const phi::DenseTensor& x,
+                   int start_axis UNUSED,
+                   int stop_axis UNUSED,
+                   phi::DenseTensor* out) {
+  PADDLE_GCU_KERNEL_TRACE("flatten");
   if (LaunchAOTKernel()) {
-    VLOG(6) << "[HOST_KERNEL] Impl on host for flatten_infer";
+    VLOG(6) << "[HOST_KERNEL] Impl on host for flatten";
     if (x.numel() == 0) {
       return;
     }
@@ -59,18 +59,19 @@ void FlattenInferKernel(const Context& dev_ctx,
 }
 
 template <typename T, typename Context>
-void FlattenKernel(const Context& dev_ctx,
-                   const phi::DenseTensor& x,
-                   int start_axis,
-                   int stop_axis,
-                   phi::DenseTensor* out,
-                   phi::DenseTensor* xshape) {
-  PADDLE_GCU_KERNEL_TRACE("flatten");
+void FlattenWithXShapeKernel(const Context& dev_ctx,
+                             const phi::DenseTensor& x,
+                             int start_axis,
+                             int stop_axis,
+                             phi::DenseTensor* out,
+                             phi::DenseTensor* xshape) {
+  PADDLE_GCU_KERNEL_TRACE("flatten_with_xshape");
   if (LaunchAOTKernel()) {
-    custom_kernel::FlattenInferKernel<T, Context>(
+    custom_kernel::FlattenKernel<T, Context>(
         dev_ctx, x, start_axis, stop_axis, out);
 
   } else {  // kernel impl base on JIT
+    THROW_JIT_UNIMPLEMENTED();
     dev_ctx.template Alloc<T>(out);
     dev_ctx.template Alloc<T>(xshape);
 
@@ -161,10 +162,10 @@ PD_REGISTER_PLUGIN_KERNEL(flatten,
                           int,
                           int64_t) {}
 
-PD_REGISTER_PLUGIN_KERNEL(flatten_infer,
+PD_REGISTER_PLUGIN_KERNEL(flatten_with_xshape,
                           gcu,
                           ALL_LAYOUT,
-                          custom_kernel::FlattenInferKernel,
+                          custom_kernel::FlattenWithXShapeKernel,
                           phi::dtype::bfloat16,
                           phi::dtype::float16,
                           float,
diff --git a/backends/gcu/kernels/funcs/op_utils.cc b/backends/gcu/kernels/funcs/op_utils.cc
@@ -14,14 +14,25 @@
 
 #include "kernels/funcs/op_utils.h"
 
+#include "paddle/common/flags.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 
+PHI_DECLARE_bool(use_stride_kernel);
+
 namespace custom_kernel {
 
 void *GcuDataPtr(const phi::DenseTensor &tensor) {
   if (tensor.initialized()) {
-    return const_cast<void *>(tensor.data());
+    auto contiguous_strides = phi::DenseTensorMeta::calc_strides(tensor.dims());
+    if (FLAGS_use_stride_kernel || tensor.strides() == contiguous_strides) {
+      return const_cast<void *>(tensor.data());  // common path: no tensor copy
+    }
+    auto tensor_tmp = tensor;  // copy: the caller's const tensor keeps its meta
+    auto meta = tensor_tmp.meta();
+    meta.strides = contiguous_strides;  // make the copy report dense strides
+    tensor_tmp.set_meta(meta);
+    return const_cast<void *>(tensor_tmp.data());
   }
   return nullptr;
 }
diff --git a/backends/gcu/kernels/logsumexp_kernel.cc b/backends/gcu/kernels/logsumexp_kernel.cc
@@ -19,15 +19,15 @@ namespace custom_kernel {
 template <typename T, typename Context>
 void LogsumexpKernel(const Context& dev_ctx,
                      const phi::DenseTensor& x,
-                     const std::vector<int64_t>& axis,
+                     const std::vector<int>& axis,
                      bool keepdim,
                      bool reduce_all,
                      phi::DenseTensor* out) {
   PADDLE_GCU_KERNEL_TRACE("logsumexp");
   if (LaunchAOTKernel()) {
     dev_ctx.template Alloc<T>(out);
 
-    auto reduce_axis = axis;
+    std::vector<int64_t> reduce_axis(axis.begin(), axis.end());
     int64_t rank = x.dims().size();
     if (reduce_all || reduce_axis.empty()) {
       reduce_axis.assign(rank, 0);
diff --git a/backends/gcu/kernels/reshape_kernel.cc b/backends/gcu/kernels/reshape_kernel.cc
@@ -147,10 +147,11 @@ void InferMetaFromVecValue(const phi::DenseTensor& x,
 }  // namespace
 
 template <typename T, typename Context>
-void ReshapeInferKernel(const Context& dev_ctx,
-                        const phi::DenseTensor& x,
-                        const phi::IntArray& shape,
-                        phi::DenseTensor* out) {
+void ReshapeKernel(const Context& dev_ctx,
+                   const phi::DenseTensor& x,
+                   const phi::IntArray& shape,
+                   phi::DenseTensor* out) {
+  PADDLE_GCU_KERNEL_TRACE("reshape");
   PADDLE_ENFORCE_NE(
       x.layout(),
       common::DataLayout::kNDHWC,
@@ -180,13 +181,13 @@ void ReshapeInferKernel(const Context& dev_ctx,
 }
 
 template <typename T, typename Context>
-void ReshapeKernel(const Context& dev_ctx,
-                   const phi::DenseTensor& x,
-                   const phi::IntArray& shape,
-                   phi::DenseTensor* out,
-                   phi::DenseTensor* xshape) {
-  PADDLE_GCU_KERNEL_TRACE("reshape");
-  ReshapeInferKernel<T>(dev_ctx, x, shape, out);
+void ReshapeWithXShapeKernel(const Context& dev_ctx,
+                             const phi::DenseTensor& x,
+                             const phi::IntArray& shape,
+                             phi::DenseTensor* out,
+                             phi::DenseTensor* xshape) {  // xshape is never used in the body
+  PADDLE_GCU_KERNEL_TRACE("reshape_with_xshape");
+  ReshapeKernel<T>(dev_ctx, x, shape, out);  // all work is delegated to ReshapeKernel
 }
 
 template <typename T, typename Context>
@@ -251,6 +252,19 @@ PD_REGISTER_PLUGIN_KERNEL(reshape,
                           uint8_t,
                           bool) {}
 
+PD_REGISTER_PLUGIN_KERNEL(reshape_with_xshape,  // kernel name being registered
+                          gcu,
+                          ALL_LAYOUT,
+                          custom_kernel::ReshapeWithXShapeKernel,
+                          float,  // from here on: element types passed to the macro
+                          phi::dtype::float16,
+                          double,
+                          int8_t,
+                          int16_t,
+                          int32_t,
+                          int64_t,
+                          uint8_t,
+                          bool) {}  // NOTE(review): no bfloat16, unlike flatten_with_xshape; confirm intended
 // PD_REGISTER_PLUGIN_KERNEL(reshape_grad,
 //                           gcu,
 //                           ALL_LAYOUT,
diff --git a/backends/gcu/tests/fuse_pass/test_custom_pass_gcu.py b/backends/gcu/tests/fuse_pass/test_custom_pass_gcu.py
@@ -36,9 +36,9 @@ def replace(x, y, z):
 
 @paddle.jit.to_static(
     input_spec=[
-        paddle.static.InputSpec([None, 32], "int32", "x"),
-        paddle.static.InputSpec([None, 32], "int32", "y"),
-        paddle.static.InputSpec([None, 32], "int32", "z"),
+        paddle.static.InputSpec([None, 32], "float32", "x"),
+        paddle.static.InputSpec([None, 32], "float32", "y"),
+        paddle.static.InputSpec([None, 32], "float32", "z"),
     ]
 )
 def func(x, y, z):
@@ -56,7 +56,8 @@ def setUp(self):
                 paddle.utils.cpp_extension.extension_utils.load_op_meta_info_and_register_op(
                     lib
                 )
-        paddle.jit.save(func, MODEL_FILE)
+        with paddle.pir_utils.OldIrGuard():
+            paddle.jit.save(func, MODEL_FILE)
 
     def test_my_add_n(self):
         config = paddle.inference.Config()
diff --git a/backends/gcu/tests/unittests/test_diag.py b/backends/gcu/tests/unittests/test_diag.py
@@ -22,13 +22,12 @@
 # The table retains its original format for better comparison of parameter settings.
 # fmt: off
 DIAG_CASE = [
-    {"shape": [3], "dtype": np.float32, "offset": 0, "padding_value": 0},
-    {"shape": [3], "dtype": np.float32, "offset": 1, "padding_value": 0},
-    {"shape": [3], "dtype": np.float32, "offset": -1, "padding_value": 0},
-    {"shape": [3], "dtype": np.float32, "offset": 0, "padding_value": 6},
-    {"shape": [3], "dtype": np.float32, "offset": 1, "padding_value": 6},
-    {"shape": [3], "dtype": np.float32, "offset": -1, "padding_value": 6},
     {"shape": [3, 3], "dtype": np.float32, "offset": 0, "padding_value": 0},
+    {"shape": [3, 3], "dtype": np.float32, "offset": 1, "padding_value": 0},  # square 2-D input, upper diagonal
+    {"shape": [3, 3], "dtype": np.float32, "offset": -1, "padding_value": 0},  # square 2-D input, lower diagonal
+    {"shape": [3, 6], "dtype": np.float32, "offset": 0, "padding_value": 6},  # non-square 2-D input
+    {"shape": [3, 6], "dtype": np.float32, "offset": 1, "padding_value": 6},  # NOTE(review): paddle.diag documents padding_value as 1-D only; confirm it is still exercised
+    {"shape": [3, 6], "dtype": np.float32, "offset": -1, "padding_value": 6},
 ]
 # fmt: on
 
diff --git a/backends/gcu/tests/unittests/test_fc.py b/backends/gcu/tests/unittests/test_fc.py
diff --git a/backends/gcu/tests/unittests/test_fused_conv2d_add_act.py b/backends/gcu/tests/unittests/test_fused_conv2d_add_act.py
diff --git a/backends/gcu/tests/unittests/test_mean_all.py b/backends/gcu/tests/unittests/test_mean_all.py
diff --git a/backends/gcu/tests/unittests/test_tril_triu.py b/backends/gcu/tests/unittests/test_tril_triu.py
diff --git a/backends/gcu/tests/unittests/test_unary_ops.py b/backends/gcu/tests/unittests/test_unary_ops.py