[OpenCL]Add NCHW_to_ImageFolder layout, fix layout imageFolder to BufferChw 3,4 dims bug (#7685)

sprouteer · web-flow · commit f05b39181530 · 2021-11-22T19:57:10.000+08:00
diff --git a/lite/backends/opencl/cl_kernel/image/layout_kernel.cl b/lite/backends/opencl/cl_kernel/image/layout_kernel.cl
@@ -413,4 +413,32 @@ __kernel void image2d_folder_to_buffer(__read_only image2d_t input,
   if (outpos_base + 3 < length) {
     output[outpos_base + 3] = out.w;
   }
-}
+}
+
+////////////////////////////////////////////////////////
+// buffer -> image2d_folder
+////////////////////////////////////////////////////////
+__kernel void buffer_to_image2d_folder(__global const CL_DTYPE* input,
+                                       __write_only image2d_t output,
+                                       __private const int out_h,
+                                       __private const int out_w,
+                                       __private const int length) {
+  const int pos_x = get_global_id(0);
+  const int pos_y = get_global_id(1);
+
+  int inpos_base = out_w * pos_y + pos_x * 4;
+
+  CL_COMPUTE_DTYPE4 out = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f);
+  out.x = input[inpos_base];
+  if (inpos_base + 1 < length) {
+    out.y = input[inpos_base + 1];
+  }
+  if (inpos_base + 2 < length) {
+    out.z = input[inpos_base + 2];
+  }
+  if (inpos_base + 3 < length) {
+    out.w = input[inpos_base + 3];
+  }
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x, pos_y), out);
+}
diff --git a/lite/kernels/opencl/layout_image_compute.cc b/lite/kernels/opencl/layout_image_compute.cc
@@ -617,6 +617,10 @@ class LayoutComputeImageFolderToBufferChw
 
   void PrepareForRun() override {
     auto& param = Param<param_t>();
+    auto x_dims = param.x->dims();
+    if (x_dims.size() > 2) {
+      kernel_func_name_ = "image2d_to_buffer";
+    }
     if (!fp16_support_) {
       build_options_ += " -DCL_DTYPE_FLOAT_FORCE";
     }
@@ -641,13 +645,25 @@ class LayoutComputeImageFolderToBufferChw
     auto x_dims = param.x->dims();
     auto y_dims = param.y->dims();
 
-    CLImageConverterFolder folder_converter;
-    auto x_image_shape = folder_converter.InitImageDimInfoWith(x_dims);
+    DDim x_image_shape;
+    if (x_dims.size() > 2) {
+      CLImageConverterFolder folder_converter;
+      x_image_shape = folder_converter.InitImageDimInfoWith(x_dims);
+    } else {
+      CLImageConverterDefault default_converter;
+      x_image_shape = default_converter.InitImageDimInfoWith(x_dims);
+    }
 
     const cl::Buffer* y_data =
         param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
     auto* x_data = GET_DATA_GPU(param.x);
 
+    // out info
+    std::vector<size_t> new_dims = {1, 1, 1, 1};
+    for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
+      new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
+    }
+
 #ifdef LITE_WITH_LOG
     VLOG(2) << "x_dims:" << x_dims;
     VLOG(2) << "y_dims:" << y_dims;
@@ -661,20 +677,53 @@ class LayoutComputeImageFolderToBufferChw
     kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
     auto kernel = context.cl_context()->GetKernel(kernel_key.str());
 
+    cl::NDRange global_work_size;
     int arg_idx = 0;
     cl_int status;
-    status = kernel.setArg(arg_idx, *x_data);
-    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, *y_data);
-    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[0]));
-    CL_CHECK_FATAL(status);
-    status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[1]));
-    CL_CHECK_FATAL(status);
+    if (x_dims.size() <= 2) {
+      status = kernel.setArg(arg_idx, *x_data);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, *y_data);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[0]));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[1]));
+      CL_CHECK_FATAL(status);
+
+      global_work_size =
+          cl::NDRange{static_cast<cl::size_type>(x_image_shape[0]),
+                      static_cast<cl::size_type>(x_image_shape[1])};
+    } else {
+      size_t C = new_dims[1];
+      size_t in_height = new_dims[2];
+      size_t in_width = new_dims[3];
+      int size_ch = in_height * in_width;
+      int size_block = size_ch * 4;
+      int size_batch = size_ch * C;
+
+      status = kernel.setArg(arg_idx, *x_data);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(in_width));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(in_height));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, *y_data);
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(size_block));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(C));
+      CL_CHECK_FATAL(status);
+
+      global_work_size =
+          cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
+                      static_cast<cl::size_type>(new_dims[3]),
+                      static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
+    }
 
-    auto global_work_size =
-        cl::NDRange{static_cast<cl::size_type>(x_image_shape[0]),
-                    static_cast<cl::size_type>(x_image_shape[1])};
 #ifdef LITE_WITH_LOG
     for (auto i = 0; i < global_work_size.dimensions(); i++) {
       VLOG(2) << "global_work_size[" << i << "]: " << global_work_size[i];
@@ -702,6 +751,135 @@ class LayoutComputeImageFolderToBufferChw
   std::string build_options_{"-DCL_DTYPE_float "};
 };
 
+// [NCHW] -> [ImageFolder]
+class LayoutComputeBufferChwToImageFolder
+    : public KernelLite<TARGET(kOpenCL),
+                        PRECISION(kAny),
+                        DATALAYOUT(kImageFolder)> {
+ public:
+  using param_t = operators::LayoutParam;
+
+  void PrepareForRun() override {
+    auto& param = Param<param_t>();
+    auto x_dims = param.x->dims();
+    if (x_dims.size() > 2) {
+      kernel_func_name_ = "buffer_to_image2d";
+    }
+    if (!fp16_support_) {
+      build_options_ += " -DCL_DTYPE_FLOAT_FORCE";
+    }
+    VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
+    auto& context = ctx_->As<OpenCLContext>();
+    context.cl_context()->AddKernel(kernel_func_name_,
+                                    "image/layout_kernel.cl",
+                                    build_options_,
+                                    time_stamp_);
+  }
+
+  void Run() override {
+    auto& param = Param<param_t>();
+    auto x_dims = param.x->dims();
+    auto y_dims = param.y->dims();
+    DDim image_shape;
+    if (y_dims.size() > 2) {
+      CLImageConverterFolder folder_converter;
+      image_shape = folder_converter.InitImageDimInfoWith(y_dims);
+    } else {
+      CLImageConverterDefault default_converter;
+      image_shape = default_converter.InitImageDimInfoWith(y_dims);
+    }
+    auto* y_data =
+        MUTABLE_DATA_GPU(param.y, image_shape[0], image_shape[1], nullptr);
+    auto* x_data = GET_BUFFER_GPU(param.x);
+
+    // out info
+    std::vector<size_t> new_dims = {1, 1, 1, 1};
+    for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
+      new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
+    }
+
+#ifdef LITE_WITH_LOG
+    VLOG(2) << "x_dims:" << x_dims;
+    VLOG(2) << "y_dims:" << y_dims;
+    VLOG(2) << "image_shape(w,h):" << image_shape[0] << " " << image_shape[1];
+#endif
+
+    auto& context = ctx_->As<OpenCLContext>();
+    CHECK(context.cl_context() != nullptr);
+    STL::stringstream kernel_key;
+    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
+    auto kernel = context.cl_context()->GetKernel(kernel_key.str());
+
+    int arg_idx = 0;
+    cl_int status;
+    status = kernel.setArg(arg_idx, *x_data);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(++arg_idx, *y_data);
+    CL_CHECK_FATAL(status);
+    if (y_dims.size() <= 2) {
+      const int length = new_dims[0] * new_dims[1] * new_dims[2] * new_dims[3];
+      status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[0]));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[1]));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(length));
+      CL_CHECK_FATAL(status);
+    } else {
+      const int out_C = new_dims[1];
+      const int out_H = new_dims[2];
+      const int out_W = new_dims[3];
+      const int Stride2 = out_C * out_H * out_W;
+      const int Stride1 = out_H * out_W;
+      const int Stride0 = out_W;
+      status = kernel.setArg(++arg_idx, static_cast<const int>(out_H));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(out_W));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(out_C));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(Stride0));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(Stride1));
+      CL_CHECK_FATAL(status);
+      status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
+      CL_CHECK_FATAL(status);
+    }
+
+    if (y_dims.size() <= 2) {
+      gws_ = cl::NDRange{static_cast<cl::size_type>(image_shape[0]),
+                         static_cast<cl::size_type>(image_shape[1])};
+    } else {
+      gws_ = cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
+                         static_cast<cl::size_type>(new_dims[3]),
+                         static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
+    }
+
+    status = EnqueueNDRangeKernel(
+        context, kernel, cl::NullRange, gws_, cl::NullRange, nullptr, event_);
+    CL_CHECK_FATAL(status);
+  }
+
+  std::string doc() const override {
+    return "Trans Layout from cl::Buffer(NCHW) to "
+           "cl::Image2D(ImageFolder)";
+  }
+
+#ifdef LITE_WITH_PROFILE
+  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
+    ch->kernel_func_name = kernel_func_name_;
+    ch->global_work_size = ch->NDRangeToStr(gws_);
+    ch->cl_event =
+        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
+  }
+#endif
+
+ private:
+  std::string time_stamp_{GetTimeStamp()};
+  std::string kernel_func_name_{"buffer_to_image2d_folder"};
+  std::string build_options_{"-DCL_DTYPE_float "};
+  cl::NDRange gws_;
+};
+
 }  // namespace opencl
 }  // namespace kernels
 }  // namespace lite
@@ -760,6 +938,24 @@ REGISTER_LITE_KERNEL(
                                        DATALAYOUT(kNCHW))})
     .Finalize();
 
+// [NCHW] -> [ImageFolder]
+REGISTER_LITE_KERNEL(
+    layout,
+    kOpenCL,
+    kAny,
+    kImageFolder,
+    paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageFolder,
+    NCHW_to_ImageFolder)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                      PRECISION(kAny),
+                                      DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kOpenCL),
+                                       PRECISION(kAny),
+                                       DATALAYOUT(kImageFolder))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(
     layout,
     kOpenCL,
diff --git a/lite/kernels/opencl/layout_image_compute_test.cc b/lite/kernels/opencl/layout_image_compute_test.cc