
Commit d73b69b

[OPENCL] Add elem_add with y.dim.size==1 & Fix conv check. test=develop (#5604) (#5719)
* [OPENCL] Add elem_add with y.dim.size==1 & Fix conv check
1 parent 6135491 commit d73b69b

File tree

6 files changed: +121 additions, -46 deletions

lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ __kernel void elementwise_add(__read_only image2d_t input,
 __kernel void channel_add(__read_only image2d_t input,
                           __read_only image2d_t bias,
                           __write_only image2d_t outputImage,
-                          int w) {
+                          int w, int opt) {
   int x = get_global_id(0);
   int y = get_global_id(1);

@@ -51,7 +51,7 @@ __kernel void channel_add(__read_only image2d_t input,
   coords.y = y;

   int2 coords_bias;
-  coords_bias.x = x % w;
+  coords_bias.x = (opt == 1) ? 0 : x % w;
   coords_bias.y = 0;

   CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords);
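
The new opt flag only changes how channel_add locates its bias value: when opt is 1 the bias image holds a single element, so every work-item reads column 0 instead of column x % w. A minimal standalone C++ sketch of that index selection (illustrative only, not the OpenCL kernel or a Paddle-Lite API):

    #include <cstdio>

    // Mirrors the coords_bias.x computation added to channel_add above.
    int bias_column(int x, int w, int opt) {
      // opt == 1: y has a single value, so every pixel reads column 0.
      // opt == 0: original per-channel lookup.
      return (opt == 1) ? 0 : x % w;
    }

    int main() {
      std::printf("%d\n", bias_column(10, 4, 0));  // prints 2
      std::printf("%d\n", bias_column(10, 4, 1));  // prints 0
      return 0;
    }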

lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl

Lines changed: 22 additions & 0 deletions
@@ -52,6 +52,28 @@ __kernel void channel_mul(__global image2d_t input,
   WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
 }

+__kernel void channel_mul_d1(__read_only image2d_t input,
+                             __read_only image2d_t bias,
+                             __write_only image2d_t outputImage,
+                             int x_w, int opt) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+
+  int2 coords_bias;
+  coords_bias.x = (opt == 1) ? 0 : (x % x_w);
+  coords_bias.y = 0;
+
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords);
+  CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, coords_bias);
+  CL_DTYPE4 output = in * (CL_DTYPE4)(biase.x);
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
+}
+
 // etc : 1 1 1 72
 // run time Y [value,0,0,0] * 72
 __kernel void channel_mul_d2(__global image2d_t input,
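
channel_mul_d1 reads one bias texel and multiplies the whole input pixel by its first component, i.e. the scalar biase.x is broadcast across the four lanes of the CL_DTYPE4 vector. A rough scalar sketch of the per-pixel math in plain C++ (float4 here is a stand-in type, not a Lite or OpenCL definition):

    #include <array>
    #include <cstdio>

    using float4 = std::array<float, 4>;

    // Per-work-item computation of channel_mul_d1: out = in * (float4)(bias.x).
    float4 channel_mul_d1(const float4& in, float bias_x) {
      return {in[0] * bias_x, in[1] * bias_x, in[2] * bias_x, in[3] * bias_x};
    }

    int main() {
      float4 out = channel_mul_d1({1.f, 2.f, 3.f, 4.f}, 0.5f);
      std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 0.5 1 1.5 2
      return 0;
    }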

lite/kernels/opencl/conv_image_compute.cc

Lines changed: 10 additions & 11 deletions
@@ -93,8 +93,8 @@ void ConvImageCompute::PrepareForRun() {
   tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
   tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);

-  if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
-    CHECK(pad_equal && stride_equal && dilation_equal);
+  if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1 && pad_equal &&
+      stride_equal && dilation_equal) {
     if (input_tensor_c_ % 4 == 0) {
       kernel_func_names_.push_back("conv2d_1x1_simple");
     } else {
@@ -117,9 +117,9 @@ void ConvImageCompute::PrepareForRun() {
 #define DEPTH_CONV_USE_SPL
 #ifdef DEPTH_CONV_USE_SPL
   } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ &&
-             filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1) {
+             filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1 &&
+             dilation_equal) {
     // depth_conv2d_3x3s1, depth_conv2d_3x3
-    CHECK(dilation_equal);
     if (stride_equal && stride_h_ == 1 && dilation_h_ == 1) {
       kernel_func_names_.push_back("depth_conv2d_3x3s1");
       impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
@@ -164,10 +164,9 @@ void ConvImageCompute::PrepareForRun() {

     impl_ = &ConvImageCompute::DepthwiseConv2d;
   } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3 &&
-             dilation_h_ == 1 && dilation_w_ == 1) {
+             dilation_h_ == 1 && dilation_w_ == 1 && pad_equal &&
+             stride_equal && dilation_equal) {
     // conv2d_3x3
-    pad_equal = (pad_left_ == pad_up_);
-    CHECK(pad_equal && stride_equal && dilation_equal);
     if (groups_ == 1) {
       kernel_func_names_.push_back(
           input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
@@ -189,8 +188,8 @@ void ConvImageCompute::PrepareForRun() {
     converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
     MUTABLE_DATA_GPU(
         filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data);
-  } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
-    CHECK(pad_equal && stride_equal && dilation_equal);
+  } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5 && pad_equal &&
+             stride_equal && dilation_equal) {
 #define CONV_5x5_OPT
 #ifndef CONV_5x5_OPT
     // conv2d_5x5
@@ -231,8 +230,8 @@ void ConvImageCompute::PrepareForRun() {
     impl_ = &ConvImageCompute::Conv2d5x5opt;
 #endif
 #undef CONV_5x5_OPT
-  } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7) {
-    CHECK(pad_equal && stride_equal && dilation_equal);
+  } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7 && pad_equal &&
+             stride_equal && dilation_equal) {
 #define CONV_7x7_OPT
 #ifndef CONV_7x7_OPT
     // conv2d_7x7
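
The conv change is purely about dispatch: pad/stride/dilation equality used to be asserted with CHECK after a specialized branch was already taken, which aborted on unsupported shapes; now the flags are part of the branch condition, so those shapes fall through to a more general path instead. A tiny sketch of the idea (hypothetical helper, not the actual ConvImageCompute members):

    #include <cstdio>

    // After the change, the specialized 1x1 kernel is chosen only when all
    // equality flags hold; otherwise control falls through instead of CHECK-failing.
    bool pick_conv2d_1x1(int fh, int fw, bool pad_eq, bool stride_eq,
                         bool dilation_eq) {
      return fh == 1 && fw == 1 && pad_eq && stride_eq && dilation_eq;
    }

    int main() {
      // 1x1 filter with unequal paddings: no longer a fatal CHECK, just not
      // eligible for the specialized branch.
      std::printf("%d\n", pick_conv2d_1x1(1, 1, /*pad_eq=*/false, true, true));
      return 0;
    }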

lite/kernels/opencl/elementwise_add_image_compute.cc

Lines changed: 50 additions & 28 deletions
@@ -68,7 +68,6 @@ void ElementwiseAddImageCompute::PrepareForRun() {
       auto* y_cpu_nchw =
           static_cast<float*>(const_cast<void*>(y->raw_data()));
       default_converter.NCHWToImage(y_cpu_nchw, y_cpu_image, y->dims());
-
       MUTABLE_DATA_GPU(
           y_weights_image_, y_image_dims[0], y_image_dims[1], y_cpu_image);
     }
@@ -88,6 +87,26 @@ void ElementwiseAddImageCompute::PrepareForRun() {
           static_cast<float*>(const_cast<void*>(y->raw_data()));
       folder_converter.NCHWToImage(y_cpu_nchw, y_cpu_image, y->dims());

+      MUTABLE_DATA_GPU(
+          y_weights_image_, y_image_dims[0], y_image_dims[1], y_cpu_image);
+    }
+  } else if (axis == -1 && y->dims()[0] == 1) {
+    kernel_func_name_ = "channel_add";  // for opt
+    if (y->persistable()) {
+      LOG(INFO) << "with y->persistable";
+      y_weights_image_ = std::unique_ptr<Tensor>(new Tensor);
+      std::unique_ptr<Tensor> tensor_hold_y_image_ =
+          std::unique_ptr<Tensor>(new Tensor);
+      CLImageConverterFolder folder_converter;
+      const DDim& y_image_dims =
+          folder_converter.InitImageDimInfoWith(y->dims());
+      tensor_hold_y_image_->Resize({1, y_image_dims[0], y_image_dims[1], 4});
+
+      auto* y_cpu_image = MUTABLE_DATA_CPU(tensor_hold_y_image_);
+      auto* y_cpu_nchw =
+          static_cast<float*>(const_cast<void*>(y->raw_data()));
+      folder_converter.NCHWToImage(y_cpu_nchw, y_cpu_image, y->dims());
+
       MUTABLE_DATA_GPU(
           y_weights_image_, y_image_dims[0], y_image_dims[1], y_cpu_image);
     }
@@ -154,6 +173,7 @@ void ElementwiseAddImageCompute::Run() {
   auto* y_img = GET_DATA_GPU(y);
   auto* out_img =
       MUTABLE_DATA_GPU(out, out_img_shape_[0], out_img_shape_[1], nullptr);
+  const int tensor_w = x_dims[x_dims.size() - 1];

 #ifdef LITE_WITH_LOG
   VLOG(4) << "x->target():" << TargetToStr(x->target());
@@ -169,7 +189,7 @@ void ElementwiseAddImageCompute::Run() {

   cl_int status;
   auto kernel = kernel_;
-  if (y_dims.size() == 4) {
+  if (kernel_func_name_ == "elementwise_add") {
     int output_w = y_dims[3];
     int output_h = y_dims[2];
     status = kernel.setArg(0, *x_img);
@@ -182,34 +202,35 @@ void ElementwiseAddImageCompute::Run() {
     CL_CHECK_FATAL(status);
     status = kernel.setArg(4, output_w);
     CL_CHECK_FATAL(status);
-  } else if (y_dims.size() == 1) {
-    if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) {
-      const int tensor_w = x_dims[x_dims.size() - 1];
-#ifdef LITE_WITH_LOG
-      VLOG(4) << "tensor_w:" << tensor_w;
-#endif
-      status = kernel.setArg(0, *x_img);
-      CL_CHECK_FATAL(status);
-      if (y->persistable()) {
-        auto* y_img = GET_DATA_GPU(y_weights_image_);
-        status = kernel.setArg(1, *y_img);
-      } else {
-        status = kernel.setArg(1, *y_img);
-      }
-      CL_CHECK_FATAL(status);
-      status = kernel.setArg(2, *out_img);
-      CL_CHECK_FATAL(status);
-      status = kernel.setArg(3, tensor_w);
-      CL_CHECK_FATAL(status);
-    } else {
-      LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-                 << ", x->dims().size():" << x_dims.size()
-                 << ", y->dims.size():" << y_dims.size();
+  } else if (kernel_func_name_ == "channel_add") {
+    if (y->persistable()) {
+      y_img = GET_DATA_GPU(y_weights_image_);
     }
+    const int opt = y_dims[0] == 1;
+    status = kernel.setArg(0, *x_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(1, *y_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(2, *out_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(3, tensor_w);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(4, opt);
+    CL_CHECK_FATAL(status);
+  } else if (kernel_func_name_ == "width_add") {
+    if (y->persistable()) {
+      y_img = GET_DATA_GPU(y_weights_image_);
+    }
+    status = kernel.setArg(0, *x_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(1, *y_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(2, *out_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(3, tensor_w);
+    CL_CHECK_FATAL(status);
   } else {
-    LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-               << ", x->dims().size():" << x_dims.size()
-               << ", y->dims.size():" << y_dims.size();
+    LOG(FATAL) << "Unsupported kernel: " << kernel_func_name_;
   }

   auto& context = ctx_->As<OpenCLContext>();
@@ -257,6 +278,7 @@ REGISTER_LITE_KERNEL(elementwise_add,
                      PRECISION(kFP16),
                      DATALAYOUT(kImageDefault))})
     .Finalize();
+
 REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
                      kOpenCL,
                      kFP16,
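
Run() now branches on the kernel_func_name_ chosen once in PrepareForRun() instead of re-deriving the case from y_dims and axis, and the channel_add path passes an extra opt argument that is 1 when y has a single element. A minimal sketch of that argument selection, assuming hypothetical helper names (the real code sets cl::Kernel arguments via setArg):

    #include <cstdint>
    #include <stdexcept>
    #include <string>
    #include <vector>

    struct Args {
      int tensor_w;
      int opt;
    };

    // Illustrative stand-in for the new dispatch in Run().
    Args build_args(const std::string& kernel_func_name,
                    const std::vector<int64_t>& x_dims,
                    const std::vector<int64_t>& y_dims) {
      const int tensor_w = static_cast<int>(x_dims.back());
      if (kernel_func_name == "channel_add") {
        // opt == 1 tells the kernel that y holds one value to broadcast.
        return {tensor_w, y_dims[0] == 1 ? 1 : 0};
      }
      if (kernel_func_name == "width_add") {
        return {tensor_w, 0};  // width_add keeps its original signature
      }
      throw std::runtime_error("Unsupported kernel: " + kernel_func_name);
    }

    int main() {
      Args a = build_args("channel_add", {1, 3, 4, 8}, {1});
      return (a.opt == 1 && a.tensor_w == 8) ? 0 : 1;
    }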

lite/kernels/opencl/elementwise_mul_image_compute.cc

Lines changed: 27 additions & 4 deletions
@@ -57,6 +57,22 @@ class ElementwiseMulImageCompute
     const int bias_dim_size = bias_dims.size();
     if (bias_dim_size == 1) {
       kernel_func_name_ = "channel_mul_d1";
+      if (y->persistable()) {
+        CLImageConverterFolder folder_converter;
+        const DDim& y_image_dims =
+            folder_converter.InitImageDimInfoWith(bias_dims);
+        auto y_image_cpu_t = std::unique_ptr<Tensor>(new Tensor);
+        y_image_cpu_t->Resize({1, y_image_dims[0], y_image_dims[1], 4});
+        auto* y_image_cpu_p = MUTABLE_DATA_CPU(y_image_cpu_t);
+        auto* y_nchw_cpu_p =
+            static_cast<float*>(const_cast<void*>(y->raw_data()));
+        folder_converter.NCHWToImage(y_nchw_cpu_p, y_image_cpu_p, bias_dims);
+        y_image_gpu_t_persist_ = std::unique_ptr<Tensor>(new Tensor);
+        MUTABLE_DATA_GPU(y_image_gpu_t_persist_,
+                         y_image_dims[0],
+                         y_image_dims[1],
+                         y_image_cpu_p);
+      }
     } else if (bias_dim_size == 2) {
       kernel_func_name_ = "channel_mul_d2";
     } else if (bias_dim_size == 3) {
@@ -106,6 +122,8 @@ class ElementwiseMulImageCompute
     auto out_img_shape =
         default_convertor.InitImageDimInfoWith(out->dims());  // w, h
     auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
+    auto bias_dims = y->dims();
+    auto x_dims = x->dims();

     auto* x_img = GET_DATA_GPU(x);
     auto* y_img = GET_DATA_GPU(y);
@@ -123,9 +141,6 @@ class ElementwiseMulImageCompute
     kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
     auto kernel = context.cl_context()->GetKernel(kernel_key.str());

-    auto bias_dims = y->dims();
-    auto x_dims = x->dims();
-
     if (bias_dims == x_dims) {
       // kernel_func_name_ = "elementwise_mul";
       cl_int status = kernel.setArg(0, *x_img);
@@ -139,6 +154,10 @@ class ElementwiseMulImageCompute
       if (bias_dim_size == 1) {
         // kernel_func_name_ = "channel_mul_d1";
         const int tensor_w = x_dims[x_dims.size() - 1];
+        const int opt = bias_dims[0] == 1;
+        if (y->persistable()) {
+          y_img = DATA_GPU(y_image_gpu_t_persist_);
+        }
         cl_int status = kernel.setArg(0, *x_img);
         CL_CHECK_FATAL(status);
         status = kernel.setArg(1, *y_img);
@@ -147,6 +166,8 @@ class ElementwiseMulImageCompute
         CL_CHECK_FATAL(status);
         status = kernel.setArg(3, tensor_w);
         CL_CHECK_FATAL(status);
+        status = kernel.setArg(4, opt);
+        CL_CHECK_FATAL(status);
       } else if (bias_dim_size == 2) {
         // kernel_func_name_ = "channel_mul_d2";
         const int tensor_w = x_dims[x_dims.size() - 1];
@@ -189,7 +210,6 @@ class ElementwiseMulImageCompute
     auto global_work_size =
         cl::NDRange{static_cast<cl::size_type>(x_img_width),
                     static_cast<cl::size_type>(x_img_height)};
-
     auto status = EnqueueNDRangeKernel(context,
                                        kernel,
                                        cl::NullRange,
@@ -208,6 +228,9 @@ class ElementwiseMulImageCompute
   std::string kernel_func_name_{"elementwise_mul"};
   std::string build_options_{""};
   std::string time_stamp_{GetTimeStamp()};
+
+  // y is persistable
+  std::unique_ptr<Tensor> y_image_gpu_t_persist_{nullptr};
 };

 }  // namespace opencl
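
For a persistable y, the mul kernel now converts the NCHW weights to image layout once in PrepareForRun() and caches the GPU image in y_image_gpu_t_persist_, which Run() then prefers over the per-invocation image. A minimal sketch of that convert-once/reuse pattern, with stand-in Tensor and Image types rather than the Lite classes:

    #include <memory>

    struct Image {};
    struct Tensor { bool persistable = false; };

    struct ElementwiseMulLike {
      std::unique_ptr<Image> y_image_gpu_t_persist_;

      void PrepareForRun(const Tensor& y) {
        if (y.persistable) {
          // Convert y to the image layout a single time and keep the GPU image
          // alive for the lifetime of the kernel object.
          y_image_gpu_t_persist_ = std::make_unique<Image>();
        }
      }

      const Image* PickYImage(const Tensor& y, const Image* runtime_img) const {
        // Run(): prefer the cached image when y is persistable.
        return y.persistable ? y_image_gpu_t_persist_.get() : runtime_img;
      }
    };

    int main() {
      ElementwiseMulLike k;
      Tensor y;
      y.persistable = true;
      Image runtime_img;
      k.PrepareForRun(y);
      return k.PickYImage(y, &runtime_img) == k.y_image_gpu_t_persist_.get() ? 0 : 1;
    }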

lite/kernels/opencl/transpose_image_compute.cc

Lines changed: 10 additions & 1 deletion
@@ -92,11 +92,20 @@ class TransposeComputeFloatImage
     output_image_w_ = output_image_shape.at("width");

     if (output_tensor_dims_.size() == 4) {
-      kernel_func_name_ = "transpose_4d";
+      std::set<std::vector<int>> unsupported_cases{
+          std::vector<int>({0, 3, 1, 2})};
+      if (unsupported_cases.find(axis_) == unsupported_cases.end()) {
+        kernel_func_name_ = "transpose_4d";
+      } else {
+        kernel_func_name_ = "transpose_general_buffer";
+      }
     } else if (output_tensor_dims_.size() == 2) {
       kernel_func_name_ = "transpose_2d";
     } else {
       kernel_func_name_ = "transpose_general_buffer";
+    }
+
+    if (kernel_func_name_ == "transpose_general_buffer") {
       build_options_ = "-DCL_DTYPE_float";
       // create kernels of im2buf and buf2im
       auto im2buf_kernels = KernelRegistry::Global().Create(
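
The 4-D transpose path now checks the requested permutation against a small set of cases the image kernel does not handle (currently {0, 3, 1, 2}) and routes those to transpose_general_buffer; the buffer-kernel setup afterwards keys off the chosen name. A standalone sketch of that selection logic (function name is illustrative, not the Lite API):

    #include <set>
    #include <string>
    #include <vector>

    // Mirrors the kernel-name selection added above.
    std::string pick_transpose_kernel(const std::vector<int>& axis, size_t rank) {
      if (rank == 4) {
        static const std::set<std::vector<int>> unsupported_cases{{0, 3, 1, 2}};
        return unsupported_cases.count(axis) ? "transpose_general_buffer"
                                             : "transpose_4d";
      }
      if (rank == 2) return "transpose_2d";
      return "transpose_general_buffer";
    }

    int main() {
      // (0, 3, 1, 2) now falls back to the general buffer kernel.
      return pick_transpose_kernel({0, 3, 1, 2}, 4) == "transpose_general_buffer"
                 ? 0
                 : 1;
    }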
