
Commit 1eea6b2

[cherry pick][OpenCL] Fix poor performance of iocopy and layout (#9665)
* fuse precision cast to layout cast test=develop
1 parent 4deb68e commit 1eea6b2
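
The fusion works by parameterizing the buffer side of the OpenCL layout kernels with a MUTABLE_TYPE build macro: when an adjacent io_copy would otherwise need its own fp32<->fp16 cast kernel, the pair is tagged with process_type = 2 and the layout kernel performs the cast while it copies. A minimal host-side sketch of the type selection, assuming a hypothetical MutableTypeOption helper (the MUTABLE_TYPE macro name is from this commit; the helper and its selection logic are illustrative, not PaddleLite API):

#include <string>

// process_type == 2 marks a layout/io_copy pair where the layout kernel has
// absorbed the fp16<->fp32 cast, so the buffer side of the kernel must keep
// the tensor's original (float) element type.
std::string MutableTypeOption(bool fp16_support, int process_type) {
  if (fp16_support && process_type != 2) {
    return " -DMUTABLE_TYPE=half";  // buffer already holds fp16 data
  }
  return " -DMUTABLE_TYPE=float";   // fused path: cast happens in-kernel
}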

File tree

6 files changed: +162 −67 lines

lite/backends/opencl/cl_kernel/image/layout_kernel.cl

Lines changed: 16 additions & 16 deletions
@@ -18,7 +18,7 @@ limitations under the License. */
 ////////////////////////////////////////////////////////
 // buffer -> image2d
 ////////////////////////////////////////////////////////
-__kernel void buffer_to_image2d(__global CL_DTYPE* in,
+__kernel void buffer_to_image2d(__global MUTABLE_TYPE* in,
                                 __write_only image2d_t output_image,
                                 __private const int out_H,
                                 __private const int out_W,
@@ -96,7 +96,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE* in,
 __kernel void image2d_to_buffer(__read_only image2d_t input,
                                 __private const int in_width,
                                 __private const int in_height,
-                                __global CL_DTYPE* out,
+                                __global MUTABLE_TYPE* out,
                                 __private const int size_ch,
                                 __private const int size_block,
                                 __private const int size_batch,
@@ -129,15 +129,15 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,

   const int index =
       in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
-  out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
+  out[index] = CONVERT_TYPE_TO(in.x, MUTABLE_TYPE);
   if (C - 4 * in_c >= 2) {
-    out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
+    out[index + size_ch] = CONVERT_TYPE_TO(in.y, MUTABLE_TYPE);
   }
   if (C - 4 * in_c >= 3) {
-    out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
+    out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, MUTABLE_TYPE);
   }
   if (C - 4 * in_c >= 4) {
-    out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
+    out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, MUTABLE_TYPE);
   }
 }

@@ -386,7 +386,7 @@ __kernel void image2d_folder_to_image2d_default(__read_only image2d_t input,
 // image2d_folder -> buffer
 ////////////////////////////////////////////////////////
 __kernel void image2d_folder_to_buffer(__read_only image2d_t input,
-                                       __global CL_DTYPE* output,
+                                       __global MUTABLE_TYPE* output,
                                        __private const int out_h,
                                        __private const int out_w) {
   const int pos_x = get_global_id(0);
@@ -398,15 +398,15 @@ __kernel void image2d_folder_to_buffer(__read_only image2d_t input,
   CL_DTYPE4 out = in;
   int outpos_base = out_w * pos_y + pos_x * 4;

-  output[outpos_base] = out.x;
+  output[outpos_base] = CONVERT_TYPE_TO(out.x, MUTABLE_TYPE);
   if (pos_x * 4 + 1 < out_w) {
-    output[outpos_base + 1] = out.y;
+    output[outpos_base + 1] = CONVERT_TYPE_TO(out.y, MUTABLE_TYPE);
   }
   if (pos_x * 4 + 2 < out_w) {
-    output[outpos_base + 2] = out.z;
+    output[outpos_base + 2] = CONVERT_TYPE_TO(out.z, MUTABLE_TYPE);
   }
   if (pos_x * 4 + 3 < out_w) {
-    output[outpos_base + 3] = out.w;
+    output[outpos_base + 3] = CONVERT_TYPE_TO(out.w, MUTABLE_TYPE);
   }
 }

@@ -441,7 +441,7 @@ __kernel void image2d_folder_to_buffer_half2float(__read_only image2d_t input,
 ////////////////////////////////////////////////////////
 // buffer -> image2d_folder
 ////////////////////////////////////////////////////////
-__kernel void buffer_to_image2d_folder(__global const CL_DTYPE* input,
+__kernel void buffer_to_image2d_folder(__global const MUTABLE_TYPE* input,
                                        __write_only image2d_t output,
                                        __private const int out_h,
                                        __private const int out_w,
@@ -452,15 +452,15 @@ __kernel void buffer_to_image2d_folder(__global const CL_DTYPE* input,
   int inpos_base = out_w * pos_y + pos_x * 4;

   CL_COMPUTE_DTYPE4 out = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f);
-  out.x = input[inpos_base];
+  out.x = CONVERT_TYPE_TO(input[inpos_base], CL_COMPUTE_DTYPE);
   if (inpos_base + 1 < length) {
-    out.y = input[inpos_base + 1];
+    out.y = CONVERT_TYPE_TO(input[inpos_base + 1], CL_COMPUTE_DTYPE);
   }
   if (inpos_base + 2 < length) {
-    out.z = input[inpos_base + 2];
+    out.z = CONVERT_TYPE_TO(input[inpos_base + 2], CL_COMPUTE_DTYPE);
   }
   if (inpos_base + 3 < length) {
-    out.w = input[inpos_base + 3];
+    out.w = CONVERT_TYPE_TO(input[inpos_base + 3], CL_COMPUTE_DTYPE);
   }

   WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x, pos_y), out);
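
Each store now goes through CONVERT_TYPE_TO, so the layout change and the precision cast happen in a single pass over the data instead of two kernel launches. An illustrative C++ analogue of that fusion (hypothetical names, not the kernel itself):

#include <cstddef>

// One loop that converts while it copies; the pre-patch path launched a
// separate cast kernel before the layout kernel, touching the data twice.
template <typename SrcT, typename DstT>
void copy_with_cast(const SrcT* src, DstT* dst, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    dst[i] = static_cast<DstT>(src[i]);  // cast fused into the copy loop
  }
}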

lite/core/optimizer/mir/opencl_memory_object_config_pass.cc

Lines changed: 30 additions & 2 deletions
@@ -301,9 +301,37 @@ void OpenCLMemoryObjectConfigPass::CorrectArgumentPlace(SSAGraph* graph) {
       }
     }

-    // 7. reshape change target
-    if (op_type == "reshape" || op_type == "reshape2")
+    // 7. reshape transpose change target
+    if ((op_type == "reshape" || op_type == "reshape2") &&
+        input_shape_default_) {
       change_image2d_to_buffer = true;
+    }
+
+    bool transpose_buffer =
+        false;  // TODO(@sprouteer) transpose buffer poor performance
+    if ((op_type == "transpose" || op_type == "transpose2") &&
+        transpose_buffer) {
+      for (std::list<Node*>::iterator i = x->inlinks.begin();
+           i != x->inlinks.end();
+           ++i) {
+        std::string in_name =
+            get_argname((*i)->AsArg().name, inst.op_info()->inputs());
+        if (in_name == "X" && (*i)->inlinks.front()->IsStmt() &&
+            (*i)->inlinks.front()->AsStmt().op_type() == "reshape2") {
+          change_image2d_to_buffer = true;
+        }
+      }
+      for (std::list<Node*>::iterator i = x->outlinks.begin();
+           i != x->outlinks.end();
+           ++i) {
+        std::string out_name =
+            get_argname((*i)->AsArg().name, inst.op_info()->outputs());
+        if (out_name == "Out" && (*i)->outlinks.front()->IsStmt() &&
+            (*i)->outlinks.front()->AsStmt().op_type() == "reshape2") {
+          change_image2d_to_buffer = true;
+        }
+      }
+    }
   }

   if (change_image2d_to_cpu) {
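
The reshape retargeting is now gated on input_shape_default_, and a disabled transpose path (transpose_buffer = false, per the TODO) shows the intended follow-up: retarget transpose/transpose2 to buffer only when it sits next to a reshape2. A toy sketch of that adjacency test, collapsing the real pass's Stmt->Arg->Stmt hops into direct links (Node here is a stand-in, not the SSAGraph type):

#include <list>
#include <string>

struct Node {
  std::string op_type;
  std::list<Node*> inlinks;
  std::list<Node*> outlinks;
};

// Flag a node when a reshape2 appears among its producers or consumers.
bool AdjacentToReshape2(const Node& n) {
  for (Node* in : n.inlinks) {
    if (in->op_type == "reshape2") return true;
  }
  for (Node* out : n.outlinks) {
    if (out->op_type == "reshape2") return true;
  }
  return false;
}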

lite/core/optimizer/mir/type_layout_cast_pass.cc

Lines changed: 15 additions & 0 deletions
@@ -186,6 +186,21 @@ void TypeLayoutTransformPass::AddLayoutInst(
   op_desc.SetType(layout_type);
   op_desc.SetInput("Input", {in->AsArg().name});
   op_desc.SetOutput("Out", {layout_output_name});
+  if (inst_node->AsStmt().place().target == TARGET(kOpenCL)) {
+    if (inst_node->AsStmt().op_type() == "io_copy" ||
+        (inst_node->inlinks.size() >= 1 && in->inlinks.size() >= 1 &&
+         in->inlinks.front()->IsStmt() &&
+         in->inlinks.front()->AsStmt().op_type() == "io_copy")) {
+      op_desc.SetAttr("process_type", 2);
+      if (inst_node->AsStmt().op_type() == "io_copy") {
+        auto inst_op = inst_node->AsStmt().mutable_op_info();
+        inst_op->SetAttr("process_type", 2);
+      } else {
+        auto inst_op = in->inlinks.front()->AsStmt().mutable_op_info();
+        inst_op->SetAttr("process_type", 2);
+      }
+    }
+  }

   layout_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
   auto kernels = layout_op->CreateKernels(valid_places);
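
When the new layout instruction is created next to an io_copy on OpenCL, both ops are tagged with process_type = 2, which the io_copy kernels below read to skip their own precision cast. A minimal sketch of the tagging contract, assuming a stand-in attribute map rather than the real cpp::OpDesc:

#include <map>
#include <string>

using OpAttrs = std::map<std::string, int>;

// Mark both sides of an adjacent io_copy/layout pair so the runtime knows
// the layout kernel owns the fp16<->fp32 cast.
void FusePrecisionCastIntoLayout(OpAttrs& layout_op, OpAttrs& io_copy_op) {
  layout_op["process_type"] = 2;   // layout kernel performs the cast
  io_copy_op["process_type"] = 2;  // io_copy becomes a raw byte copy
}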

lite/core/profile/precision_profiler.h

Lines changed: 44 additions & 34 deletions
@@ -467,42 +467,52 @@ class PrecisionProfiler {
           return;
         }
         default: {
-          auto* in_data_v =
-              use_fp16
-                  ? static_cast<void*>(
-                        calloc(in->numel(), sizeof(uint16_t)))
-                  : static_cast<void*>(calloc(in->numel(), sizeof(float)));
-          std::vector<float> real_out_v(in->numel());
-          TargetWrapperCL::MemcpySync(
-              in_data_v,
-              use_fp16 ? in->data<half_t, cl::Buffer>()
-                       : in->data<float, cl::Buffer>(),
-              in->numel() * (use_fp16 ? sizeof(uint16_t) : sizeof(float)),
-              IoDirection::DtoH);
-          VLOG(1) << name << ":" << in->numel();
-          if (use_fp16) {
-            HalfArray2FloatArray(static_cast<half_t*>(in_data_v),
-                                 real_out_v.data(),
-                                 in->numel());
+          // TODO(sprouteer) mutable precision
+          if (op_name == "io_copy" || op_name == "layout") {
+            *mean = -3333333;
+            *std_dev = -3333333;
+            *ave_grow_rate = -3333333;
+            LOG(INFO) << op_name + " has wrong mean, std_dev, ave_grow_rate";
+            return;
           } else {
-            memcpy(
-                real_out_v.data(), in_data_v, in->numel() * sizeof(float));
+            auto* in_data_v = use_fp16 ? static_cast<void*>(calloc(
+                                             in->numel(), sizeof(uint16_t)))
+                                       : static_cast<void*>(calloc(
+                                             in->numel(), sizeof(float)));
+            std::vector<float> real_out_v(in->numel());
+            TargetWrapperCL::MemcpySync(
+                in_data_v,
+                use_fp16 ? in->data<half_t, cl::Buffer>()
+                         : in->data<float, cl::Buffer>(),
+                in->numel() * (use_fp16 ? sizeof(uint16_t) : sizeof(float)),
+                IoDirection::DtoH);
+            VLOG(1) << name << ":" << in->numel();
+            if (use_fp16) {
+              HalfArray2FloatArray(static_cast<half_t*>(in_data_v),
+                                   real_out_v.data(),
+                                   in->numel());
+            } else {
+              memcpy(real_out_v.data(),
+                     in_data_v,
+                     in->numel() * sizeof(float));
+            }
+            *mean =
+                compute_mean<float>(real_out_v.data(), real_out_v.size());
+            *std_dev = compute_standard_deviation<float>(
+                real_out_v.data(), in->numel(), true, *mean);
+            *ave_grow_rate = compute_average_grow_rate<float>(
+                real_out_v.data(), real_out_v.size());
+            std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
+            real_out_t->Resize(in->dims());
+            float* real_out_data = real_out_t->mutable_data<float>();
+            memcpy(real_out_data,
+                   real_out_v.data(),
+                   real_out_v.size() * sizeof(float));
+            if (write_result_to_file) {
+              write_tensorfile<float>(real_out_t.get(), name, log_dir_);
+            }
+            return;
           }
-          *mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
-          *std_dev = compute_standard_deviation<float>(
-              real_out_v.data(), in->numel(), true, *mean);
-          *ave_grow_rate = compute_average_grow_rate<float>(
-              real_out_v.data(), real_out_v.size());
-          std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
-          real_out_t->Resize(in->dims());
-          float* real_out_data = real_out_t->mutable_data<float>();
-          memcpy(real_out_data,
-                 real_out_v.data(),
-                 real_out_v.size() * sizeof(float));
-          if (write_result_to_file) {
-            write_tensorfile<float>(real_out_t.get(), name, log_dir_);
-          }
-          return;
         }
       }
     }
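
io_copy and layout tensors can now hold either precision, so rather than mis-reading the buffer the profiler reports the -3333333 sentinel for those two ops and computes statistics as before for everything else. Plausible implementations of the two statistics helpers called above, with signatures inferred from the call sites (the real definitions live elsewhere in precision_profiler.h):

#include <cmath>
#include <cstddef>

template <typename T>
double compute_mean(const T* in, size_t length) {
  if (length == 0) return 0.0;
  double sum = 0.0;
  for (size_t i = 0; i < length; ++i) sum += in[i];
  return sum / length;
}

template <typename T>
double compute_standard_deviation(const T* in, size_t length,
                                  bool has_mean = false, double mean = 0.0) {
  if (length == 0) return 0.0;
  if (!has_mean) mean = compute_mean<T>(in, length);
  double variance = 0.0;
  for (size_t i = 0; i < length; ++i) {
    variance += (in[i] - mean) * (in[i] - mean);  // population variance
  }
  return std::sqrt(variance / length);
}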

lite/kernels/opencl/io_copy_buffer_compute.cc

Lines changed: 12 additions & 8 deletions
@@ -81,14 +81,16 @@ float CopyFromDeviceToDeviceSync(void* target,
 class IoCopyHostToOpenCLCompute
     : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
  public:
+  using param_t = operators::IoCopyParam;
 #ifdef LITE_WITH_PROFILE
   void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
     ch->kernel_func_name = "HostToOpenCL";
     ch->io_duration = h2d_duration_;
   }
 #endif
   void PrepareForRun() override {
-    if (fp16_support_) {
+    auto& param = Param<param_t>();
+    if (fp16_support_ && param.process_type != 2) {
       VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
       auto& context = ctx_->As<OpenCLContext>();
       context.cl_context()->AddKernel(kernel_func_name_,
@@ -104,22 +106,23 @@ class IoCopyHostToOpenCLCompute
     CHECK(param.x->target() == TARGET(kHost) ||
           param.x->target() == TARGET(kARM));

-    auto mem_size = param.x->dims().production() *
-                    PrecisionTypeLength(param.x->precision());
+    auto mem_size = param.x->memory_size();
 #ifdef LITE_WITH_LOG
     VLOG(2) << "param.x->memory_size():" << mem_size;
     VLOG(2) << "param.x->dims().size():" << param.x->dims().size();
     VLOG(2) << "param.x->dims():" << param.x->dims();
     VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
     VLOG(2) << "param.y->dims():" << param.y->dims();
 #endif
-    if (fp16_support_ && param.x->precision() == PRECISION(kFloat)) {
+    if (fp16_support_ && param.x->precision() == PRECISION(kFloat) &&
+        param.process_type != 2) {
       std::unique_ptr<Tensor> precision_cast_t =
           std::unique_ptr<Tensor>(new Tensor);
       precision_cast_t->Resize(param.x->dims());
       auto* data_fp32 =
           precision_cast_t->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
       CHECK(param.x->raw_data());
+      mem_size = param.x->dims().production() * sizeof(float);
       h2d_duration_ =
           CopyFromHostSync(data_fp32, param.x->raw_data(), mem_size);

@@ -193,14 +196,16 @@ class IoCopyHostToOpenCLCompute
 class IoCopykOpenCLToHostCompute
     : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
  public:
+  using param_t = operators::IoCopyParam;
 #ifdef LITE_WITH_PROFILE
   void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
     ch->kernel_func_name = "OpenCLToHost";
     ch->io_duration = d2h_duration_;
   }
 #endif
   void PrepareForRun() override {
-    if (fp16_support_) {
+    auto& param = Param<param_t>();
+    if (fp16_support_ && param.process_type != 2) {
       VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
       auto& context = ctx_->As<OpenCLContext>();
       context.cl_context()->AddKernel(kernel_func_name_,
@@ -213,8 +218,7 @@ class IoCopykOpenCLToHostCompute
   void Run() override {
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kOpenCL));
-    auto mem_size = param.x->dims().production() *
-                    PrecisionTypeLength(param.x->precision());
+    auto mem_size = param.x->memory_size();
     const cl::Buffer* x_ptr;
     if (param.process_type == 1) {
       x_ptr = param.x->data<uint8_t, cl::Buffer>();
@@ -240,7 +244,7 @@ class IoCopykOpenCLToHostCompute
     VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
 #endif
     if (fp16_support_ && param.x->precision() != PRECISION(kInt64) &&
-        param.x->precision() != PRECISION(kInt32)) {
+        param.x->precision() != PRECISION(kInt32) && param.process_type != 2) {
       mem_size = param.x->dims().production() * sizeof(float);
       std::unique_ptr<Tensor> precision_cast_t =
           std::unique_ptr<Tensor>(new Tensor);
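
With process_type == 2 the io_copy kernels neither build the precision-cast CL kernel nor widen the copy: mem_size stays at param.x->memory_size() and the bytes move unchanged, because the layout kernel now owns the cast. A condensed sketch of the size selection, as a hypothetical helper rather than the kernel itself:

#include <cstddef>

// process_type == 2: the adjacent layout op already handles the fp16<->fp32
// cast, so io_copy moves the raw bytes of the tensor unchanged.
size_t IoCopyBytes(size_t numel, size_t element_bytes, bool fp16_support,
                   bool is_float_tensor, int process_type) {
  const bool cast_fused_into_layout = (process_type == 2);
  if (fp16_support && is_float_tensor && !cast_fused_into_layout) {
    return numel * sizeof(float);  // cast path: copy the fp32 staging buffer
  }
  return numel * element_bytes;    // fused path: raw memory_size() copy
}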
