
Commit 1eea6b2

[cherry pick][OpenCL] Fix poor performance of iocopy and layout (#9665)
* fuse precision cast to layout cast test=develop
1 parent 4deb68e commit 1eea6b2
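
The fusion works by parameterizing the buffer side of the OpenCL layout kernels with a MUTABLE_TYPE build macro: when an adjacent io_copy would otherwise need its own fp32<->fp16 cast kernel, the pair is tagged with process_type = 2 and the layout kernel performs the cast while it copies. A minimal host-side sketch of the type selection, assuming a hypothetical MutableTypeOption helper (the MUTABLE_TYPE macro name is from this commit; the helper and its selection logic are illustrative, not PaddleLite API):

#include <string>

// process_type == 2 marks a layout/io_copy pair where the layout kernel has
// absorbed the fp16<->fp32 cast, so the buffer side of the kernel must keep
// the tensor's original (float) element type.
std::string MutableTypeOption(bool fp16_support, int process_type) {
  if (fp16_support && process_type != 2) {
    return " -DMUTABLE_TYPE=half";  // buffer already holds fp16 data
  }
  return " -DMUTABLE_TYPE=float";   // fused path: cast happens in-kernel
}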

File tree

6 files changed: +162 −67 lines

lite/backends/opencl/cl_kernel/image/layout_kernel.cl

Lines changed: 16 additions & 16 deletions
@@ -18,7 +18,7 @@ limitations under the License. */
 ////////////////////////////////////////////////////////
 // buffer -> image2d
 ////////////////////////////////////////////////////////
-__kernel void buffer_to_image2d(__global CL_DTYPE* in,
+__kernel void buffer_to_image2d(__global MUTABLE_TYPE* in,
                                 __write_only image2d_t output_image,
                                 __private const int out_H,
                                 __private const int out_W,
@@ -96,7 +96,7 @@ __kernel void buffer_to_image2d(__global CL_DTYPE* in,
 __kernel void image2d_to_buffer(__read_only image2d_t input,
                                 __private const int in_width,
                                 __private const int in_height,
-                                __global CL_DTYPE* out,
+                                __global MUTABLE_TYPE* out,
                                 __private const int size_ch,
                                 __private const int size_block,
                                 __private const int size_batch,
@@ -129,15 +129,15 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,

   const int index =
       in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
-  out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
+  out[index] = CONVERT_TYPE_TO(in.x, MUTABLE_TYPE);
   if (C - 4 * in_c >= 2) {
-    out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
+    out[index + size_ch] = CONVERT_TYPE_TO(in.y, MUTABLE_TYPE);
   }
   if (C - 4 * in_c >= 3) {
-    out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
+    out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, MUTABLE_TYPE);
   }
   if (C - 4 * in_c >= 4) {
-    out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
+    out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, MUTABLE_TYPE);
   }
 }

@@ -386,7 +386,7 @@ __kernel void image2d_folder_to_image2d_default(__read_only image2d_t input,
 // image2d_folder -> buffer
 ////////////////////////////////////////////////////////
 __kernel void image2d_folder_to_buffer(__read_only image2d_t input,
-                                       __global CL_DTYPE* output,
+                                       __global MUTABLE_TYPE* output,
                                        __private const int out_h,
                                        __private const int out_w) {
   const int pos_x = get_global_id(0);
@@ -398,15 +398,15 @@ __kernel void image2d_folder_to_buffer(__read_only image2d_t input,
   CL_DTYPE4 out = in;
   int outpos_base = out_w * pos_y + pos_x * 4;

-  output[outpos_base] = out.x;
+  output[outpos_base] = CONVERT_TYPE_TO(out.x, MUTABLE_TYPE);
   if (pos_x * 4 + 1 < out_w) {
-    output[outpos_base + 1] = out.y;
+    output[outpos_base + 1] = CONVERT_TYPE_TO(out.y, MUTABLE_TYPE);
   }
   if (pos_x * 4 + 2 < out_w) {
-    output[outpos_base + 2] = out.z;
+    output[outpos_base + 2] = CONVERT_TYPE_TO(out.z, MUTABLE_TYPE);
   }
   if (pos_x * 4 + 3 < out_w) {
-    output[outpos_base + 3] = out.w;
+    output[outpos_base + 3] = CONVERT_TYPE_TO(out.w, MUTABLE_TYPE);
   }
 }

@@ -441,7 +441,7 @@ __kernel void image2d_folder_to_buffer_half2float(__read_only image2d_t input,
 ////////////////////////////////////////////////////////
 // buffer -> image2d_folder
 ////////////////////////////////////////////////////////
-__kernel void buffer_to_image2d_folder(__global const CL_DTYPE* input,
+__kernel void buffer_to_image2d_folder(__global const MUTABLE_TYPE* input,
                                        __write_only image2d_t output,
                                        __private const int out_h,
                                        __private const int out_w,
@@ -452,15 +452,15 @@ __kernel void buffer_to_image2d_folder(__global const CL_DTYPE* input,
   int inpos_base = out_w * pos_y + pos_x * 4;

   CL_COMPUTE_DTYPE4 out = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f);
-  out.x = input[inpos_base];
+  out.x = CONVERT_TYPE_TO(input[inpos_base], CL_COMPUTE_DTYPE);
   if (inpos_base + 1 < length) {
-    out.y = input[inpos_base + 1];
+    out.y = CONVERT_TYPE_TO(input[inpos_base + 1], CL_COMPUTE_DTYPE);
   }
   if (inpos_base + 2 < length) {
-    out.z = input[inpos_base + 2];
+    out.z = CONVERT_TYPE_TO(input[inpos_base + 2], CL_COMPUTE_DTYPE);
   }
   if (inpos_base + 3 < length) {
-    out.w = input[inpos_base + 3];
+    out.w = CONVERT_TYPE_TO(input[inpos_base + 3], CL_COMPUTE_DTYPE);
   }

   WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x, pos_y), out);
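
Each store now goes through CONVERT_TYPE_TO, so the layout change and the precision cast happen in a single pass over the data instead of two kernel launches. An illustrative C++ analogue of that fusion (hypothetical names, not the kernel itself):

#include <cstddef>

// One loop that converts while it copies; the pre-patch path launched a
// separate cast kernel before the layout kernel, touching the data twice.
template <typename SrcT, typename DstT>
void copy_with_cast(const SrcT* src, DstT* dst, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    dst[i] = static_cast<DstT>(src[i]);  // cast fused into the copy loop
  }
}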

lite/core/optimizer/mir/opencl_memory_object_config_pass.cc

Lines changed: 30 additions & 2 deletions
@@ -301,9 +301,37 @@ void OpenCLMemoryObjectConfigPass::CorrectArgumentPlace(SSAGraph* graph) {
       }
     }

-    // 7. reshape change target
-    if (op_type == "reshape" || op_type == "reshape2")
+    // 7. reshape transpose change target
+    if ((op_type == "reshape" || op_type == "reshape2") &&
+        input_shape_default_) {
       change_image2d_to_buffer = true;
+    }
+
+    bool transpose_buffer =
+        false;  // TODO(@sprouteer) transpose buffer poor performance
+    if ((op_type == "transpose" || op_type == "transpose2") &&
+        transpose_buffer) {
+      for (std::list<Node*>::iterator i = x->inlinks.begin();
+           i != x->inlinks.end();
+           ++i) {
+        std::string in_name =
+            get_argname((*i)->AsArg().name, inst.op_info()->inputs());
+        if (in_name == "X" && (*i)->inlinks.front()->IsStmt() &&
+            (*i)->inlinks.front()->AsStmt().op_type() == "reshape2") {
+          change_image2d_to_buffer = true;
+        }
+      }
+      for (std::list<Node*>::iterator i = x->outlinks.begin();
+           i != x->outlinks.end();
+           ++i) {
+        std::string out_name =
+            get_argname((*i)->AsArg().name, inst.op_info()->outputs());
+        if (out_name == "Out" && (*i)->outlinks.front()->IsStmt() &&
+            (*i)->outlinks.front()->AsStmt().op_type() == "reshape2") {
+          change_image2d_to_buffer = true;
+        }
+      }
+    }
   }

   if (change_image2d_to_cpu) {
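
The reshape retargeting is now gated on input_shape_default_, and a disabled transpose path (transpose_buffer = false, per the TODO) shows the intended follow-up: retarget transpose/transpose2 to buffer only when it sits next to a reshape2. A toy sketch of that adjacency test, collapsing the real pass's Stmt->Arg->Stmt hops into direct links (Node here is a stand-in, not the SSAGraph type):

#include <list>
#include <string>

struct Node {
  std::string op_type;
  std::list<Node*> inlinks;
  std::list<Node*> outlinks;
};

// Flag a node when a reshape2 appears among its producers or consumers.
bool AdjacentToReshape2(const Node& n) {
  for (Node* in : n.inlinks) {
    if (in->op_type == "reshape2") return true;
  }
  for (Node* out : n.outlinks) {
    if (out->op_type == "reshape2") return true;
  }
  return false;
}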

lite/core/optimizer/mir/type_layout_cast_pass.cc

Lines changed: 15 additions & 0 deletions
@@ -186,6 +186,21 @@ void TypeLayoutTransformPass::AddLayoutInst(
   op_desc.SetType(layout_type);
   op_desc.SetInput("Input", {in->AsArg().name});
   op_desc.SetOutput("Out", {layout_output_name});
+  if (inst_node->AsStmt().place().target == TARGET(kOpenCL)) {
+    if (inst_node->AsStmt().op_type() == "io_copy" ||
+        (inst_node->inlinks.size() >= 1 && in->inlinks.size() >= 1 &&
+         in->inlinks.front()->IsStmt() &&
+         in->inlinks.front()->AsStmt().op_type() == "io_copy")) {
+      op_desc.SetAttr("process_type", 2);
+      if (inst_node->AsStmt().op_type() == "io_copy") {
+        auto inst_op = inst_node->AsStmt().mutable_op_info();
+        inst_op->SetAttr("process_type", 2);
+      } else {
+        auto inst_op = in->inlinks.front()->AsStmt().mutable_op_info();
+        inst_op->SetAttr("process_type", 2);
+      }
+    }
+  }

   layout_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
   auto kernels = layout_op->CreateKernels(valid_places);
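
When the new layout instruction is created next to an io_copy on OpenCL, both ops are tagged with process_type = 2, which the io_copy kernels below read to skip their own precision cast. A minimal sketch of the tagging contract, assuming a stand-in attribute map rather than the real cpp::OpDesc:

#include <map>
#include <string>

using OpAttrs = std::map<std::string, int>;

// Mark both sides of an adjacent io_copy/layout pair so the runtime knows
// the layout kernel owns the fp16<->fp32 cast.
void FusePrecisionCastIntoLayout(OpAttrs& layout_op, OpAttrs& io_copy_op) {
  layout_op["process_type"] = 2;   // layout kernel performs the cast
  io_copy_op["process_type"] = 2;  // io_copy becomes a raw byte copy
}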

lite/core/profile/precision_profiler.h

Lines changed: 44 additions & 34 deletions
@@ -467,42 +467,52 @@ class PrecisionProfiler {
           return;
         }
         default: {
-          auto* in_data_v =
-              use_fp16
-                  ? static_cast<void*>(
-                        calloc(in->numel(), sizeof(uint16_t)))
-                  : static_cast<void*>(calloc(in->numel(), sizeof(float)));
-          std::vector<float> real_out_v(in->numel());
-          TargetWrapperCL::MemcpySync(
-              in_data_v,
-              use_fp16 ? in->data<half_t, cl::Buffer>()
-                       : in->data<float, cl::Buffer>(),
-              in->numel() * (use_fp16 ? sizeof(uint16_t) : sizeof(float)),
-              IoDirection::DtoH);
-          VLOG(1) << name << ":" << in->numel();
-          if (use_fp16) {
-            HalfArray2FloatArray(static_cast<half_t*>(in_data_v),
-                                 real_out_v.data(),
-                                 in->numel());
+          // TODO(sprouteer) mutable precision
+          if (op_name == "io_copy" || op_name == "layout") {
+            *mean = -3333333;
+            *std_dev = -3333333;
+            *ave_grow_rate = -3333333;
+            LOG(INFO) << op_name + " has wrong mean, std_dev, ave_grow_rate";
+            return;
           } else {
-            memcpy(
-                real_out_v.data(), in_data_v, in->numel() * sizeof(float));
+            auto* in_data_v = use_fp16 ? static_cast<void*>(calloc(
+                                             in->numel(), sizeof(uint16_t)))
+                                       : static_cast<void*>(calloc(
+                                             in->numel(), sizeof(float)));
+            std::vector<float> real_out_v(in->numel());
+            TargetWrapperCL::MemcpySync(
+                in_data_v,
+                use_fp16 ? in->data<half_t, cl::Buffer>()
+                         : in->data<float, cl::Buffer>(),
+                in->numel() * (use_fp16 ? sizeof(uint16_t) : sizeof(float)),
+                IoDirection::DtoH);
+            VLOG(1) << name << ":" << in->numel();
+            if (use_fp16) {
+              HalfArray2FloatArray(static_cast<half_t*>(in_data_v),
+                                   real_out_v.data(),
+                                   in->numel());
+            } else {
+              memcpy(real_out_v.data(),
+                     in_data_v,
+                     in->numel() * sizeof(float));
+            }
+            *mean =
+                compute_mean<float>(real_out_v.data(), real_out_v.size());
+            *std_dev = compute_standard_deviation<float>(
+                real_out_v.data(), in->numel(), true, *mean);
+            *ave_grow_rate = compute_average_grow_rate<float>(
+                real_out_v.data(), real_out_v.size());
+            std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
+            real_out_t->Resize(in->dims());
+            float* real_out_data = real_out_t->mutable_data<float>();
+            memcpy(real_out_data,
+                   real_out_v.data(),
+                   real_out_v.size() * sizeof(float));
+            if (write_result_to_file) {
+              write_tensorfile<float>(real_out_t.get(), name, log_dir_);
+            }
+            return;
           }
-          *mean = compute_mean<float>(real_out_v.data(), real_out_v.size());
-          *std_dev = compute_standard_deviation<float>(
-              real_out_v.data(), in->numel(), true, *mean);
-          *ave_grow_rate = compute_average_grow_rate<float>(
-              real_out_v.data(), real_out_v.size());
-          std::shared_ptr<lite::Tensor> real_out_t(new lite::Tensor);
-          real_out_t->Resize(in->dims());
-          float* real_out_data = real_out_t->mutable_data<float>();
-          memcpy(real_out_data,
-                 real_out_v.data(),
-                 real_out_v.size() * sizeof(float));
-          if (write_result_to_file) {
-            write_tensorfile<float>(real_out_t.get(), name, log_dir_);
-          }
-          return;
         }
       }
     }
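
io_copy and layout tensors can now hold either precision, so rather than mis-reading the buffer the profiler reports the -3333333 sentinel for those two ops and computes statistics as before for everything else. Plausible implementations of the two statistics helpers called above, with signatures inferred from the call sites (the real definitions live elsewhere in precision_profiler.h):

#include <cmath>
#include <cstddef>

template <typename T>
double compute_mean(const T* in, size_t length) {
  if (length == 0) return 0.0;
  double sum = 0.0;
  for (size_t i = 0; i < length; ++i) sum += in[i];
  return sum / length;
}

template <typename T>
double compute_standard_deviation(const T* in, size_t length,
                                  bool has_mean = false, double mean = 0.0) {
  if (length == 0) return 0.0;
  if (!has_mean) mean = compute_mean<T>(in, length);
  double variance = 0.0;
  for (size_t i = 0; i < length; ++i) {
    variance += (in[i] - mean) * (in[i] - mean);  // population variance
  }
  return std::sqrt(variance / length);
}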

lite/kernels/opencl/io_copy_buffer_compute.cc

Lines changed: 12 additions & 8 deletions
@@ -81,14 +81,16 @@ float CopyFromDeviceToDeviceSync(void* target,
 class IoCopyHostToOpenCLCompute
     : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
  public:
+  using param_t = operators::IoCopyParam;
 #ifdef LITE_WITH_PROFILE
   void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
     ch->kernel_func_name = "HostToOpenCL";
     ch->io_duration = h2d_duration_;
   }
 #endif
   void PrepareForRun() override {
-    if (fp16_support_) {
+    auto& param = Param<param_t>();
+    if (fp16_support_ && param.process_type != 2) {
       VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
       auto& context = ctx_->As<OpenCLContext>();
       context.cl_context()->AddKernel(kernel_func_name_,
@@ -104,22 +106,23 @@ class IoCopyHostToOpenCLCompute
     CHECK(param.x->target() == TARGET(kHost) ||
           param.x->target() == TARGET(kARM));

-    auto mem_size = param.x->dims().production() *
-                    PrecisionTypeLength(param.x->precision());
+    auto mem_size = param.x->memory_size();
 #ifdef LITE_WITH_LOG
     VLOG(2) << "param.x->memory_size():" << mem_size;
     VLOG(2) << "param.x->dims().size():" << param.x->dims().size();
     VLOG(2) << "param.x->dims():" << param.x->dims();
     VLOG(2) << "param.y->dims().size():" << param.y->dims().size();
     VLOG(2) << "param.y->dims():" << param.y->dims();
 #endif
-    if (fp16_support_ && param.x->precision() == PRECISION(kFloat)) {
+    if (fp16_support_ && param.x->precision() == PRECISION(kFloat) &&
+        param.process_type != 2) {
       std::unique_ptr<Tensor> precision_cast_t =
           std::unique_ptr<Tensor>(new Tensor);
       precision_cast_t->Resize(param.x->dims());
       auto* data_fp32 =
           precision_cast_t->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
       CHECK(param.x->raw_data());
+      mem_size = param.x->dims().production() * sizeof(float);
       h2d_duration_ =
           CopyFromHostSync(data_fp32, param.x->raw_data(), mem_size);

@@ -193,14 +196,16 @@ class IoCopyHostToOpenCLCompute
 class IoCopykOpenCLToHostCompute
     : public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
  public:
+  using param_t = operators::IoCopyParam;
 #ifdef LITE_WITH_PROFILE
   void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
     ch->kernel_func_name = "OpenCLToHost";
     ch->io_duration = d2h_duration_;
   }
 #endif
   void PrepareForRun() override {
-    if (fp16_support_) {
+    auto& param = Param<param_t>();
+    if (fp16_support_ && param.process_type != 2) {
       VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
       auto& context = ctx_->As<OpenCLContext>();
       context.cl_context()->AddKernel(kernel_func_name_,
@@ -213,8 +218,7 @@ class IoCopykOpenCLToHostCompute
   void Run() override {
     auto& param = Param<operators::IoCopyParam>();
     CHECK(param.x->target() == TARGET(kOpenCL));
-    auto mem_size = param.x->dims().production() *
-                    PrecisionTypeLength(param.x->precision());
+    auto mem_size = param.x->memory_size();
     const cl::Buffer* x_ptr;
     if (param.process_type == 1) {
       x_ptr = param.x->data<uint8_t, cl::Buffer>();
@@ -240,7 +244,7 @@ class IoCopykOpenCLToHostCompute
     VLOG(4) << "--- Find the sync event for the target cl tensor. ---";
 #endif
     if (fp16_support_ && param.x->precision() != PRECISION(kInt64) &&
-        param.x->precision() != PRECISION(kInt32)) {
+        param.x->precision() != PRECISION(kInt32) && param.process_type != 2) {
       mem_size = param.x->dims().production() * sizeof(float);
       std::unique_ptr<Tensor> precision_cast_t =
           std::unique_ptr<Tensor>(new Tensor);
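
With process_type == 2 the io_copy kernels neither build the precision-cast CL kernel nor widen the copy: mem_size stays at param.x->memory_size() and the bytes move unchanged, because the layout kernel now owns the cast. A condensed sketch of the size selection, as a hypothetical helper rather than the kernel itself:

#include <cstddef>

// process_type == 2: the adjacent layout op already handles the fp16<->fp32
// cast, so io_copy moves the raw bytes of the tensor unchanged.
size_t IoCopyBytes(size_t numel, size_t element_bytes, bool fp16_support,
                   bool is_float_tensor, int process_type) {
  const bool cast_fused_into_layout = (process_type == 2);
  if (fp16_support && is_float_tensor && !cast_fused_into_layout) {
    return numel * sizeof(float);  // cast path: copy the fp32 staging buffer
  }
  return numel * element_bytes;    // fused path: raw memory_size() copy
}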
