
Commit d73b69b

[OPENCL] Add elem_add with y.dim.size==1 & Fix conv check. test=develop (#5604) (#5719)
* [OPENCL] Add elem_add with y.dim.size==1 & Fix conv check
1 parent 6135491 commit d73b69b

File tree

6 files changed: +121 additions, -46 deletions

lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ __kernel void elementwise_add(__read_only image2d_t input,
 __kernel void channel_add(__read_only image2d_t input,
                           __read_only image2d_t bias,
                           __write_only image2d_t outputImage,
-                          int w) {
+                          int w, int opt) {
   int x = get_global_id(0);
   int y = get_global_id(1);

@@ -51,7 +51,7 @@ __kernel void channel_add(__read_only image2d_t input,
   coords.y = y;

   int2 coords_bias;
-  coords_bias.x = x % w;
+  coords_bias.x = (opt == 1) ? 0 : x % w;
   coords_bias.y = 0;

   CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords);
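
The new opt flag only changes how channel_add locates its bias value: when opt is 1 the bias image holds a single element, so every work-item reads column 0 instead of column x % w. A minimal standalone C++ sketch of that index selection (illustrative only, not the OpenCL kernel or a Paddle-Lite API):

    #include <cstdio>

    // Mirrors the coords_bias.x computation added to channel_add above.
    int bias_column(int x, int w, int opt) {
      // opt == 1: y has a single value, so every pixel reads column 0.
      // opt == 0: original per-channel lookup.
      return (opt == 1) ? 0 : x % w;
    }

    int main() {
      std::printf("%d\n", bias_column(10, 4, 0));  // prints 2
      std::printf("%d\n", bias_column(10, 4, 1));  // prints 0
      return 0;
    }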

lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl

Lines changed: 22 additions & 0 deletions
@@ -52,6 +52,28 @@ __kernel void channel_mul(__global image2d_t input,
   WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
 }

+__kernel void channel_mul_d1(__read_only image2d_t input,
+                             __read_only image2d_t bias,
+                             __write_only image2d_t outputImage,
+                             int x_w, int opt) {
+  int x = get_global_id(0);
+  int y = get_global_id(1);
+
+  int2 coords;
+  coords.x = x;
+  coords.y = y;
+
+  int2 coords_bias;
+  coords_bias.x = (opt == 1) ? 0 : (x % x_w);
+  coords_bias.y = 0;
+
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, coords);
+  CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, coords_bias);
+  CL_DTYPE4 output = in * (CL_DTYPE4)(biase.x);
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
+}
+
 // etc : 1 1 1 72
 // run time Y [value,0,0,0] * 72
 __kernel void channel_mul_d2(__global image2d_t input,
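
channel_mul_d1 reads one bias texel and multiplies the whole input pixel by its first component, i.e. the scalar biase.x is broadcast across the four lanes of the CL_DTYPE4 vector. A rough scalar sketch of the per-pixel math in plain C++ (float4 here is a stand-in type, not a Lite or OpenCL definition):

    #include <array>
    #include <cstdio>

    using float4 = std::array<float, 4>;

    // Per-work-item computation of channel_mul_d1: out = in * (float4)(bias.x).
    float4 channel_mul_d1(const float4& in, float bias_x) {
      return {in[0] * bias_x, in[1] * bias_x, in[2] * bias_x, in[3] * bias_x};
    }

    int main() {
      float4 out = channel_mul_d1({1.f, 2.f, 3.f, 4.f}, 0.5f);
      std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 0.5 1 1.5 2
      return 0;
    }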

lite/kernels/opencl/conv_image_compute.cc

Lines changed: 10 additions & 11 deletions
@@ -93,8 +93,8 @@ void ConvImageCompute::PrepareForRun() {
   tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor);
   tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor);

-  if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) {
-    CHECK(pad_equal && stride_equal && dilation_equal);
+  if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1 && pad_equal &&
+      stride_equal && dilation_equal) {
     if (input_tensor_c_ % 4 == 0) {
       kernel_func_names_.push_back("conv2d_1x1_simple");
     } else {
@@ -117,9 +117,9 @@ void ConvImageCompute::PrepareForRun() {
 #define DEPTH_CONV_USE_SPL
 #ifdef DEPTH_CONV_USE_SPL
   } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ &&
-             filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1) {
+             filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1 &&
+             dilation_equal) {
     // depth_conv2d_3x3s1, depth_conv2d_3x3
-    CHECK(dilation_equal);
     if (stride_equal && stride_h_ == 1 && dilation_h_ == 1) {
       kernel_func_names_.push_back("depth_conv2d_3x3s1");
       impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1;
@@ -164,10 +164,9 @@ void ConvImageCompute::PrepareForRun() {

     impl_ = &ConvImageCompute::DepthwiseConv2d;
   } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3 &&
-             dilation_h_ == 1 && dilation_w_ == 1) {
+             dilation_h_ == 1 && dilation_w_ == 1 && pad_equal &&
+             stride_equal && dilation_equal) {
     // conv2d_3x3
-    pad_equal = (pad_left_ == pad_up_);
-    CHECK(pad_equal && stride_equal && dilation_equal);
     if (groups_ == 1) {
       kernel_func_names_.push_back(
           input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" : "conv2d_3x3_opt");
@@ -189,8 +188,8 @@ void ConvImageCompute::PrepareForRun() {
     converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims);
     MUTABLE_DATA_GPU(
         filter_gpu_image_, filter_image_w_, filter_image_h_, filter_image_data);
-  } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) {
-    CHECK(pad_equal && stride_equal && dilation_equal);
+  } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5 && pad_equal &&
+             stride_equal && dilation_equal) {
 #define CONV_5x5_OPT
 #ifndef CONV_5x5_OPT
     // conv2d_5x5
@@ -231,8 +230,8 @@ void ConvImageCompute::PrepareForRun() {
     impl_ = &ConvImageCompute::Conv2d5x5opt;
 #endif
 #undef CONV_5x5_OPT
-  } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7) {
-    CHECK(pad_equal && stride_equal && dilation_equal);
+  } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7 && pad_equal &&
+             stride_equal && dilation_equal) {
 #define CONV_7x7_OPT
 #ifndef CONV_7x7_OPT
     // conv2d_7x7
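
The conv change is purely about dispatch: pad/stride/dilation equality used to be asserted with CHECK after a specialized branch was already taken, which aborted on unsupported shapes; now the flags are part of the branch condition, so those shapes fall through to a more general path instead. A tiny sketch of the idea (hypothetical helper, not the actual ConvImageCompute members):

    #include <cstdio>

    // After the change, the specialized 1x1 kernel is chosen only when all
    // equality flags hold; otherwise control falls through instead of CHECK-failing.
    bool pick_conv2d_1x1(int fh, int fw, bool pad_eq, bool stride_eq,
                         bool dilation_eq) {
      return fh == 1 && fw == 1 && pad_eq && stride_eq && dilation_eq;
    }

    int main() {
      // 1x1 filter with unequal paddings: no longer a fatal CHECK, just not
      // eligible for the specialized branch.
      std::printf("%d\n", pick_conv2d_1x1(1, 1, /*pad_eq=*/false, true, true));
      return 0;
    }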

lite/kernels/opencl/elementwise_add_image_compute.cc

Lines changed: 50 additions & 28 deletions
@@ -68,7 +68,6 @@ void ElementwiseAddImageCompute::PrepareForRun() {
       auto* y_cpu_nchw =
           static_cast<float*>(const_cast<void*>(y->raw_data()));
       default_converter.NCHWToImage(y_cpu_nchw, y_cpu_image, y->dims());
-
       MUTABLE_DATA_GPU(
           y_weights_image_, y_image_dims[0], y_image_dims[1], y_cpu_image);
     }
@@ -88,6 +87,26 @@ void ElementwiseAddImageCompute::PrepareForRun() {
           static_cast<float*>(const_cast<void*>(y->raw_data()));
       folder_converter.NCHWToImage(y_cpu_nchw, y_cpu_image, y->dims());

+      MUTABLE_DATA_GPU(
+          y_weights_image_, y_image_dims[0], y_image_dims[1], y_cpu_image);
+    }
+  } else if (axis == -1 && y->dims()[0] == 1) {
+    kernel_func_name_ = "channel_add";  // for opt
+    if (y->persistable()) {
+      LOG(INFO) << "with y->persistable";
+      y_weights_image_ = std::unique_ptr<Tensor>(new Tensor);
+      std::unique_ptr<Tensor> tensor_hold_y_image_ =
+          std::unique_ptr<Tensor>(new Tensor);
+      CLImageConverterFolder folder_converter;
+      const DDim& y_image_dims =
+          folder_converter.InitImageDimInfoWith(y->dims());
+      tensor_hold_y_image_->Resize({1, y_image_dims[0], y_image_dims[1], 4});
+
+      auto* y_cpu_image = MUTABLE_DATA_CPU(tensor_hold_y_image_);
+      auto* y_cpu_nchw =
+          static_cast<float*>(const_cast<void*>(y->raw_data()));
+      folder_converter.NCHWToImage(y_cpu_nchw, y_cpu_image, y->dims());
+
       MUTABLE_DATA_GPU(
           y_weights_image_, y_image_dims[0], y_image_dims[1], y_cpu_image);
     }
@@ -154,6 +173,7 @@ void ElementwiseAddImageCompute::Run() {
   auto* y_img = GET_DATA_GPU(y);
   auto* out_img =
       MUTABLE_DATA_GPU(out, out_img_shape_[0], out_img_shape_[1], nullptr);
+  const int tensor_w = x_dims[x_dims.size() - 1];

 #ifdef LITE_WITH_LOG
   VLOG(4) << "x->target():" << TargetToStr(x->target());
@@ -169,7 +189,7 @@ void ElementwiseAddImageCompute::Run() {

   cl_int status;
   auto kernel = kernel_;
-  if (y_dims.size() == 4) {
+  if (kernel_func_name_ == "elementwise_add") {
     int output_w = y_dims[3];
     int output_h = y_dims[2];
     status = kernel.setArg(0, *x_img);
@@ -182,34 +202,35 @@ void ElementwiseAddImageCompute::Run() {
     CL_CHECK_FATAL(status);
     status = kernel.setArg(4, output_w);
     CL_CHECK_FATAL(status);
-  } else if (y_dims.size() == 1) {
-    if (axis == x_dims.size() - 1 || axis == x_dims.size() - 3) {
-      const int tensor_w = x_dims[x_dims.size() - 1];
-#ifdef LITE_WITH_LOG
-      VLOG(4) << "tensor_w:" << tensor_w;
-#endif
-      status = kernel.setArg(0, *x_img);
-      CL_CHECK_FATAL(status);
-      if (y->persistable()) {
-        auto* y_img = GET_DATA_GPU(y_weights_image_);
-        status = kernel.setArg(1, *y_img);
-      } else {
-        status = kernel.setArg(1, *y_img);
-      }
-      CL_CHECK_FATAL(status);
-      status = kernel.setArg(2, *out_img);
-      CL_CHECK_FATAL(status);
-      status = kernel.setArg(3, tensor_w);
-      CL_CHECK_FATAL(status);
-    } else {
-      LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-                 << ", x->dims().size():" << x_dims.size()
-                 << ", y->dims.size():" << y_dims.size();
+  } else if (kernel_func_name_ == "channel_add") {
+    if (y->persistable()) {
+      y_img = GET_DATA_GPU(y_weights_image_);
     }
+    const int opt = y_dims[0] == 1;
+    status = kernel.setArg(0, *x_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(1, *y_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(2, *out_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(3, tensor_w);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(4, opt);
+    CL_CHECK_FATAL(status);
+  } else if (kernel_func_name_ == "width_add") {
+    if (y->persistable()) {
+      y_img = GET_DATA_GPU(y_weights_image_);
+    }
+    status = kernel.setArg(0, *x_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(1, *y_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(2, *out_img);
+    CL_CHECK_FATAL(status);
+    status = kernel.setArg(3, tensor_w);
+    CL_CHECK_FATAL(status);
   } else {
-    LOG(FATAL) << "ElementwiseAddImage doesn't support axis:" << axis
-               << ", x->dims().size():" << x_dims.size()
-               << ", y->dims.size():" << y_dims.size();
+    LOG(FATAL) << "Unsupported kernel: " << kernel_func_name_;
   }

   auto& context = ctx_->As<OpenCLContext>();
@@ -257,6 +278,7 @@ REGISTER_LITE_KERNEL(elementwise_add,
                      PRECISION(kFP16),
                      DATALAYOUT(kImageDefault))})
     .Finalize();
+
 REGISTER_LITE_KERNEL(fusion_elementwise_add_activation,
                      kOpenCL,
                      kFP16,
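
Run() now branches on the kernel_func_name_ chosen once in PrepareForRun() instead of re-deriving the case from y_dims and axis, and the channel_add path passes an extra opt argument that is 1 when y has a single element. A minimal sketch of that argument selection, assuming hypothetical helper names (the real code sets cl::Kernel arguments via setArg):

    #include <cstdint>
    #include <stdexcept>
    #include <string>
    #include <vector>

    struct Args {
      int tensor_w;
      int opt;
    };

    // Illustrative stand-in for the new dispatch in Run().
    Args build_args(const std::string& kernel_func_name,
                    const std::vector<int64_t>& x_dims,
                    const std::vector<int64_t>& y_dims) {
      const int tensor_w = static_cast<int>(x_dims.back());
      if (kernel_func_name == "channel_add") {
        // opt == 1 tells the kernel that y holds one value to broadcast.
        return {tensor_w, y_dims[0] == 1 ? 1 : 0};
      }
      if (kernel_func_name == "width_add") {
        return {tensor_w, 0};  // width_add keeps its original signature
      }
      throw std::runtime_error("Unsupported kernel: " + kernel_func_name);
    }

    int main() {
      Args a = build_args("channel_add", {1, 3, 4, 8}, {1});
      return (a.opt == 1 && a.tensor_w == 8) ? 0 : 1;
    }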

lite/kernels/opencl/elementwise_mul_image_compute.cc

Lines changed: 27 additions & 4 deletions
@@ -57,6 +57,22 @@ class ElementwiseMulImageCompute
     const int bias_dim_size = bias_dims.size();
     if (bias_dim_size == 1) {
       kernel_func_name_ = "channel_mul_d1";
+      if (y->persistable()) {
+        CLImageConverterFolder folder_converter;
+        const DDim& y_image_dims =
+            folder_converter.InitImageDimInfoWith(bias_dims);
+        auto y_image_cpu_t = std::unique_ptr<Tensor>(new Tensor);
+        y_image_cpu_t->Resize({1, y_image_dims[0], y_image_dims[1], 4});
+        auto* y_image_cpu_p = MUTABLE_DATA_CPU(y_image_cpu_t);
+        auto* y_nchw_cpu_p =
+            static_cast<float*>(const_cast<void*>(y->raw_data()));
+        folder_converter.NCHWToImage(y_nchw_cpu_p, y_image_cpu_p, bias_dims);
+        y_image_gpu_t_persist_ = std::unique_ptr<Tensor>(new Tensor);
+        MUTABLE_DATA_GPU(y_image_gpu_t_persist_,
+                         y_image_dims[0],
+                         y_image_dims[1],
+                         y_image_cpu_p);
+      }
     } else if (bias_dim_size == 2) {
       kernel_func_name_ = "channel_mul_d2";
     } else if (bias_dim_size == 3) {
@@ -106,6 +122,8 @@ class ElementwiseMulImageCompute
     auto out_img_shape =
         default_convertor.InitImageDimInfoWith(out->dims());  // w, h
     auto y_img_shape = default_convertor.InitImageDimInfoWith(y->dims());
+    auto bias_dims = y->dims();
+    auto x_dims = x->dims();

     auto* x_img = GET_DATA_GPU(x);
     auto* y_img = GET_DATA_GPU(y);
@@ -123,9 +141,6 @@ class ElementwiseMulImageCompute
     kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
     auto kernel = context.cl_context()->GetKernel(kernel_key.str());

-    auto bias_dims = y->dims();
-    auto x_dims = x->dims();
-
     if (bias_dims == x_dims) {
       // kernel_func_name_ = "elementwise_mul";
       cl_int status = kernel.setArg(0, *x_img);
@@ -139,6 +154,10 @@ class ElementwiseMulImageCompute
       if (bias_dim_size == 1) {
         // kernel_func_name_ = "channel_mul_d1";
         const int tensor_w = x_dims[x_dims.size() - 1];
+        const int opt = bias_dims[0] == 1;
+        if (y->persistable()) {
+          y_img = DATA_GPU(y_image_gpu_t_persist_);
+        }
         cl_int status = kernel.setArg(0, *x_img);
         CL_CHECK_FATAL(status);
         status = kernel.setArg(1, *y_img);
@@ -147,6 +166,8 @@ class ElementwiseMulImageCompute
         CL_CHECK_FATAL(status);
         status = kernel.setArg(3, tensor_w);
         CL_CHECK_FATAL(status);
+        status = kernel.setArg(4, opt);
+        CL_CHECK_FATAL(status);
       } else if (bias_dim_size == 2) {
         // kernel_func_name_ = "channel_mul_d2";
         const int tensor_w = x_dims[x_dims.size() - 1];
@@ -189,7 +210,6 @@ class ElementwiseMulImageCompute
     auto global_work_size =
         cl::NDRange{static_cast<cl::size_type>(x_img_width),
                     static_cast<cl::size_type>(x_img_height)};
-
     auto status = EnqueueNDRangeKernel(context,
                                        kernel,
                                        cl::NullRange,
@@ -208,6 +228,9 @@ class ElementwiseMulImageCompute
   std::string kernel_func_name_{"elementwise_mul"};
   std::string build_options_{""};
   std::string time_stamp_{GetTimeStamp()};
+
+  // y is persistable
+  std::unique_ptr<Tensor> y_image_gpu_t_persist_{nullptr};
 };

 }  // namespace opencl
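
For a persistable y, the mul kernel now converts the NCHW weights to image layout once in PrepareForRun() and caches the GPU image in y_image_gpu_t_persist_, which Run() then prefers over the per-invocation image. A minimal sketch of that convert-once/reuse pattern, with stand-in Tensor and Image types rather than the Lite classes:

    #include <memory>

    struct Image {};
    struct Tensor { bool persistable = false; };

    struct ElementwiseMulLike {
      std::unique_ptr<Image> y_image_gpu_t_persist_;

      void PrepareForRun(const Tensor& y) {
        if (y.persistable) {
          // Convert y to the image layout a single time and keep the GPU image
          // alive for the lifetime of the kernel object.
          y_image_gpu_t_persist_ = std::make_unique<Image>();
        }
      }

      const Image* PickYImage(const Tensor& y, const Image* runtime_img) const {
        // Run(): prefer the cached image when y is persistable.
        return y.persistable ? y_image_gpu_t_persist_.get() : runtime_img;
      }
    };

    int main() {
      ElementwiseMulLike k;
      Tensor y;
      y.persistable = true;
      Image runtime_img;
      k.PrepareForRun(y);
      return k.PickYImage(y, &runtime_img) == k.y_image_gpu_t_persist_.get() ? 0 : 1;
    }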

lite/kernels/opencl/transpose_image_compute.cc

Lines changed: 10 additions & 1 deletion
@@ -92,11 +92,20 @@ class TransposeComputeFloatImage
     output_image_w_ = output_image_shape.at("width");

     if (output_tensor_dims_.size() == 4) {
-      kernel_func_name_ = "transpose_4d";
+      std::set<std::vector<int>> unsupported_cases{
+          std::vector<int>({0, 3, 1, 2})};
+      if (unsupported_cases.find(axis_) == unsupported_cases.end()) {
+        kernel_func_name_ = "transpose_4d";
+      } else {
+        kernel_func_name_ = "transpose_general_buffer";
+      }
     } else if (output_tensor_dims_.size() == 2) {
       kernel_func_name_ = "transpose_2d";
     } else {
       kernel_func_name_ = "transpose_general_buffer";
+    }
+
+    if (kernel_func_name_ == "transpose_general_buffer") {
       build_options_ = "-DCL_DTYPE_float";
       // create kernels of im2buf and buf2im
       auto im2buf_kernels = KernelRegistry::Global().Create(
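
The 4-D transpose path now checks the requested permutation against a small set of cases the image kernel does not handle (currently {0, 3, 1, 2}) and routes those to transpose_general_buffer; the buffer-kernel setup afterwards keys off the chosen name. A standalone sketch of that selection logic (function name is illustrative, not the Lite API):

    #include <set>
    #include <string>
    #include <vector>

    // Mirrors the kernel-name selection added above.
    std::string pick_transpose_kernel(const std::vector<int>& axis, size_t rank) {
      if (rank == 4) {
        static const std::set<std::vector<int>> unsupported_cases{{0, 3, 1, 2}};
        return unsupported_cases.count(axis) ? "transpose_general_buffer"
                                             : "transpose_4d";
      }
      if (rank == 2) return "transpose_2d";
      return "transpose_general_buffer";
    }

    int main() {
      // (0, 3, 1, 2) now falls back to the general buffer kernel.
      return pick_transpose_kernel({0, 3, 1, 2}, 4) == "transpose_general_buffer"
                 ? 0
                 : 1;
    }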
