Skip to content

Commit f05b391

Browse files
authored
[OpenCL]Add NCHW_to_ImageFolder layout, fix layout imageFolder to BufferChw 3,4 dims bug (#7685)
1 parent 8b6d3ae commit f05b391

File tree

3 files changed

+373
-14
lines changed

3 files changed

+373
-14
lines changed

lite/backends/opencl/cl_kernel/image/layout_kernel.cl

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -413,4 +413,32 @@ __kernel void image2d_folder_to_buffer(__read_only image2d_t input,
413413
if (outpos_base + 3 < length) {
414414
output[outpos_base + 3] = out.w;
415415
}
416-
}
416+
}
417+
418+
////////////////////////////////////////////////////////
419+
// buffer -> image2d_folder
420+
////////////////////////////////////////////////////////
421+
__kernel void buffer_to_image2d_folder(__global const CL_DTYPE* input,
422+
__write_only image2d_t output,
423+
__private const int out_h,
424+
__private const int out_w,
425+
__private const int length) {
426+
const int pos_x = get_global_id(0);
427+
const int pos_y = get_global_id(1);
428+
429+
int inpos_base = out_w * pos_y + pos_x * 4;
430+
431+
CL_COMPUTE_DTYPE4 out = (CL_COMPUTE_DTYPE4)(0.f, 0.f, 0.f, 0.f);
432+
out.x = input[inpos_base];
433+
if (inpos_base + 1 < length) {
434+
out.y = input[inpos_base + 1];
435+
}
436+
if (inpos_base + 2 < length) {
437+
out.z = input[inpos_base + 2];
438+
}
439+
if (inpos_base + 3 < length) {
440+
out.w = input[inpos_base + 3];
441+
}
442+
443+
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_x, pos_y), out);
444+
}

lite/kernels/opencl/layout_image_compute.cc

Lines changed: 209 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -617,6 +617,10 @@ class LayoutComputeImageFolderToBufferChw
617617

618618
void PrepareForRun() override {
619619
auto& param = Param<param_t>();
620+
auto x_dims = param.x->dims();
621+
if (x_dims.size() > 2) {
622+
kernel_func_name_ = "image2d_to_buffer";
623+
}
620624
if (!fp16_support_) {
621625
build_options_ += " -DCL_DTYPE_FLOAT_FORCE";
622626
}
@@ -641,13 +645,25 @@ class LayoutComputeImageFolderToBufferChw
641645
auto x_dims = param.x->dims();
642646
auto y_dims = param.y->dims();
643647

644-
CLImageConverterFolder folder_converter;
645-
auto x_image_shape = folder_converter.InitImageDimInfoWith(x_dims);
648+
DDim x_image_shape;
649+
if (x_dims.size() > 2) {
650+
CLImageConverterFolder folder_converter;
651+
x_image_shape = folder_converter.InitImageDimInfoWith(x_dims);
652+
} else {
653+
CLImageConverterDefault default_converter;
654+
x_image_shape = default_converter.InitImageDimInfoWith(x_dims);
655+
}
646656

647657
const cl::Buffer* y_data =
648658
param.y->mutable_data<float, cl::Buffer>(TARGET(kOpenCL));
649659
auto* x_data = GET_DATA_GPU(param.x);
650660

661+
// out info
662+
std::vector<size_t> new_dims = {1, 1, 1, 1};
663+
for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
664+
new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
665+
}
666+
651667
#ifdef LITE_WITH_LOG
652668
VLOG(2) << "x_dims:" << x_dims;
653669
VLOG(2) << "y_dims:" << y_dims;
@@ -661,20 +677,53 @@ class LayoutComputeImageFolderToBufferChw
661677
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
662678
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
663679

680+
cl::NDRange global_work_size;
664681
int arg_idx = 0;
665682
cl_int status;
666-
status = kernel.setArg(arg_idx, *x_data);
667-
CL_CHECK_FATAL(status);
668-
status = kernel.setArg(++arg_idx, *y_data);
669-
CL_CHECK_FATAL(status);
670-
status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[0]));
671-
CL_CHECK_FATAL(status);
672-
status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[1]));
673-
CL_CHECK_FATAL(status);
683+
if (x_dims.size() <= 2) {
684+
status = kernel.setArg(arg_idx, *x_data);
685+
CL_CHECK_FATAL(status);
686+
status = kernel.setArg(++arg_idx, *y_data);
687+
CL_CHECK_FATAL(status);
688+
status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[0]));
689+
CL_CHECK_FATAL(status);
690+
status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[1]));
691+
CL_CHECK_FATAL(status);
692+
693+
global_work_size =
694+
cl::NDRange{static_cast<cl::size_type>(x_image_shape[0]),
695+
static_cast<cl::size_type>(x_image_shape[1])};
696+
} else {
697+
size_t C = new_dims[1];
698+
size_t in_height = new_dims[2];
699+
size_t in_width = new_dims[3];
700+
int size_ch = in_height * in_width;
701+
int size_block = size_ch * 4;
702+
int size_batch = size_ch * C;
703+
704+
status = kernel.setArg(arg_idx, *x_data);
705+
CL_CHECK_FATAL(status);
706+
status = kernel.setArg(++arg_idx, static_cast<const int>(in_width));
707+
CL_CHECK_FATAL(status);
708+
status = kernel.setArg(++arg_idx, static_cast<const int>(in_height));
709+
CL_CHECK_FATAL(status);
710+
status = kernel.setArg(++arg_idx, *y_data);
711+
CL_CHECK_FATAL(status);
712+
status = kernel.setArg(++arg_idx, static_cast<const int>(size_ch));
713+
CL_CHECK_FATAL(status);
714+
status = kernel.setArg(++arg_idx, static_cast<const int>(size_block));
715+
CL_CHECK_FATAL(status);
716+
status = kernel.setArg(++arg_idx, static_cast<const int>(size_batch));
717+
CL_CHECK_FATAL(status);
718+
status = kernel.setArg(++arg_idx, static_cast<const int>(C));
719+
CL_CHECK_FATAL(status);
720+
721+
global_work_size =
722+
cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
723+
static_cast<cl::size_type>(new_dims[3]),
724+
static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
725+
}
674726

675-
auto global_work_size =
676-
cl::NDRange{static_cast<cl::size_type>(x_image_shape[0]),
677-
static_cast<cl::size_type>(x_image_shape[1])};
678727
#ifdef LITE_WITH_LOG
679728
for (auto i = 0; i < global_work_size.dimensions(); i++) {
680729
VLOG(2) << "global_work_size[" << i << "]: " << global_work_size[i];
@@ -702,6 +751,135 @@ class LayoutComputeImageFolderToBufferChw
702751
std::string build_options_{"-DCL_DTYPE_float "};
703752
};
704753

754+
// [NCHW] -> [ImageFolder]
755+
class LayoutComputeBufferChwToImageFolder
756+
: public KernelLite<TARGET(kOpenCL),
757+
PRECISION(kAny),
758+
DATALAYOUT(kImageFolder)> {
759+
public:
760+
using param_t = operators::LayoutParam;
761+
762+
void PrepareForRun() override {
763+
auto& param = Param<param_t>();
764+
auto x_dims = param.x->dims();
765+
if (x_dims.size() > 2) {
766+
kernel_func_name_ = "buffer_to_image2d";
767+
}
768+
if (!fp16_support_) {
769+
build_options_ += " -DCL_DTYPE_FLOAT_FORCE";
770+
}
771+
VLOG(1) << "kernel_func_name_:" << kernel_func_name_;
772+
auto& context = ctx_->As<OpenCLContext>();
773+
context.cl_context()->AddKernel(kernel_func_name_,
774+
"image/layout_kernel.cl",
775+
build_options_,
776+
time_stamp_);
777+
}
778+
779+
void Run() override {
780+
auto& param = Param<param_t>();
781+
auto x_dims = param.x->dims();
782+
auto y_dims = param.y->dims();
783+
DDim image_shape;
784+
if (y_dims.size() > 2) {
785+
CLImageConverterFolder folder_converter;
786+
image_shape = folder_converter.InitImageDimInfoWith(y_dims);
787+
} else {
788+
CLImageConverterDefault default_converter;
789+
image_shape = default_converter.InitImageDimInfoWith(y_dims);
790+
}
791+
auto* y_data =
792+
MUTABLE_DATA_GPU(param.y, image_shape[0], image_shape[1], nullptr);
793+
auto* x_data = GET_BUFFER_GPU(param.x);
794+
795+
// out info
796+
std::vector<size_t> new_dims = {1, 1, 1, 1};
797+
for (int tidx = 0; tidx < x_dims.size(); ++tidx) {
798+
new_dims[4 - x_dims.size() + tidx] = x_dims[tidx];
799+
}
800+
801+
#ifdef LITE_WITH_LOG
802+
VLOG(2) << "x_dims:" << x_dims;
803+
VLOG(2) << "y_dims:" << y_dims;
804+
VLOG(2) << "image_shape(w,h):" << image_shape[0] << " " << image_shape[1];
805+
#endif
806+
807+
auto& context = ctx_->As<OpenCLContext>();
808+
CHECK(context.cl_context() != nullptr);
809+
STL::stringstream kernel_key;
810+
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
811+
auto kernel = context.cl_context()->GetKernel(kernel_key.str());
812+
813+
int arg_idx = 0;
814+
cl_int status;
815+
status = kernel.setArg(arg_idx, *x_data);
816+
CL_CHECK_FATAL(status);
817+
status = kernel.setArg(++arg_idx, *y_data);
818+
CL_CHECK_FATAL(status);
819+
if (y_dims.size() <= 2) {
820+
const int length = new_dims[0] * new_dims[1] * new_dims[2] * new_dims[3];
821+
status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[0]));
822+
CL_CHECK_FATAL(status);
823+
status = kernel.setArg(++arg_idx, static_cast<const int>(y_dims[1]));
824+
CL_CHECK_FATAL(status);
825+
status = kernel.setArg(++arg_idx, static_cast<const int>(length));
826+
CL_CHECK_FATAL(status);
827+
} else {
828+
const int out_C = new_dims[1];
829+
const int out_H = new_dims[2];
830+
const int out_W = new_dims[3];
831+
const int Stride2 = out_C * out_H * out_W;
832+
const int Stride1 = out_H * out_W;
833+
const int Stride0 = out_W;
834+
status = kernel.setArg(++arg_idx, static_cast<const int>(out_H));
835+
CL_CHECK_FATAL(status);
836+
status = kernel.setArg(++arg_idx, static_cast<const int>(out_W));
837+
CL_CHECK_FATAL(status);
838+
status = kernel.setArg(++arg_idx, static_cast<const int>(out_C));
839+
CL_CHECK_FATAL(status);
840+
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride0));
841+
CL_CHECK_FATAL(status);
842+
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride1));
843+
CL_CHECK_FATAL(status);
844+
status = kernel.setArg(++arg_idx, static_cast<const int>(Stride2));
845+
CL_CHECK_FATAL(status);
846+
}
847+
848+
if (y_dims.size() <= 2) {
849+
gws_ = cl::NDRange{static_cast<cl::size_type>(image_shape[0]),
850+
static_cast<cl::size_type>(image_shape[1])};
851+
} else {
852+
gws_ = cl::NDRange{static_cast<cl::size_type>((new_dims[1] + 3) / 4),
853+
static_cast<cl::size_type>(new_dims[3]),
854+
static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
855+
}
856+
857+
status = EnqueueNDRangeKernel(
858+
context, kernel, cl::NullRange, gws_, cl::NullRange, nullptr, event_);
859+
CL_CHECK_FATAL(status);
860+
}
861+
862+
std::string doc() const override {
863+
return "Trans Layout from cl::Buffer(NCHW) to "
864+
"cl::Image2D(ImageFolder)";
865+
}
866+
867+
#ifdef LITE_WITH_PROFILE
868+
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
869+
ch->kernel_func_name = kernel_func_name_;
870+
ch->global_work_size = ch->NDRangeToStr(gws_);
871+
ch->cl_event =
872+
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
873+
}
874+
#endif
875+
876+
private:
877+
std::string time_stamp_{GetTimeStamp()};
878+
std::string kernel_func_name_{"buffer_to_image2d_folder"};
879+
std::string build_options_{"-DCL_DTYPE_float "};
880+
cl::NDRange gws_;
881+
};
882+
705883
} // namespace opencl
706884
} // namespace kernels
707885
} // namespace lite
@@ -760,6 +938,24 @@ REGISTER_LITE_KERNEL(
760938
DATALAYOUT(kNCHW))})
761939
.Finalize();
762940

941+
// [NCHW] -> [ImageFolder]
942+
REGISTER_LITE_KERNEL(
943+
layout,
944+
kOpenCL,
945+
kAny,
946+
kImageFolder,
947+
paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageFolder,
948+
NCHW_to_ImageFolder)
949+
.BindInput("Input",
950+
{LiteType::GetTensorTy(TARGET(kOpenCL),
951+
PRECISION(kAny),
952+
DATALAYOUT(kNCHW))})
953+
.BindOutput("Out",
954+
{LiteType::GetTensorTy(TARGET(kOpenCL),
955+
PRECISION(kAny),
956+
DATALAYOUT(kImageFolder))})
957+
.Finalize();
958+
763959
REGISTER_LITE_KERNEL(
764960
layout,
765961
kOpenCL,

0 commit comments

Comments
 (0)