@@ -617,6 +617,10 @@ class LayoutComputeImageFolderToBufferChw
617
617
618
618
void PrepareForRun () override {
619
619
auto & param = Param<param_t >();
620
+ auto x_dims = param.x ->dims ();
621
+ if (x_dims.size () > 2 ) {
622
+ kernel_func_name_ = " image2d_to_buffer" ;
623
+ }
620
624
if (!fp16_support_) {
621
625
build_options_ += " -DCL_DTYPE_FLOAT_FORCE" ;
622
626
}
@@ -641,13 +645,25 @@ class LayoutComputeImageFolderToBufferChw
641
645
auto x_dims = param.x ->dims ();
642
646
auto y_dims = param.y ->dims ();
643
647
644
- CLImageConverterFolder folder_converter;
645
- auto x_image_shape = folder_converter.InitImageDimInfoWith (x_dims);
648
+ DDim x_image_shape;
649
+ if (x_dims.size () > 2 ) {
650
+ CLImageConverterFolder folder_converter;
651
+ x_image_shape = folder_converter.InitImageDimInfoWith (x_dims);
652
+ } else {
653
+ CLImageConverterDefault default_converter;
654
+ x_image_shape = default_converter.InitImageDimInfoWith (x_dims);
655
+ }
646
656
647
657
const cl::Buffer* y_data =
648
658
param.y ->mutable_data <float , cl::Buffer>(TARGET (kOpenCL ));
649
659
auto * x_data = GET_DATA_GPU (param.x );
650
660
661
+ // out info
662
+ std::vector<size_t > new_dims = {1 , 1 , 1 , 1 };
663
+ for (int tidx = 0 ; tidx < x_dims.size (); ++tidx) {
664
+ new_dims[4 - x_dims.size () + tidx] = x_dims[tidx];
665
+ }
666
+
651
667
#ifdef LITE_WITH_LOG
652
668
VLOG (2 ) << " x_dims:" << x_dims;
653
669
VLOG (2 ) << " y_dims:" << y_dims;
@@ -661,20 +677,53 @@ class LayoutComputeImageFolderToBufferChw
661
677
kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
662
678
auto kernel = context.cl_context ()->GetKernel (kernel_key.str ());
663
679
680
+ cl::NDRange global_work_size;
664
681
int arg_idx = 0 ;
665
682
cl_int status;
666
- status = kernel.setArg (arg_idx, *x_data);
667
- CL_CHECK_FATAL (status);
668
- status = kernel.setArg (++arg_idx, *y_data);
669
- CL_CHECK_FATAL (status);
670
- status = kernel.setArg (++arg_idx, static_cast <const int >(y_dims[0 ]));
671
- CL_CHECK_FATAL (status);
672
- status = kernel.setArg (++arg_idx, static_cast <const int >(y_dims[1 ]));
673
- CL_CHECK_FATAL (status);
683
+ if (x_dims.size () <= 2 ) {
684
+ status = kernel.setArg (arg_idx, *x_data);
685
+ CL_CHECK_FATAL (status);
686
+ status = kernel.setArg (++arg_idx, *y_data);
687
+ CL_CHECK_FATAL (status);
688
+ status = kernel.setArg (++arg_idx, static_cast <const int >(y_dims[0 ]));
689
+ CL_CHECK_FATAL (status);
690
+ status = kernel.setArg (++arg_idx, static_cast <const int >(y_dims[1 ]));
691
+ CL_CHECK_FATAL (status);
692
+
693
+ global_work_size =
694
+ cl::NDRange{static_cast <cl::size_type>(x_image_shape[0 ]),
695
+ static_cast <cl::size_type>(x_image_shape[1 ])};
696
+ } else {
697
+ size_t C = new_dims[1 ];
698
+ size_t in_height = new_dims[2 ];
699
+ size_t in_width = new_dims[3 ];
700
+ int size_ch = in_height * in_width;
701
+ int size_block = size_ch * 4 ;
702
+ int size_batch = size_ch * C;
703
+
704
+ status = kernel.setArg (arg_idx, *x_data);
705
+ CL_CHECK_FATAL (status);
706
+ status = kernel.setArg (++arg_idx, static_cast <const int >(in_width));
707
+ CL_CHECK_FATAL (status);
708
+ status = kernel.setArg (++arg_idx, static_cast <const int >(in_height));
709
+ CL_CHECK_FATAL (status);
710
+ status = kernel.setArg (++arg_idx, *y_data);
711
+ CL_CHECK_FATAL (status);
712
+ status = kernel.setArg (++arg_idx, static_cast <const int >(size_ch));
713
+ CL_CHECK_FATAL (status);
714
+ status = kernel.setArg (++arg_idx, static_cast <const int >(size_block));
715
+ CL_CHECK_FATAL (status);
716
+ status = kernel.setArg (++arg_idx, static_cast <const int >(size_batch));
717
+ CL_CHECK_FATAL (status);
718
+ status = kernel.setArg (++arg_idx, static_cast <const int >(C));
719
+ CL_CHECK_FATAL (status);
720
+
721
+ global_work_size =
722
+ cl::NDRange{static_cast <cl::size_type>((new_dims[1 ] + 3 ) / 4 ),
723
+ static_cast <cl::size_type>(new_dims[3 ]),
724
+ static_cast <cl::size_type>(new_dims[0 ] * new_dims[2 ])};
725
+ }
674
726
675
- auto global_work_size =
676
- cl::NDRange{static_cast <cl::size_type>(x_image_shape[0 ]),
677
- static_cast <cl::size_type>(x_image_shape[1 ])};
678
727
#ifdef LITE_WITH_LOG
679
728
for (auto i = 0 ; i < global_work_size.dimensions (); i++) {
680
729
VLOG (2 ) << " global_work_size[" << i << " ]: " << global_work_size[i];
@@ -702,6 +751,135 @@ class LayoutComputeImageFolderToBufferChw
702
751
std::string build_options_{" -DCL_DTYPE_float " };
703
752
};
704
753
754
+ // [NCHW] -> [ImageFolder]
755
+ class LayoutComputeBufferChwToImageFolder
756
+ : public KernelLite<TARGET(kOpenCL ),
757
+ PRECISION (kAny ),
758
+ DATALAYOUT(kImageFolder )> {
759
+ public:
760
+ using param_t = operators::LayoutParam;
761
+
762
+ void PrepareForRun () override {
763
+ auto & param = Param<param_t >();
764
+ auto x_dims = param.x ->dims ();
765
+ if (x_dims.size () > 2 ) {
766
+ kernel_func_name_ = " buffer_to_image2d" ;
767
+ }
768
+ if (!fp16_support_) {
769
+ build_options_ += " -DCL_DTYPE_FLOAT_FORCE" ;
770
+ }
771
+ VLOG (1 ) << " kernel_func_name_:" << kernel_func_name_;
772
+ auto & context = ctx_->As <OpenCLContext>();
773
+ context.cl_context ()->AddKernel (kernel_func_name_,
774
+ " image/layout_kernel.cl" ,
775
+ build_options_,
776
+ time_stamp_);
777
+ }
778
+
779
+ void Run () override {
780
+ auto & param = Param<param_t >();
781
+ auto x_dims = param.x ->dims ();
782
+ auto y_dims = param.y ->dims ();
783
+ DDim image_shape;
784
+ if (y_dims.size () > 2 ) {
785
+ CLImageConverterFolder folder_converter;
786
+ image_shape = folder_converter.InitImageDimInfoWith (y_dims);
787
+ } else {
788
+ CLImageConverterDefault default_converter;
789
+ image_shape = default_converter.InitImageDimInfoWith (y_dims);
790
+ }
791
+ auto * y_data =
792
+ MUTABLE_DATA_GPU (param.y , image_shape[0 ], image_shape[1 ], nullptr );
793
+ auto * x_data = GET_BUFFER_GPU (param.x );
794
+
795
+ // out info
796
+ std::vector<size_t > new_dims = {1 , 1 , 1 , 1 };
797
+ for (int tidx = 0 ; tidx < x_dims.size (); ++tidx) {
798
+ new_dims[4 - x_dims.size () + tidx] = x_dims[tidx];
799
+ }
800
+
801
+ #ifdef LITE_WITH_LOG
802
+ VLOG (2 ) << " x_dims:" << x_dims;
803
+ VLOG (2 ) << " y_dims:" << y_dims;
804
+ VLOG (2 ) << " image_shape(w,h):" << image_shape[0 ] << " " << image_shape[1 ];
805
+ #endif
806
+
807
+ auto & context = ctx_->As <OpenCLContext>();
808
+ CHECK (context.cl_context () != nullptr );
809
+ STL::stringstream kernel_key;
810
+ kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
811
+ auto kernel = context.cl_context ()->GetKernel (kernel_key.str ());
812
+
813
+ int arg_idx = 0 ;
814
+ cl_int status;
815
+ status = kernel.setArg (arg_idx, *x_data);
816
+ CL_CHECK_FATAL (status);
817
+ status = kernel.setArg (++arg_idx, *y_data);
818
+ CL_CHECK_FATAL (status);
819
+ if (y_dims.size () <= 2 ) {
820
+ const int length = new_dims[0 ] * new_dims[1 ] * new_dims[2 ] * new_dims[3 ];
821
+ status = kernel.setArg (++arg_idx, static_cast <const int >(y_dims[0 ]));
822
+ CL_CHECK_FATAL (status);
823
+ status = kernel.setArg (++arg_idx, static_cast <const int >(y_dims[1 ]));
824
+ CL_CHECK_FATAL (status);
825
+ status = kernel.setArg (++arg_idx, static_cast <const int >(length));
826
+ CL_CHECK_FATAL (status);
827
+ } else {
828
+ const int out_C = new_dims[1 ];
829
+ const int out_H = new_dims[2 ];
830
+ const int out_W = new_dims[3 ];
831
+ const int Stride2 = out_C * out_H * out_W;
832
+ const int Stride1 = out_H * out_W;
833
+ const int Stride0 = out_W;
834
+ status = kernel.setArg (++arg_idx, static_cast <const int >(out_H));
835
+ CL_CHECK_FATAL (status);
836
+ status = kernel.setArg (++arg_idx, static_cast <const int >(out_W));
837
+ CL_CHECK_FATAL (status);
838
+ status = kernel.setArg (++arg_idx, static_cast <const int >(out_C));
839
+ CL_CHECK_FATAL (status);
840
+ status = kernel.setArg (++arg_idx, static_cast <const int >(Stride0));
841
+ CL_CHECK_FATAL (status);
842
+ status = kernel.setArg (++arg_idx, static_cast <const int >(Stride1));
843
+ CL_CHECK_FATAL (status);
844
+ status = kernel.setArg (++arg_idx, static_cast <const int >(Stride2));
845
+ CL_CHECK_FATAL (status);
846
+ }
847
+
848
+ if (y_dims.size () <= 2 ) {
849
+ gws_ = cl::NDRange{static_cast <cl::size_type>(image_shape[0 ]),
850
+ static_cast <cl::size_type>(image_shape[1 ])};
851
+ } else {
852
+ gws_ = cl::NDRange{static_cast <cl::size_type>((new_dims[1 ] + 3 ) / 4 ),
853
+ static_cast <cl::size_type>(new_dims[3 ]),
854
+ static_cast <cl::size_type>(new_dims[0 ] * new_dims[2 ])};
855
+ }
856
+
857
+ status = EnqueueNDRangeKernel (
858
+ context, kernel, cl::NullRange, gws_, cl::NullRange, nullptr , event_);
859
+ CL_CHECK_FATAL (status);
860
+ }
861
+
862
+ std::string doc () const override {
863
+ return " Trans Layout from cl::Buffer(NCHW) to "
864
+ " cl::Image2D(ImageFolder)" ;
865
+ }
866
+
867
+ #ifdef LITE_WITH_PROFILE
868
+ void SetProfileRuntimeKernelInfo (paddle::lite::profile::OpCharacter* ch) {
869
+ ch->kernel_func_name = kernel_func_name_;
870
+ ch->global_work_size = ch->NDRangeToStr (gws_);
871
+ ch->cl_event =
872
+ event_; // `event_` defined in `kernel.h`, valid after kernel::Run
873
+ }
874
+ #endif
875
+
876
+ private:
877
+ std::string time_stamp_{GetTimeStamp ()};
878
+ std::string kernel_func_name_{" buffer_to_image2d_folder" };
879
+ std::string build_options_{" -DCL_DTYPE_float " };
880
+ cl::NDRange gws_;
881
+ };
882
+
705
883
} // namespace opencl
706
884
} // namespace kernels
707
885
} // namespace lite
@@ -760,6 +938,24 @@ REGISTER_LITE_KERNEL(
760
938
DATALAYOUT (kNCHW ))})
761
939
.Finalize();
762
940
941
+ // [NCHW] -> [ImageFolder]
942
+ REGISTER_LITE_KERNEL (
943
+ layout,
944
+ kOpenCL ,
945
+ kAny ,
946
+ kImageFolder ,
947
+ paddle::lite::kernels::opencl::LayoutComputeBufferChwToImageFolder,
948
+ NCHW_to_ImageFolder)
949
+ .BindInput(" Input" ,
950
+ {LiteType::GetTensorTy (TARGET (kOpenCL ),
951
+ PRECISION (kAny ),
952
+ DATALAYOUT (kNCHW ))})
953
+ .BindOutput(" Out" ,
954
+ {LiteType::GetTensorTy (TARGET (kOpenCL ),
955
+ PRECISION (kAny ),
956
+ DATALAYOUT (kImageFolder ))})
957
+ .Finalize();
958
+
763
959
REGISTER_LITE_KERNEL (
764
960
layout,
765
961
kOpenCL ,
0 commit comments