@@ -30,6 +30,30 @@ inline intmax_t selectWorkSize(intmax_t max, intmax_t mul_of, intmax_t size)
3030 return std::min ((intmax_t )max, round (size));
3131}
3232
33+ #define OPENCL_TENSOR_MAX_DIMS 32
34+ typedef struct __attribute__ ((packed)) _OpenCLView
35+ {
36+ int stride[OPENCL_TENSOR_MAX_DIMS];
37+ int shape_stride[OPENCL_TENSOR_MAX_DIMS];
38+ int offset;
39+ int dims;
40+ } OpenCLView;
41+
42+ static void makeOpenCLView (const TensorImpl* x, OpenCLView* v)
43+ {
44+ int dims = int (x->dimentions ());
45+ et_assert (dims <= OPENCL_TENSOR_MAX_DIMS, " Too much dimensions for OpenCL backend." );
46+ auto stride = x->stride ();
47+ auto shape_stride = shapeToStride (x->shape ());
48+ for (int i=0 ;i<dims;i++) {
49+ v->stride [i] = stride[i];
50+ v->shape_stride [i] = shape_stride[i];
51+ }
52+ v->offset = x->offset ();
53+ v->dims = dims;
54+ }
55+
56+
3357
3458template <typename T>
3559std::string str (T&& s)
@@ -380,7 +404,7 @@ void OpenCLBackend::sync() const
380404
381405std::shared_ptr<TensorImpl> OpenCLBackend::copy (const TensorImpl* x)
382406{
383- requireProperties (x, this , IsContingous ());
407+ requireProperties (x, this , IsPlain ());
384408 size_t buf_size = x->size ()*dtypeToSize (x->dtype ());
385409 cl::Buffer buf = allocBuffer (buf_size);
386410 const cl::Buffer& src = std::static_pointer_cast<const OpenCLBuffer>(x->buffer ())->buffer ();
@@ -641,117 +665,54 @@ std::optional<cl::Buffer> OpenCLBackend::toSparse(const TensorImpl* x)
641665
642666static std::string jitStridedView (const TensorImpl* x, size_t id)
643667{
668+ // If possible, do the easy route
669+ if (x->iscontiguous ()) {
670+ std::string func = R"(
671+ int location_func$ID(int index) {
672+ return index + $OFFSET;
673+ }
674+ )" ;
675+ replaceAll (func, " $ID" , std::to_string (id));
676+ replaceAll (func, " $OFFSET" , std::to_string (x->offset ()));
677+ return func;
678+ }
679+
680+ // Otherwise go the complex one
644681 std::string func = R"(
645- int location_func$ID(int location )
682+ int location_func$ID(int index )
646683{
647- int in_stride [] = $IN_STRIDE ;
684+ int shape_stride [] = $SHAPE_STRIDE ;
648685 int stride[] = $STRIDE;
649- int bias = $BIAS;
650- int ndpos[$DIMS] = {0};
651- int loc = location;
652- for(int i=0;i<$IN_DIMS;i++) {
653- int s = in_stride[i];
654- ndpos[$DIMS - $IN_DIMS + i] = loc / s;
655- loc %= s;
656- }
686+ int offset = $OFFSET;
687+ int curr_idx = index;
657688 int sum = 0;
658- for(int i=0;i<$DIMS;i++)
659- sum += ndpos[i]*stride[i];
660- sum += bias;
661- return sum;
689+ for(int i=0;i<$DIMS;i++) {
690+ int s = shape_stride[i];
691+ int ndpos = curr_idx / s;
692+ sum += ndpos * stride[i];
693+ curr_idx %= s;
694+ }
695+ return sum + offset;
662696}
663697)" ;
698+ const auto shape_stride = shapeToStride (x->shape ());
664699 replaceAll (func, " $ID" , std::to_string (id));
665- auto in_strides = shapeToStride (x->shape ());
666- replaceAll (func, " $IN_STRIDE" , to_string (in_strides));
667- replaceAll (func, " $IN_DIMS" , std::to_string (in_strides.size ()));
668- replaceAll (func, " $DIMS" , std::to_string (std::max (x->dimentions (), x->stride ().size ())));
669-
700+ replaceAll (func, " $SHAPE_STRIDE" , to_string (shape_stride));
701+ replaceAll (func, " $DIMS" , std::to_string (x->dimentions ()));
670702 replaceAll (func, " $STRIDE" , to_string (x->stride ()));
671- replaceAll (func, " $BIAS " , std::to_string (x->offset ()));
703+ replaceAll (func, " $OFFSET " , std::to_string (x->offset ()));
672704 return func;
673705}
674706
675- static std::vector<std::string> jitCopyFromView (const TensorImpl* x)
676- {
677- std::vector<std::string> convertion;
678- convertion.push_back (jitStridedView (x, 0 ));
679-
680- std::string func = R"(
681- #define Type $TYPE
682- kernel void copy(global Type* restrict x, global Type* restrict y)
683- {
684- int global_id = get_global_id(0);
685- int global_size = get_global_size(0);
686- for(int i=global_id;i<$SIZE;i+=global_size) {
687- int position = location_func0(i);
688- y[i] = x[position];
689- }
690- }
691- )" ;
692-
693- auto s = shapeToStride (x->shape ());
694-
695- std::string type = to_ctype_string (x->dtype ());
696- replaceAll (func, " $TYPE" , type);
697- replaceAll (func, " $SIZE" , std::to_string (x->size ()));
698- convertion.push_back (func);
699- return convertion;
700- }
701-
702- static std::vector<std::string> jitCopyToView (const TensorImpl* x)
703- {
704- std::vector<std::string> convertion;
705- convertion.push_back (jitStridedView (x, 0 ));
706-
707- std::string func = R"(
708- #define Type $TYPE
709- kernel void copy(global Type* restrict x, global Type* restrict y)
710- {
711- int global_id = get_global_id(0);
712- int global_size = get_global_size(0);
713- for(int i=global_id;i<$SIZE;i+=global_size) {
714- int position = location_func0(i);
715- y[position] = x[i];
716- }
717- }
718- )" ;
719-
720- auto s = shapeToStride (x->shape ());
721-
722- std::string type = to_ctype_string (x->dtype ());
723- replaceAll (func, " $TYPE" , type);
724- replaceAll (func, " $SIZE" , std::to_string (x->size ()));
725- convertion.push_back (func);
726- return convertion;
727- }
728-
729707std::shared_ptr<TensorImpl> OpenCLBackend::realize (const TensorImpl* x)
730708{
731709 requireProperties (x, this );
732710 if (x->isplain () == true )
733711 return copy (x);
734-
735- std::vector<std::string> conversion = jitCopyFromView (x);
736-
737- kernel_manager_.compileKernel (conversion, " __copy" , {" copy" });
738- cl::Kernel k = kernel_manager_.kernel (" __copy" , " copy" );
739-
740- cl::Buffer buf = allocBuffer (x->size ()*dtypeToSize (x->dtype ()));
741- k.setArg (0 , std::static_pointer_cast<const OpenCLBuffer>(x->buffer ())->buffer ());
742- k.setArg (1 , buf);
743-
744- size_t local_size = 128 ;
745- cl_int err = queue_.enqueueNDRangeKernel (k, cl::NullRange, cl::NDRange (selectWorkSize (4096 , local_size, x->size ())), cl::NDRange (local_size));
746- if (err != CL_SUCCESS)
747- throw EtError (" OpenCL kernel execution failed. Code " + str (err));
748-
749- // for(auto s : conversion)
750- // std::cout << s << std::endl;
751-
752- kernel_manager_.remove (" __copy" );// We are unlikely to use this kernel again?
753-
754- return createTensor (x->shape (), x->dtype (), buf);
712+
713+ auto res = createTensor (x->shape (), x->dtype ());
714+ assign (res.get (), x);
715+ return res;
755716}
756717
757718
@@ -765,25 +726,30 @@ void OpenCLBackend::assign(TensorImpl* dest, const TensorImpl* src)
765726 + to_string (dest->shape ()) + " and " + to_string (src->shape ()));
766727 }
767728
768- auto source = realize (src);
769-
770- if (dest->dtype () != source->dtype ())
771- source = cast (source.get (), dest->dtype ());
729+ auto param_hash = hashify (dest->dtype (), src->dtype ());
730+ auto program_name = " copy" +param_hash;
731+ if (kernel_manager_.exists (program_name) == false ) {
732+ auto args = " -DINPUT_TYPE=" +to_ctype_string (src->dtype ())+" -DOUTPUT_TYPE=" +to_ctype_string (dest->dtype ());
733+ kernel_manager_.compileFromFile (" copy.cl" , program_name, {" copy" }, false , args);
734+ }
735+ cl::Kernel k = kernel_manager_.kernel (program_name, " copy" );
772736
773- std::vector<std::string> conversion = jitCopyToView (dest);
737+ OpenCLView input_view;
738+ OpenCLView output_view;
739+ makeOpenCLView (src, &input_view);
740+ makeOpenCLView (dest, &output_view);
774741
775- kernel_manager_.compileKernel (conversion, " __copy" , {" copy" });
776- cl::Kernel k = kernel_manager_.kernel (" __copy" , " copy" );
742+ k.setArg (0 , std::static_pointer_cast<OpenCLBuffer>(dest->buffer ())->buffer ());
743+ k.setArg (1 , std::static_pointer_cast<const OpenCLBuffer>(src->buffer ())->buffer ());
744+ k.setArg (2 , output_view);
745+ k.setArg (3 , input_view);
746+ k.setArg (4 , int (src->size ()));
777747
778- k.setArg (0 , std::static_pointer_cast<const OpenCLBuffer>(source->buffer ())->buffer ());
779- k.setArg (1 , std::static_pointer_cast<const OpenCLBuffer>(dest->buffer ())->buffer ());
780748
781749 size_t local_size = 128 ;
782- cl_int err = queue_.enqueueNDRangeKernel (k, cl::NullRange, cl::NDRange (selectWorkSize (4096 , local_size, source ->size ())), cl::NDRange (local_size));
750+ cl_int err = queue_.enqueueNDRangeKernel (k, cl::NullRange, cl::NDRange (selectWorkSize (4096 , local_size, src ->size ())), cl::NDRange (local_size));
783751 if (err != CL_SUCCESS)
784752 throw EtError (" OpenCL kernel execution failed. Code " + str (err));
785-
786- kernel_manager_.remove (" __copy" );// We are unlikely to use this kernel again?
787753}
788754
789755std::shared_ptr<TensorImpl> OpenCLBackend::sum (const TensorImpl* x, size_t chunk_size, DType dtype)
0 commit comments