Skip to content

Commit ffb5cf8

Browse files
authored
Merge pull request #128 from marty1885/apichange
Optimize OpenCL indexing time
2 parents 17a4a03 + 223d1aa commit ffb5cf8

File tree

7 files changed

+158
-137
lines changed

7 files changed

+158
-137
lines changed

Etaler/3rdparty/half_precision

Etaler/Backends/OpenCLBackend.cpp

Lines changed: 74 additions & 108 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,30 @@ inline intmax_t selectWorkSize(intmax_t max, intmax_t mul_of, intmax_t size)
3030
return std::min((intmax_t)max, round(size));
3131
}
3232

33+
#define OPENCL_TENSOR_MAX_DIMS 32
34+
typedef struct __attribute__ ((packed)) _OpenCLView
35+
{
36+
int stride[OPENCL_TENSOR_MAX_DIMS];
37+
int shape_stride[OPENCL_TENSOR_MAX_DIMS];
38+
int offset;
39+
int dims;
40+
} OpenCLView;
41+
42+
static void makeOpenCLView(const TensorImpl* x, OpenCLView* v)
43+
{
44+
int dims = int(x->dimentions());
45+
et_assert(dims <= OPENCL_TENSOR_MAX_DIMS, "Too much dimensions for OpenCL backend.");
46+
auto stride = x->stride();
47+
auto shape_stride = shapeToStride(x->shape());
48+
for(int i=0;i<dims;i++) {
49+
v->stride[i] = stride[i];
50+
v->shape_stride[i] = shape_stride[i];
51+
}
52+
v->offset = x->offset();
53+
v->dims = dims;
54+
}
55+
56+
3357

3458
template <typename T>
3559
std::string str(T&& s)
@@ -380,7 +404,7 @@ void OpenCLBackend::sync() const
380404

381405
std::shared_ptr<TensorImpl> OpenCLBackend::copy(const TensorImpl* x)
382406
{
383-
requireProperties(x, this, IsContingous());
407+
requireProperties(x, this, IsPlain());
384408
size_t buf_size = x->size()*dtypeToSize(x->dtype());
385409
cl::Buffer buf = allocBuffer(buf_size);
386410
const cl::Buffer& src = std::static_pointer_cast<const OpenCLBuffer>(x->buffer())->buffer();
@@ -641,117 +665,54 @@ std::optional<cl::Buffer> OpenCLBackend::toSparse(const TensorImpl* x)
641665

642666
static std::string jitStridedView(const TensorImpl* x, size_t id)
643667
{
668+
// If possible, do the easy route
669+
if(x->iscontiguous()) {
670+
std::string func = R"(
671+
int location_func$ID(int index) {
672+
return index + $OFFSET;
673+
}
674+
)";
675+
replaceAll(func, "$ID", std::to_string(id));
676+
replaceAll(func, "$OFFSET", std::to_string(x->offset()));
677+
return func;
678+
}
679+
680+
// Otherwise go the complex one
644681
std::string func = R"(
645-
int location_func$ID(int location)
682+
int location_func$ID(int index)
646683
{
647-
int in_stride[] = $IN_STRIDE;
684+
int shape_stride[] = $SHAPE_STRIDE;
648685
int stride[] = $STRIDE;
649-
int bias = $BIAS;
650-
int ndpos[$DIMS] = {0};
651-
int loc = location;
652-
for(int i=0;i<$IN_DIMS;i++) {
653-
int s = in_stride[i];
654-
ndpos[$DIMS - $IN_DIMS + i] = loc / s;
655-
loc %= s;
656-
}
686+
int offset = $OFFSET;
687+
int curr_idx = index;
657688
int sum = 0;
658-
for(int i=0;i<$DIMS;i++)
659-
sum += ndpos[i]*stride[i];
660-
sum += bias;
661-
return sum;
689+
for(int i=0;i<$DIMS;i++) {
690+
int s = shape_stride[i];
691+
int ndpos = curr_idx / s;
692+
sum += ndpos * stride[i];
693+
curr_idx %= s;
694+
}
695+
return sum + offset;
662696
}
663697
)";
698+
const auto shape_stride = shapeToStride(x->shape());
664699
replaceAll(func, "$ID", std::to_string(id));
665-
auto in_strides = shapeToStride(x->shape());
666-
replaceAll(func, "$IN_STRIDE", to_string(in_strides));
667-
replaceAll(func, "$IN_DIMS", std::to_string(in_strides.size()));
668-
replaceAll(func, "$DIMS", std::to_string(std::max(x->dimentions(), x->stride().size())));
669-
700+
replaceAll(func, "$SHAPE_STRIDE", to_string(shape_stride));
701+
replaceAll(func, "$DIMS", std::to_string(x->dimentions()));
670702
replaceAll(func, "$STRIDE", to_string(x->stride()));
671-
replaceAll(func, "$BIAS", std::to_string(x->offset()));
703+
replaceAll(func, "$OFFSET", std::to_string(x->offset()));
672704
return func;
673705
}
674706

675-
static std::vector<std::string> jitCopyFromView(const TensorImpl* x)
676-
{
677-
std::vector<std::string> convertion;
678-
convertion.push_back(jitStridedView(x, 0));
679-
680-
std::string func = R"(
681-
#define Type $TYPE
682-
kernel void copy(global Type* restrict x, global Type* restrict y)
683-
{
684-
int global_id = get_global_id(0);
685-
int global_size = get_global_size(0);
686-
for(int i=global_id;i<$SIZE;i+=global_size) {
687-
int position = location_func0(i);
688-
y[i] = x[position];
689-
}
690-
}
691-
)";
692-
693-
auto s = shapeToStride(x->shape());
694-
695-
std::string type = to_ctype_string(x->dtype());
696-
replaceAll(func, "$TYPE", type);
697-
replaceAll(func, "$SIZE", std::to_string(x->size()));
698-
convertion.push_back(func);
699-
return convertion;
700-
}
701-
702-
static std::vector<std::string> jitCopyToView(const TensorImpl* x)
703-
{
704-
std::vector<std::string> convertion;
705-
convertion.push_back(jitStridedView(x, 0));
706-
707-
std::string func = R"(
708-
#define Type $TYPE
709-
kernel void copy(global Type* restrict x, global Type* restrict y)
710-
{
711-
int global_id = get_global_id(0);
712-
int global_size = get_global_size(0);
713-
for(int i=global_id;i<$SIZE;i+=global_size) {
714-
int position = location_func0(i);
715-
y[position] = x[i];
716-
}
717-
}
718-
)";
719-
720-
auto s = shapeToStride(x->shape());
721-
722-
std::string type = to_ctype_string(x->dtype());
723-
replaceAll(func, "$TYPE", type);
724-
replaceAll(func, "$SIZE", std::to_string(x->size()));
725-
convertion.push_back(func);
726-
return convertion;
727-
}
728-
729707
std::shared_ptr<TensorImpl> OpenCLBackend::realize(const TensorImpl* x)
730708
{
731709
requireProperties(x, this);
732710
if(x->isplain() == true)
733711
return copy(x);
734-
735-
std::vector<std::string> conversion = jitCopyFromView(x);
736-
737-
kernel_manager_.compileKernel(conversion, "__copy", {"copy"});
738-
cl::Kernel k = kernel_manager_.kernel("__copy", "copy");
739-
740-
cl::Buffer buf = allocBuffer(x->size()*dtypeToSize(x->dtype()));
741-
k.setArg(0, std::static_pointer_cast<const OpenCLBuffer>(x->buffer())->buffer());
742-
k.setArg(1, buf);
743-
744-
size_t local_size = 128;
745-
cl_int err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(selectWorkSize(4096, local_size, x->size())), cl::NDRange(local_size));
746-
if(err != CL_SUCCESS)
747-
throw EtError("OpenCL kernel execution failed. Code " + str(err));
748-
749-
//for(auto s : conversion)
750-
// std::cout << s << std::endl;
751-
752-
kernel_manager_.remove("__copy");//We are unlikely to use this kernel again?
753-
754-
return createTensor(x->shape(), x->dtype(), buf);
712+
713+
auto res = createTensor(x->shape(), x->dtype());
714+
assign(res.get(), x);
715+
return res;
755716
}
756717

757718

@@ -765,25 +726,30 @@ void OpenCLBackend::assign(TensorImpl* dest, const TensorImpl* src)
765726
+ to_string(dest->shape()) + " and " + to_string(src->shape()));
766727
}
767728

768-
auto source = realize(src);
769-
770-
if(dest->dtype() != source->dtype())
771-
source = cast(source.get(), dest->dtype());
729+
auto param_hash = hashify(dest->dtype(), src->dtype());
730+
auto program_name = "copy"+param_hash;
731+
if(kernel_manager_.exists(program_name) == false) {
732+
auto args = "-DINPUT_TYPE="+to_ctype_string(src->dtype())+" -DOUTPUT_TYPE="+to_ctype_string(dest->dtype());
733+
kernel_manager_.compileFromFile("copy.cl", program_name, {"copy"}, false, args);
734+
}
735+
cl::Kernel k = kernel_manager_.kernel(program_name, "copy");
772736

773-
std::vector<std::string> conversion = jitCopyToView(dest);
737+
OpenCLView input_view;
738+
OpenCLView output_view;
739+
makeOpenCLView(src, &input_view);
740+
makeOpenCLView(dest, &output_view);
774741

775-
kernel_manager_.compileKernel(conversion, "__copy", {"copy"});
776-
cl::Kernel k = kernel_manager_.kernel("__copy", "copy");
742+
k.setArg(0, std::static_pointer_cast<OpenCLBuffer>(dest->buffer())->buffer());
743+
k.setArg(1, std::static_pointer_cast<const OpenCLBuffer>(src->buffer())->buffer());
744+
k.setArg(2, output_view);
745+
k.setArg(3, input_view);
746+
k.setArg(4, int(src->size()));
777747

778-
k.setArg(0, std::static_pointer_cast<const OpenCLBuffer>(source->buffer())->buffer());
779-
k.setArg(1, std::static_pointer_cast<const OpenCLBuffer>(dest->buffer())->buffer());
780748

781749
size_t local_size = 128;
782-
cl_int err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(selectWorkSize(4096, local_size, source->size())), cl::NDRange(local_size));
750+
cl_int err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(selectWorkSize(4096, local_size, src->size())), cl::NDRange(local_size));
783751
if(err != CL_SUCCESS)
784752
throw EtError("OpenCL kernel execution failed. Code " + str(err));
785-
786-
kernel_manager_.remove("__copy");//We are unlikely to use this kernel again?
787753
}
788754

789755
std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk_size, DType dtype)

0 commit comments

Comments (0)