Skip to content

Commit 9f18299

Browse files
authored
Merge pull request #120 from marty1885/master
sum() performance optimizations
2 parents 55fbab5 + 8583791 commit 9f18299

File tree

7 files changed

+127
-32
lines changed

7 files changed

+127
-32
lines changed

Etaler/Algorithms/Boost.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,9 @@ static Tensor boost(const Tensor& activity, const Tensor& average_activity, floa
1717
return cast(boostFactor(average_activity, target_activity, boost_factor)*activity, DType::Int32);
1818
}
1919

20+
// Boosts cell activity using a logarithmic boosting factor.
// NOTE(review): the formula takes log(average_activity - target_activity),
// which is negative-valued (NaN under log) whenever a cell's average activity
// falls below the target — confirm this is the intended behavior upstream.
static Tensor logarithmicBoost(const Tensor& activity, const Tensor& average_activity, float target_activity, float target_density, int active_threshold)
{
	// Count of cells currently firing above the threshold.
	int active_count = sum(activity > active_threshold).item<int>();
	// Per-cell scaling factor; division order matches the original expression.
	auto scale = log(average_activity - target_activity) / log(target_density - active_count);
	return activity * scale;
}
24+
2025
}

Etaler/Backends/CPUBackend.cpp

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <tbb/parallel_for.h>
1010
#include <tbb/blocked_range.h>
1111
#include <tbb/parallel_sort.h>
12+
#include <tbb/parallel_reduce.h>
1213

1314
using namespace et;
1415

@@ -57,7 +58,9 @@ void* CPUBuffer::data() const
5758
return std::visit([](const auto& v){return (void*)v;}, storage_);
5859
}
5960

60-
template <typename TypeList = type_list_t<int32_t, float, bool, half>, typename Func = void>
61+
using DefaultTypeList = type_list_t<int32_t, float, bool, half>;
62+
63+
template <typename TypeList = DefaultTypeList, typename Func = void>
6164
inline void dispatch(DType dtype, Func f)
6265
{
6366
static_assert(std::is_same_v<Func, void> == false); //void is just a dummy value
@@ -73,6 +76,18 @@ inline void dispatch(DType dtype, Func f)
7376
throw EtError("Cannot dispatch such dtype: " + to_ctype_string(dtype));
7477
}
7578

79+
template <typename TL1 = DefaultTypeList, typename TL2 = DefaultTypeList, typename Func = void>
80+
inline void dispatch2d(DType t1, DType t2, Func f)
81+
{
82+
dispatch<TL1>(t1, [&](auto v1){
83+
using T1 = decltype(v1);
84+
dispatch<TL2>(t2, [&](auto v2){
85+
using T2 = decltype(v2);
86+
f(T1(), T2());
87+
});
88+
});
89+
}
90+
7691
namespace et::detail
7792
{
7893
template <typename PermType>
@@ -624,21 +639,38 @@ std::shared_ptr<TensorImpl> CPUBackend::sum(const TensorImpl* x, size_t chunk_si
624639
}();
625640
}
626641

627-
auto res = createTensor({(intmax_t)(x->size()/chunk_size)}, result_dtype);
642+
size_t result_size = x->size()/chunk_size;
643+
auto res = createTensor({intmax_t(result_size)}, result_dtype);
628644

629-
dispatch(x->dtype(), [&](auto v){
630-
using T = decltype(v);
631-
auto in = (const T*)x->data();
632-
dispatch(result_dtype, [&](auto v) {
633-
using ResType = decltype(v);
645+
// Optimized case for summing everything
646+
if(result_size == 1) {
647+
dispatch2d(x->dtype(), result_dtype, [&](auto v1, auto v2) {
648+
using T = decltype(v1);
649+
auto in = (const T*)x->data();
650+
using ResType = decltype(v2);
651+
auto ptr = (ResType*) res->data();
652+
*ptr = tbb::parallel_reduce(tbb::blocked_range(in, in+x->size()), ResType(0)
653+
, [](const auto& r, ResType init){
654+
return std::accumulate(r.begin(), r.end(), init);
655+
},
656+
[](auto x, auto y) {
657+
return x + y;
658+
});
659+
});
660+
}
661+
else {
662+
dispatch2d(x->dtype(), result_dtype, [&](auto v1, auto v2) {
663+
using T = decltype(v1);
664+
auto in = (const T*)x->data();
665+
using ResType = decltype(v2);
634666
auto ptr = (ResType*) res->data();
635667
tbb::parallel_for(size_t(0), size_t(x->size()/chunk_size), [&](size_t i) {
636668
size_t offset = i*chunk_size;
637669
ResType s = std::accumulate(in+offset, in+offset+chunk_size, ResType(0));
638670
ptr[i] = s;
639671
});
640672
});
641-
});
673+
}
642674

643675
return res;
644676
}

Etaler/Backends/OpenCLBackend.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -813,26 +813,38 @@ std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk
813813
return DType::Int32;
814814
}(x->dtype(), result_dtype);
815815

816-
auto param_hash = hashify(x->dtype(), result_dtype, intermid_type);
816+
intmax_t result_size = intmax_t(x->size()/chunk_size);
817+
bool use_local_kernel = result_size <= numComputeUnits(); // Whether to use the kernel optimized for producing a small number of results
818+
auto param_hash = hashify(x->dtype(), result_dtype, intermid_type, use_local_kernel);
817819
std::string program_name = "sum" + param_hash;
818820
if(kernel_manager_.exists(program_name) == false) {
819821
std::string args = "-DInType=" + to_ctype_string(x->dtype()) + " -DOutType=" + to_ctype_string(result_dtype) + " -DIntermidType=" + to_ctype_string(intermid_type)
820822
+ (intermid_type==DType::Half? " -DIntermidIsHalf" : "");
821-
kernel_manager_.compileFromFile("sum.cl", program_name, {"sum"}, false, args);
823+
824+
if(use_local_kernel)
825+
kernel_manager_.compileFromFile("sum_local.cl", program_name, {"sum"}, false, args);
826+
else
827+
kernel_manager_.compileFromFile("sum.cl", program_name, {"sum"}, false, args);
822828
}
823829

824830
cl::Kernel k = kernel_manager_.kernel(program_name, "sum");
825-
826-
auto res = createTensor({intmax_t(x->size()/chunk_size)}, result_dtype);
831+
auto res = createTensor({result_size}, result_dtype);
827832

828833
k.setArg(0, std::static_pointer_cast<const OpenCLBuffer>(x->buffer())->buffer());
829834
k.setArg(1, std::static_pointer_cast<OpenCLBuffer>(res->buffer())->buffer());
830835
k.setArg(2, int(x->size()));
831836
k.setArg(3, int(chunk_size));
832837

833-
size_t local_size = 128;
834-
835-
cl_int err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(selectWorkSize(4096, local_size, x->size()/chunk_size)), cl::NDRange(local_size));
838+
cl_int err = CL_SUCCESS;
839+
if(use_local_kernel) {
840+
size_t local_size = 64; // the same value set in sum_local.cl
841+
err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(local_size*result_size), cl::NDRange(local_size));
842+
}
843+
else {
844+
size_t local_size = 128;
845+
err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(selectWorkSize(4096, local_size, x->size()/chunk_size)), cl::NDRange(local_size));
846+
}
847+
836848
if(err != CL_SUCCESS)
837849
throw EtError("OpenCL kernel execution failed. Code " + str(err));
838850
return res;
@@ -848,7 +860,7 @@ void OpenCLBackend::decaySynapses(TensorImpl* connections, TensorImpl* permeance
848860
size_t input_cell_count = connections->size()/max_synapses_per_cell;
849861

850862
auto param_hash = hashify(input_cell_count, max_synapses_per_cell, permeances->dtype());
851-
std::string program_name = "sum" + param_hash;
863+
std::string program_name = "decaySynapses" + param_hash;
852864
if(kernel_manager_.exists(program_name) == false) {
853865
auto args = "-DNUM_CELLS="+str(input_cell_count) + " -DMAX_SYNAPSE_PER_CELL="+str(max_synapses_per_cell) +
854866
" -DPERM_TYPE="+to_ctype_string(permeances->dtype());

Etaler/Core/Tensor.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -346,8 +346,6 @@ Tensor et::cat(const svector<Tensor>& tensors, intmax_t dim)
346346

347347
// Produces a deep copy of this tensor by delegating to the active backend.
Tensor Tensor::copy() const
{
	auto impl = pimpl();
	return backend()->copy(impl);
}
353351

Etaler/Core/Tensor.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ inline Tensor realize(const Tensor& t)
249249
return t.realize();
250250
}
251251

252-
inline Tensor attempt_realize(const Tensor& t)
252+
inline Tensor ravel(const Tensor& t)
253253
{
254254
if(t.iscontiguous() == false)
255255
return t;

docs/source/PythonBindings.md

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
11
# Python bindings
22

3-
Currently there are no offical python support. The feature is planned. But nevertheless, you can use Etaler in Python via [ROOT](https://root.cern.ch) and it's automatic binding generation feature.
3+
## PyEtaler
4+
[PyEtaler](https://github.com/etaler/pyetaler) is the official binding for Etaler. We try to keep the Python API as close to the C++ one as possible, so you can use the C++ documentation as the Python documentation. With that said, some functions are changed in the binding to make it more Pythonic.
45

5-
## Example
6+
```python
7+
>>> from etaler import et
8+
>>> et.ones([2, 2])
9+
{{ 1, 1},
10+
{ 1, 1}}
11+
```
12+
13+
## ROOT
14+
15+
If cppyy is not available to you for any reason, you can use Etaler in Python via [ROOT](https://root.cern.ch) and its automatic binding generation feature.
616

717
```Python
818
# Load ROOT
@@ -35,16 +45,5 @@ print(t)
3545
"""
3646
{{ 1, 1},
3747
{ 1, 1}}
38-
3948
"""
40-
```
41-
42-
## PyEtaler
43-
The offical Python binding - [PyEtaler](https://guthub.com/etaler/pyetaler) in currently work in progress. We recomment using ROOT to bind from Python before PyEtaler leaves WIP.
44-
45-
```
46-
>>> from etaler import et
47-
>>> et.ones([2, 2])
48-
{{ 1, 1},
49-
{ 1, 1}}
5049
```

kernels/sum_local.cl

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#ifndef InType
2+
#error InType not defined
3+
#endif
4+
5+
#ifndef OutType
6+
#error OutType not defined
7+
#endif
8+
9+
#ifndef IntermidType
10+
#error IntermidType not defined
11+
#endif
12+
13+
#ifdef IntermidIsHalf
14+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
15+
#endif
16+
17+
// Just a sane default for GPUs, since querying the number of work-items per
// compute unit in OpenCL is quite awkward. TODO: this number needs to be
// changed for an FPGA or a VLIW processor (anything that is not SIMT).
20+
#define WORKITEM_PER_CU 64
21+
22+
// Work-group-local sum reduction. One work-group produces one output value:
// each work-item accumulates a strided slice of its chunk into a private sum,
// the partial sums are staged in local memory, then work-item 0 serially
// folds them into the final result.
//
// InType: input data type
// OutType: output data type
// in_size: number of elements of the input (unused here; kept so the host
//          code can set the same kernel arguments as for the generic sum kernel)
// chunk_size: for each chunk_size elements, produce 1 sum
// local_size: must equal WORKITEM_PER_CU
// group_size: must equal in_size/chunk_size
kernel void sum(global InType* restrict x, global OutType* restrict y, int in_size, int chunk_size)
{
	local IntermidType local_sum[WORKITEM_PER_CU];
	int group_id = get_group_id(0);
	int local_size = get_local_size(0);
	int local_id = get_local_id(0);

	// Strided accumulation: work-item `local_id` handles elements
	// start+local_id, start+local_id+local_size, ... within this group's chunk.
	IntermidType private_sum = 0;
	int start = chunk_size*group_id;
	for(int i = start+local_id; i < start+chunk_size; i += local_size)
		private_sum += x[i];
	local_sum[local_id] = private_sum;
	barrier(CLK_LOCAL_MEM_FENCE);

	// Reduce the individually computed partial sums into the final result.
	if(local_id == 0) {
		IntermidType s = 0;
		for(int i = 0; i < local_size; i++)
			s += local_sum[i];
		y[group_id] = s;
	}
}

0 commit comments

Comments
 (0)