Skip to content

Commit 9f18299

Browse files
authored
Merge pull request #120 from marty1885/master
sum() performance optimizations
2 parents 55fbab5 + 8583791 commit 9f18299

File tree

7 files changed

+127
-32
lines changed

7 files changed

+127
-32
lines changed

Etaler/Algorithms/Boost.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,9 @@ static Tensor boost(const Tensor& activity, const Tensor& average_activity, floa
1717
return cast(boostFactor(average_activity, target_activity, boost_factor)*activity, DType::Int32);
1818
}
1919

20+
// Boosts cell activity using a logarithmic boosting factor.
// NOTE(review): the formula takes log(average_activity - target_activity),
// which is negative-valued (NaN under log) whenever a cell's average activity
// falls below the target — confirm this is the intended behavior upstream.
static Tensor logarithmicBoost(const Tensor& activity, const Tensor& average_activity, float target_activity, float target_density, int active_threshold)
{
	// Count of cells currently firing above the threshold.
	int active_count = sum(activity > active_threshold).item<int>();
	// Per-cell scaling factor; division order matches the original expression.
	auto scale = log(average_activity - target_activity) / log(target_density - active_count);
	return activity * scale;
}
24+
2025
}

Etaler/Backends/CPUBackend.cpp

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <tbb/parallel_for.h>
1010
#include <tbb/blocked_range.h>
1111
#include <tbb/parallel_sort.h>
12+
#include <tbb/parallel_reduce.h>
1213

1314
using namespace et;
1415

@@ -57,7 +58,9 @@ void* CPUBuffer::data() const
5758
return std::visit([](const auto& v){return (void*)v;}, storage_);
5859
}
5960

60-
template <typename TypeList = type_list_t<int32_t, float, bool, half>, typename Func = void>
61+
using DefaultTypeList = type_list_t<int32_t, float, bool, half>;
62+
63+
template <typename TypeList = DefaultTypeList, typename Func = void>
6164
inline void dispatch(DType dtype, Func f)
6265
{
6366
static_assert(std::is_same_v<Func, void> == false); //void is just a dummy value
@@ -73,6 +76,18 @@ inline void dispatch(DType dtype, Func f)
7376
throw EtError("Cannot dispatch such dtype: " + to_ctype_string(dtype));
7477
}
7578

79+
template <typename TL1 = DefaultTypeList, typename TL2 = DefaultTypeList, typename Func = void>
80+
inline void dispatch2d(DType t1, DType t2, Func f)
81+
{
82+
dispatch<TL1>(t1, [&](auto v1){
83+
using T1 = decltype(v1);
84+
dispatch<TL2>(t2, [&](auto v2){
85+
using T2 = decltype(v2);
86+
f(T1(), T2());
87+
});
88+
});
89+
}
90+
7691
namespace et::detail
7792
{
7893
template <typename PermType>
@@ -624,21 +639,38 @@ std::shared_ptr<TensorImpl> CPUBackend::sum(const TensorImpl* x, size_t chunk_si
624639
}();
625640
}
626641

627-
auto res = createTensor({(intmax_t)(x->size()/chunk_size)}, result_dtype);
642+
size_t result_size = x->size()/chunk_size;
643+
auto res = createTensor({intmax_t(result_size)}, result_dtype);
628644

629-
dispatch(x->dtype(), [&](auto v){
630-
using T = decltype(v);
631-
auto in = (const T*)x->data();
632-
dispatch(result_dtype, [&](auto v) {
633-
using ResType = decltype(v);
645+
// Optimized case for summing everything
646+
if(result_size == 1) {
647+
dispatch2d(x->dtype(), result_dtype, [&](auto v1, auto v2) {
648+
using T = decltype(v1);
649+
auto in = (const T*)x->data();
650+
using ResType = decltype(v2);
651+
auto ptr = (ResType*) res->data();
652+
*ptr = tbb::parallel_reduce(tbb::blocked_range(in, in+x->size()), ResType(0)
653+
, [](const auto& r, ResType init){
654+
return std::accumulate(r.begin(), r.end(), init);
655+
},
656+
[](auto x, auto y) {
657+
return x + y;
658+
});
659+
});
660+
}
661+
else {
662+
dispatch2d(x->dtype(), result_dtype, [&](auto v1, auto v2) {
663+
using T = decltype(v1);
664+
auto in = (const T*)x->data();
665+
using ResType = decltype(v2);
634666
auto ptr = (ResType*) res->data();
635667
tbb::parallel_for(size_t(0), size_t(x->size()/chunk_size), [&](size_t i) {
636668
size_t offset = i*chunk_size;
637669
ResType s = std::accumulate(in+offset, in+offset+chunk_size, ResType(0));
638670
ptr[i] = s;
639671
});
640672
});
641-
});
673+
}
642674

643675
return res;
644676
}

Etaler/Backends/OpenCLBackend.cpp

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -813,26 +813,38 @@ std::shared_ptr<TensorImpl> OpenCLBackend::sum(const TensorImpl* x, size_t chunk
813813
return DType::Int32;
814814
}(x->dtype(), result_dtype);
815815

816-
auto param_hash = hashify(x->dtype(), result_dtype, intermid_type);
816+
intmax_t result_size = intmax_t(x->size()/chunk_size);
817+
bool use_local_kernel = result_size <= numComputeUnits(); // Whether to use the kernel optimized for producing a small number of results
818+
auto param_hash = hashify(x->dtype(), result_dtype, intermid_type, use_local_kernel);
817819
std::string program_name = "sum" + param_hash;
818820
if(kernel_manager_.exists(program_name) == false) {
819821
std::string args = "-DInType=" + to_ctype_string(x->dtype()) + " -DOutType=" + to_ctype_string(result_dtype) + " -DIntermidType=" + to_ctype_string(intermid_type)
820822
+ (intermid_type==DType::Half? " -DIntermidIsHalf" : "");
821-
kernel_manager_.compileFromFile("sum.cl", program_name, {"sum"}, false, args);
823+
824+
if(use_local_kernel)
825+
kernel_manager_.compileFromFile("sum_local.cl", program_name, {"sum"}, false, args);
826+
else
827+
kernel_manager_.compileFromFile("sum.cl", program_name, {"sum"}, false, args);
822828
}
823829

824830
cl::Kernel k = kernel_manager_.kernel(program_name, "sum");
825-
826-
auto res = createTensor({intmax_t(x->size()/chunk_size)}, result_dtype);
831+
auto res = createTensor({result_size}, result_dtype);
827832

828833
k.setArg(0, std::static_pointer_cast<const OpenCLBuffer>(x->buffer())->buffer());
829834
k.setArg(1, std::static_pointer_cast<OpenCLBuffer>(res->buffer())->buffer());
830835
k.setArg(2, int(x->size()));
831836
k.setArg(3, int(chunk_size));
832837

833-
size_t local_size = 128;
834-
835-
cl_int err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(selectWorkSize(4096, local_size, x->size()/chunk_size)), cl::NDRange(local_size));
838+
cl_int err = CL_SUCCESS;
839+
if(use_local_kernel) {
840+
size_t local_size = 64; // the same value set in sum_local.cl
841+
err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(local_size*result_size), cl::NDRange(local_size));
842+
}
843+
else {
844+
size_t local_size = 128;
845+
err = queue_.enqueueNDRangeKernel(k, cl::NullRange, cl::NDRange(selectWorkSize(4096, local_size, x->size()/chunk_size)), cl::NDRange(local_size));
846+
}
847+
836848
if(err != CL_SUCCESS)
837849
throw EtError("OpenCL kernel execution failed. Code " + str(err));
838850
return res;
@@ -848,7 +860,7 @@ void OpenCLBackend::decaySynapses(TensorImpl* connections, TensorImpl* permeance
848860
size_t input_cell_count = connections->size()/max_synapses_per_cell;
849861

850862
auto param_hash = hashify(input_cell_count, max_synapses_per_cell, permeances->dtype());
851-
std::string program_name = "sum" + param_hash;
863+
std::string program_name = "decaySynapses" + param_hash;
852864
if(kernel_manager_.exists(program_name) == false) {
853865
auto args = "-DNUM_CELLS="+str(input_cell_count) + " -DMAX_SYNAPSE_PER_CELL="+str(max_synapses_per_cell) +
854866
" -DPERM_TYPE="+to_ctype_string(permeances->dtype());

Etaler/Core/Tensor.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -346,8 +346,6 @@ Tensor et::cat(const svector<Tensor>& tensors, intmax_t dim)
346346

347347
// Produces a deep copy of this tensor by delegating to the active backend.
Tensor Tensor::copy() const
{
	auto impl = pimpl();
	return backend()->copy(impl);
}
353351

Etaler/Core/Tensor.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ inline Tensor realize(const Tensor& t)
249249
return t.realize();
250250
}
251251

252-
inline Tensor attempt_realize(const Tensor& t)
252+
inline Tensor ravel(const Tensor& t)
253253
{
254254
if(t.iscontiguous() == false)
255255
return t;

docs/source/PythonBindings.md

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
11
# Python bindings
22

3-
Currently there are no offical python support. The feature is planned. But nevertheless, you can use Etaler in Python via [ROOT](https://root.cern.ch) and it's automatic binding generation feature.
3+
## PyEtaler
4+
[PyEtaler](https://github.com/etaler/pyetaler) is the official binding for Etaler. We try to keep the Python API as close to the C++ one as possible, so you can use the C++ documentation as the Python documentation. With that said, some functions are changed in the binding to make it more Pythonic.
45

5-
## Example
6+
```python
7+
>>> from etaler import et
8+
>>> et.ones([2, 2])
9+
{{ 1, 1},
10+
{ 1, 1}}
11+
```
12+
13+
## ROOT
14+
15+
If cppyy is not available to you for any reason, you can use Etaler in Python via [ROOT](https://root.cern.ch) and its automatic binding generation feature.
616

717
```Python
818
# Load ROOT
@@ -35,16 +45,5 @@ print(t)
3545
"""
3646
{{ 1, 1},
3747
{ 1, 1}}
38-
3948
"""
40-
```
41-
42-
## PyEtaler
43-
The offical Python binding - [PyEtaler](https://guthub.com/etaler/pyetaler) in currently work in progress. We recomment using ROOT to bind from Python before PyEtaler leaves WIP.
44-
45-
```
46-
>>> from etaler import et
47-
>>> et.ones([2, 2])
48-
{{ 1, 1},
49-
{ 1, 1}}
5049
```

kernels/sum_local.cl

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#ifndef InType
2+
#error InType not defined
3+
#endif
4+
5+
#ifndef OutType
6+
#error OutType not defined
7+
#endif
8+
9+
#ifndef IntermidType
10+
#error IntermidType not defined
11+
#endif
12+
13+
#ifdef IntermidIsHalf
14+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
15+
#endif
16+
17+
// Just a sane default for GPUs, since querying the number of work-items per
// compute unit in OpenCL is quite awkward. TODO: this number needs to be
// changed for an FPGA or a VLIW processor (anything that is not SIMT).
20+
#define WORKITEM_PER_CU 64
21+
22+
// Work-group-local sum reduction. One work-group produces one output value:
// each work-item accumulates a strided slice of its chunk into a private sum,
// the partial sums are staged in local memory, then work-item 0 serially
// folds them into the final result.
//
// InType: input data type
// OutType: output data type
// in_size: number of elements of the input (unused here; kept so the host
//          code can set the same kernel arguments as for the generic sum kernel)
// chunk_size: for each chunk_size elements, produce 1 sum
// local_size: must equal WORKITEM_PER_CU
// group_size: must equal in_size/chunk_size
kernel void sum(global InType* restrict x, global OutType* restrict y, int in_size, int chunk_size)
{
	local IntermidType local_sum[WORKITEM_PER_CU];
	int group_id = get_group_id(0);
	int local_size = get_local_size(0);
	int local_id = get_local_id(0);

	// Strided accumulation: work-item `local_id` handles elements
	// start+local_id, start+local_id+local_size, ... within this group's chunk.
	IntermidType private_sum = 0;
	int start = chunk_size*group_id;
	for(int i = start+local_id; i < start+chunk_size; i += local_size)
		private_sum += x[i];
	local_sum[local_id] = private_sum;
	barrier(CLK_LOCAL_MEM_FENCE);

	// Reduce the individually computed partial sums into the final result.
	if(local_id == 0) {
		IntermidType s = 0;
		for(int i = 0; i < local_size; i++)
			s += local_sum[i];
		y[group_id] = s;
	}
}

0 commit comments

Comments
 (0)