Skip to content

Commit 46d5927

Browse files
arm_compute v17.05
1 parent c772c0b commit 46d5927

File tree

1,489 files changed

+10801
-10899
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,489 files changed

+10801
-10899
lines changed

LICENSE

Lines changed: 0 additions & 21 deletions
This file was deleted.

README.md

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,17 @@
11

22
Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues
33

4-
Documentation available here: [v17.04](https://arm-software.github.io/ComputeLibrary/v17.04/) [v17.03.1](https://arm-software.github.io/ComputeLibrary/v17.03.1/)
4+
Documentation available here:
55

6-
Binaries available here: [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz) [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)
6+
- [v17.05](https://arm-software.github.io/ComputeLibrary/v17.05/)
7+
- [v17.04](https://arm-software.github.io/ComputeLibrary/v17.04/)
8+
- [v17.03.1](https://arm-software.github.io/ComputeLibrary/v17.03.1/)
9+
10+
Binaries available here:
11+
12+
- [v17.05](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.05/arm_compute-v17.05-bin.tar.gz)
13+
- [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz)
14+
- [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)
715

816
917

SConstruct

Lines changed: 29 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,4 +20,32 @@
2020
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
2121
# SOFTWARE.
2222

23-
SConscript('sconscript', variant_dir='build', duplicate=0)
23+
import os
24+
25+
vars = Variables("scons")
26+
vars.AddVariables(
27+
BoolVariable("debug", "Debug", False),
28+
BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False),
29+
EnumVariable("arch", "Target Architecture", "armv7a", allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "x86_32", "x86_64")),
30+
EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "bare_metal")),
31+
EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile")),
32+
BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True),
33+
BoolVariable("opencl", "Enable OpenCL support", True),
34+
BoolVariable("neon", "Enable Neon support", False),
35+
BoolVariable("embed_kernels", "Embed OpenCL kernels in library binary", False),
36+
BoolVariable("set_soname", "Set the library's soname and shlibversion (requires SCons 2.4 or above)", False),
37+
BoolVariable("openmp", "Enable OpenMP backend", False),
38+
BoolVariable("cppthreads", "Enable C++11 threads backend", True),
39+
PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathIsDirCreate),
40+
("extra_cxx_flags", "Extra CXX flags to be appended to the build command", "")
41+
)
42+
43+
env = Environment(platform='posix', variables = vars, ENV = os.environ)
44+
45+
Help(vars.GenerateHelpText(env))
46+
47+
Export('vars')
48+
Export('env')
49+
50+
if not GetOption("help"):
51+
SConscript('sconscript', variant_dir='#build/%s/arm_compute' % env['build_dir'], duplicate=0)

arm_compute/core/CL/ICLTensor.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,9 @@ class ICLTensor : public ITensor
4343
ICLTensor();
4444
ICLTensor(const ICLTensor &) = delete;
4545
ICLTensor &operator=(const ICLTensor &) = delete;
46+
ICLTensor(ICLTensor &&) = default;
47+
ICLTensor &operator=(ICLTensor &&) = default;
48+
virtual ~ICLTensor() = default;
4649

4750
/** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the image's data.
4851
*

arm_compute/core/CL/OpenCL.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,4 +35,9 @@ namespace cl
3535
{
3636
static const NDRange Range_128_1 = NDRange(128, 1);
3737
}
38+
39+
namespace arm_compute
40+
{
41+
bool opencl_is_available();
42+
}
3843
#endif /* __ARM_COMPUTE_OPENCL_H__ */

arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ class ICLTensor;
4747
* \end{array} \right)
4848
* @f]
4949
*
50-
* After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
50+
* After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
5151
*/
5252
class CLGEMMInterleave4x4Kernel : public ICLKernel
5353
{
@@ -64,7 +64,7 @@ class CLGEMMInterleave4x4Kernel : public ICLKernel
6464
CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
6565
/** Initialise the kernel's input and output.
6666
*
67-
* @param[in] input Input tensor. Data types supported: U8/F16/F32
67+
* @param[in] input Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
6868
* @param[out] output Output tensor. Data type supported: same as @p input
6969
*/
7070
void configure(const ICLTensor *input, ICLTensor *output);

arm_compute/core/Helpers.h

Lines changed: 18 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -183,24 +183,37 @@ inline void for_each(F &&func, T &&arg, Ts &&... args)
183183
for_each(func, args...);
184184
}
185185

186-
/** Base case of foldl. Return value. */
186+
/** Base case of foldl.
187+
*
188+
* @return value.
189+
*/
187190
template <typename F, typename T>
188-
inline T foldl(F &&, T &&value)
191+
inline T foldl(F &&, const T &value)
189192
{
190193
return value;
191194
}
192195

196+
/** Base case of foldl.
197+
*
198+
* @return Function evaluation for value1 and value2
199+
*/
200+
template <typename F, typename T, typename U>
201+
inline auto foldl(F &&func, T &&value1, U &&value2) -> decltype(func(value1, value2))
202+
{
203+
return func(value1, value2);
204+
}
205+
193206
/** Fold left.
194207
*
195208
* @param[in] func Function to be called
196209
* @param[in] initial Initial value
197210
* @param[in] value Argument passed to the function
198211
* @param[in] values Remaining arguments
199212
*/
200-
template <typename F, typename I, typename T, typename... Ts>
201-
inline I foldl(F &&func, I &&initial, T &&value, Ts &&... values)
213+
template <typename F, typename I, typename T, typename... Vs>
214+
inline I foldl(F &&func, I &&initial, T &&value, Vs &&... values)
202215
{
203-
return foldl(func, func(initial, value), values...);
216+
return foldl(std::forward<F>(func), func(std::forward<I>(initial), std::forward<T>(value)), std::forward<Vs>(values)...);
204217
}
205218
}
206219

arm_compute/core/NEON/NEMath.h

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ const std::array<float32x4_t, 8> log_tab =
6464
*
6565
* @return The calculated inverse square root.
6666
*/
67-
inline float32x4_t vinvsqrt_f32(float32x4_t x)
67+
inline float32x4_t vinvsqrtq_f32(float32x4_t x)
6868
{
6969
float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
7070
sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
@@ -79,7 +79,7 @@ inline float32x4_t vinvsqrt_f32(float32x4_t x)
7979
*
8080
* @return The calculated reciprocal.
8181
*/
82-
inline float32x4_t vinv_f32(const float32x4_t &x)
82+
inline float32x4_t vinvq_f32(const float32x4_t &x)
8383
{
8484
float32x4_t recip = vrecpeq_f32(x);
8585
recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
@@ -94,7 +94,7 @@ inline float32x4_t vinv_f32(const float32x4_t &x)
9494
*
9595
* @return The calculated approximation.
9696
*/
97-
inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
97+
inline float32x4_t vtaylor_polyq_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
9898
{
9999
float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x);
100100
float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x);
@@ -112,7 +112,7 @@ inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float
112112
*
113113
* @return The calculated exponent.
114114
*/
115-
inline float32x4_t vexp_f32(const float32x4_t &x)
115+
inline float32x4_t vexpq_f32(const float32x4_t &x)
116116
{
117117
static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
118118
static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
@@ -122,7 +122,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
122122
float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
123123

124124
// Polynomial Approximation
125-
float32x4_t poly = vtaylor_poly_f32(val, exp_tab);
125+
float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
126126

127127
// Reconstruct
128128
poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
@@ -136,7 +136,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
136136
*
137137
* @return The calculated logarithm.
138138
*/
139-
inline float32x4_t vlog_f32(const float32x4_t &x)
139+
inline float32x4_t vlogq_f32(const float32x4_t &x)
140140
{
141141
static const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
142142
static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
@@ -146,7 +146,7 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
146146
float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
147147

148148
// Polynomial Approximation
149-
float32x4_t poly = vtaylor_poly_f32(val, log_tab);
149+
float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
150150

151151
// Reconstruct
152152
poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
@@ -158,19 +158,24 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
158158
*
159159
* tanh(x) = (e^2x - 1)/(e^2x + 1)
160160
*
161+
* @note We clamp x to [-5,5] to avoid overflow issues.
162+
*
161163
* @param val Input vector value in F32 format.
162164
*
163165
* @return The calculated Hyperbolic Tangent.
164166
*/
165-
inline float32x4_t vtanh_f32(const float32x4_t &val)
167+
inline float32x4_t vtanhq_f32(const float32x4_t &val)
166168
{
167-
static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
168-
static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
169+
static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
170+
static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
171+
static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-5.f); // -5.f
172+
static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(5.f); // 5.f
169173

170-
float32x4_t exp2x = vexp_f32(vmulq_f32(CONST_2, val));
174+
float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
175+
float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
171176
float32x4_t num = vsubq_f32(exp2x, CONST_1);
172177
float32x4_t den = vaddq_f32(exp2x, CONST_1);
173-
float32x4_t tanh = vmulq_f32(num, vinv_f32(den));
178+
float32x4_t tanh = vmulq_f32(num, vinvq_f32(den));
174179
return tanh;
175180
}
176181

@@ -185,7 +190,7 @@ inline float32x4_t vtanh_f32(const float32x4_t &val)
185190
*/
186191
inline float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)
187192
{
188-
return vexp_f32(vmulq_f32(n, vlog_f32(val)));
193+
return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
189194
}
190195
}
191196

arm_compute/core/NEON/kernels/NEColorConvertKernel.h

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -82,7 +82,6 @@ class NEColorConvertKernel : public INEKernel
8282
using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
8383
const void *_input;
8484
void *_output;
85-
unsigned int _num_elems_processed_per_iteration;
8685
ColorConvertFunction *_func;
8786
};
8887
}

arm_compute/core/NEON/kernels/NEHistogramKernel.h

Lines changed: 4 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -97,13 +97,13 @@ class NEHistogramKernel : public INEKernel
9797
*
9898
* @param[in] win Region on which to execute the kernel
9999
*/
100-
void histogram_U8(const Window &win);
100+
void histogram_U8(Window win);
101101
/** Function to perform histogram on the given window where histogram is
102102
* of fixed size 256 without ranges and offsets.
103103
*
104104
* @param[in] win Region on which to execute the kernel
105105
*/
106-
void histogram_fixed_U8(const Window &win);
106+
void histogram_fixed_U8(Window win);
107107
/** Pre-calculate the pixel windowing for every possible pixel
108108
*
109109
* Calculate (V - offset) * numBins / range where V is every possible pixel value.
@@ -115,88 +115,15 @@ class NEHistogramKernel : public INEKernel
115115
*
116116
* @param[in] window Region on which to execute the kernel.
117117
*/
118-
using HistogramFunction = void (NEHistogramKernel::*)(const Window &window);
119-
/** Histogram function to use for the particular image types passed to configure() */
120-
HistogramFunction _func;
118+
using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window);
121119

122-
private:
120+
HistogramFunctionPtr _func; ///< Histogram function to use for the particular image types passed to configure()
123121
const IImage *_input;
124122
IDistribution1D *_output;
125123
uint32_t *_local_hist;
126124
uint32_t *_window_lut;
127125
std::mutex _hist_mtx;
128126
static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
129127
};
130-
131-
/** Interface for the histogram border handling kernel.
132-
*
133-
* @note If the image width is not a multiple of the number of elements processed by @ref NEHistogramKernel
134-
* this kernel is used to handle the leftover columns.
135-
*/
136-
class NEHistogramBorderKernel : public INEKernel
137-
{
138-
public:
139-
/** Default constructor */
140-
NEHistogramBorderKernel();
141-
/** Prevent instances of this class from being copied (As this class contains pointers) */
142-
NEHistogramBorderKernel(const NEHistogramBorderKernel &) = delete;
143-
/** Prevent instances of this class from being copied (As this class contains pointers) */
144-
NEHistogramBorderKernel &operator=(const NEHistogramBorderKernel &) = delete;
145-
/** Allow instances of this class to be moved */
146-
NEHistogramBorderKernel(NEHistogramBorderKernel &&) = default;
147-
/** Allow instances of this class to be moved */
148-
NEHistogramBorderKernel &operator=(NEHistogramBorderKernel &&) = default;
149-
/** Default destructor */
150-
~NEHistogramBorderKernel() = default;
151-
152-
/** Set the input image and the distribution output.
153-
*
154-
* @param[in] input Source image. Data type supported: U8.
155-
* @param[out] output Destination distribution.
156-
* @param[in] window_lut LUT with precalculated possible window values.
157-
* @param[in] hist_elements_per_thread Pixels per thread that the histogram kernel computes.
158-
*/
159-
void configure(const IImage *input, IDistribution1D *output, uint32_t *window_lut, const unsigned int hist_elements_per_thread);
160-
/** Set the input image and the distribution output.
161-
*
162-
* @note Used for histogram of fixed size equal to 256
163-
*
164-
* @param[in] input Source image. Data type supported: U8.
165-
* @param[out] output Destination distribution.
166-
* @param[in] hist_elements_per_thread Pixels per thread that the histogram kernel computes.
167-
*/
168-
void configure(const IImage *input, IDistribution1D *output, const unsigned int hist_elements_per_thread);
169-
170-
// Inherited methods overridden:
171-
void run(const Window &window) override;
172-
bool is_parallelisable() const override;
173-
174-
private:
175-
/** Function to perform histogram on the given window
176-
*
177-
* @param[in] win Region on which to execute the kernel
178-
*/
179-
void histogram_U8(const Window &win);
180-
/** Function to perform histogram on the given window where histogram is
181-
* of fixed size 256 without ranges and offsets.
182-
*
183-
* @param[in] win Region on which to execute the kernel
184-
*/
185-
void histogram_fixed_U8(const Window &win);
186-
/** Common signature for all the specialised Histogram functions
187-
*
188-
* @param[in] window Region on which to execute the kernel.
189-
*/
190-
using HistogramBorderFunction = void (NEHistogramBorderKernel::*)(const Window &window);
191-
/** Histogram function to use for the particular image types passed to configure() */
192-
HistogramBorderFunction _func;
193-
194-
private:
195-
const IImage *_input;
196-
IDistribution1D *_output;
197-
uint32_t *_window_lut;
198-
static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
199-
};
200128
}
201-
202129
#endif /*__ARM_COMPUTE_NEHISTOGRAMKERNEL_H__ */

0 commit comments

Comments
 (0)