ARM-software
diff --git a/‎LICENSE‎
Lines changed: 0 additions & 21 deletions b/‎LICENSE‎
Lines changed: 0 additions & 21 deletions
diff --git a/‎README.md‎
Lines changed: 10 additions & 2 deletions b/‎README.md‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎SConstruct‎
Lines changed: 29 additions & 1 deletion b/‎SConstruct‎
Lines changed: 29 additions & 1 deletion
diff --git a/‎arm_compute/core/CL/ICLTensor.h‎
Lines changed: 3 additions & 0 deletions b/‎arm_compute/core/CL/ICLTensor.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎arm_compute/core/CL/OpenCL.h‎
Lines changed: 5 additions & 0 deletions b/‎arm_compute/core/CL/OpenCL.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h‎
Lines changed: 2 additions & 2 deletions b/‎arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎arm_compute/core/Helpers.h‎
Lines changed: 18 additions & 5 deletions b/‎arm_compute/core/Helpers.h‎
Lines changed: 18 additions & 5 deletions
diff --git a/‎arm_compute/core/NEON/NEMath.h‎
Lines changed: 18 additions & 13 deletions b/‎arm_compute/core/NEON/NEMath.h‎
Lines changed: 18 additions & 13 deletions
diff --git a/‎arm_compute/core/NEON/kernels/NEColorConvertKernel.h‎
Lines changed: 0 additions & 1 deletion b/‎arm_compute/core/NEON/kernels/NEColorConvertKernel.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎arm_compute/core/NEON/kernels/NEHistogramKernel.h‎
Lines changed: 4 additions & 77 deletions b/‎arm_compute/core/NEON/kernels/NEHistogramKernel.h‎
Lines changed: 4 additions & 77 deletions
@@ -1,9 +1,17 @@
 
 Please report issues here: https://github.com/ARM-software/ComputeLibrary/issues
 
-Documentation available here: [v17.04](https://arm-software.github.io/ComputeLibrary/v17.04/) [v17.03.1](https://arm-software.github.io/ComputeLibrary/v17.03.1/)
+Documentation available here: 
 
-Binaries available here: [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz) [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)
+- [v17.05](https://arm-software.github.io/ComputeLibrary/v17.05/)
+- [v17.04](https://arm-software.github.io/ComputeLibrary/v17.04/)
+- [v17.03.1](https://arm-software.github.io/ComputeLibrary/v17.03.1/)
+
+Binaries available here: 
+
+- [v17.05](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.05/arm_compute-v17.05-bin.tar.gz)
+- [v17.04](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.04/arm_compute-v17.04-bin.tar.gz)
+- [v17.03.1](https://github.com/ARM-software/ComputeLibrary/releases/download/v17.03.1/arm_compute-v17.03.1-bin.tar.gz)
 
 Support: [email protected]
 
 
@@ -20,4 +20,32 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-SConscript('sconscript', variant_dir='build', duplicate=0)
+import os
+
+vars = Variables("scons")
+vars.AddVariables(
+    BoolVariable("debug", "Debug", False),
+    BoolVariable("asserts", "Enable asserts (this flag is forced to 1 for debug=1)", False),
+    EnumVariable("arch", "Target Architecture", "armv7a", allowed_values=("armv7a", "arm64-v8a", "arm64-v8.2-a", "x86_32", "x86_64")),
+    EnumVariable("os", "Target OS", "linux", allowed_values=("linux", "android", "bare_metal")),
+    EnumVariable("build", "Build type", "cross_compile", allowed_values=("native", "cross_compile")),
+    BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True),
+    BoolVariable("opencl", "Enable OpenCL support", True),
+    BoolVariable("neon", "Enable Neon support", False),
+    BoolVariable("embed_kernels", "Embed OpenCL kernels in library binary", False),
+    BoolVariable("set_soname", "Set the library's soname and shlibversion (requires SCons 2.4 or above)", False),
+    BoolVariable("openmp", "Enable OpenMP backend", False),
+    BoolVariable("cppthreads", "Enable C++11 threads backend", True),
+    PathVariable("build_dir", "Specify sub-folder for the build", ".", PathVariable.PathIsDirCreate),
+    ("extra_cxx_flags", "Extra CXX flags to be appended to the build command", "")
+)
+
+env = Environment(platform='posix', variables = vars, ENV = os.environ)
+
+Help(vars.GenerateHelpText(env))
+
+Export('vars')
+Export('env')
+
+if not GetOption("help"):
+    SConscript('sconscript', variant_dir='#build/%s/arm_compute' % env['build_dir'], duplicate=0)
@@ -43,6 +43,9 @@ class ICLTensor : public ITensor
     ICLTensor();
     ICLTensor(const ICLTensor &) = delete;
     ICLTensor &operator=(const ICLTensor &) = delete;
+    ICLTensor(ICLTensor &&)                 = default;
+    ICLTensor &operator=(ICLTensor &&) = default;
+    virtual ~ICLTensor()               = default;
 
     /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the image's data.
      *
 
@@ -35,4 +35,9 @@ namespace cl
 {
 static const NDRange Range_128_1 = NDRange(128, 1);
 }
+
+namespace arm_compute
+{
+bool opencl_is_available();
+}
 #endif /* __ARM_COMPUTE_OPENCL_H__ */
@@ -47,7 +47,7 @@ class ICLTensor;
  * \end{array} \right)
  * @f]
  *
- * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]
+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
  */
 class CLGEMMInterleave4x4Kernel : public ICLKernel
 {
@@ -64,7 +64,7 @@ class CLGEMMInterleave4x4Kernel : public ICLKernel
     CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/F16/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
 
@@ -183,24 +183,37 @@ inline void for_each(F &&func, T &&arg, Ts &&... args)
     for_each(func, args...);
 }
 
-/** Base case of foldl. Return value. */
+/** Base case of foldl.
+ *
+ * @return value.
+ */
 template <typename F, typename T>
-inline T foldl(F &&, T &&value)
+inline T foldl(F &&, const T &value)
 {
     return value;
 }
 
+/** Base case of foldl for exactly two values.
+ *
+ * @return Function evaluation for value1 and value2
+ */
+template <typename F, typename T, typename U>
+inline auto foldl(F &&func, T &&value1, U &&value2) -> decltype(func(value1, value2))
+{
+    return func(value1, value2);
+}
+
 /** Fold left.
  *
  * @param[in] func    Function to be called
  * @param[in] initial Initial value
  * @param[in] value   Argument passed to the function
  * @param[in] values  Remaining arguments
  */
-template <typename F, typename I, typename T, typename... Ts>
-inline I foldl(F &&func, I &&initial, T &&value, Ts &&... values)
+template <typename F, typename I, typename T, typename... Vs>
+inline I foldl(F &&func, I &&initial, T &&value, Vs &&... values)
 {
-    return foldl(func, func(initial, value), values...);
+    return foldl(std::forward<F>(func), func(std::forward<I>(initial), std::forward<T>(value)), std::forward<Vs>(values)...);
 }
 }
 
 
@@ -64,7 +64,7 @@ const std::array<float32x4_t, 8> log_tab =
  *
  * @return The calculated inverse square root.
  */
-inline float32x4_t vinvsqrt_f32(float32x4_t x)
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
 {
     float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
     sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
@@ -79,7 +79,7 @@ inline float32x4_t vinvsqrt_f32(float32x4_t x)
  *
  * @return The calculated reciprocal.
  */
-inline float32x4_t vinv_f32(const float32x4_t &x)
+inline float32x4_t vinvq_f32(const float32x4_t &x)
 {
     float32x4_t recip = vrecpeq_f32(x);
     recip             = vmulq_f32(vrecpsq_f32(x, recip), recip);
@@ -94,7 +94,7 @@ inline float32x4_t vinv_f32(const float32x4_t &x)
  *
  * @return The calculated approximation.
  */
-inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
+inline float32x4_t vtaylor_polyq_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)
 {
     float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
     float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
@@ -112,7 +112,7 @@ inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float
  *
  * @return The calculated exponent.
  */
-inline float32x4_t vexp_f32(const float32x4_t &x)
+inline float32x4_t vexpq_f32(const float32x4_t &x)
 {
     static const float32x4_t CONST_LN2     = vdupq_n_f32(0.6931471805f); // ln(2)
     static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
@@ -122,7 +122,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
     float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
 
     // Polynomial Approximation
-    float32x4_t poly = vtaylor_poly_f32(val, exp_tab);
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
 
     // Reconstruct
     poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
@@ -136,7 +136,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)
  *
  * @return The calculated logarithm.
  */
-inline float32x4_t vlog_f32(const float32x4_t &x)
+inline float32x4_t vlogq_f32(const float32x4_t &x)
 {
     static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
     static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
@@ -146,7 +146,7 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
     float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
 
     // Polynomial Approximation
-    float32x4_t poly = vtaylor_poly_f32(val, log_tab);
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
 
     // Reconstruct
     poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
@@ -158,19 +158,24 @@ inline float32x4_t vlog_f32(const float32x4_t &x)
  *
  * tanh(x) = (e^2x - 1)/(e^2x + 1)
  *
+ * @note We clamp x to [-5,5] to avoid overflowing issues.
+ *
  * @param val Input vector value in F32 format.
  *
  * @return The calculated Hyperbolic Tangent.
  */
-inline float32x4_t vtanh_f32(const float32x4_t &val)
+inline float32x4_t vtanhq_f32(const float32x4_t &val)
 {
-    static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f
-    static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f
+    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);  // 1.f
+    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);  // 2.f
+    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-5.f); // -5.f
+    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(5.f);  // 5.f
 
-    float32x4_t exp2x = vexp_f32(vmulq_f32(CONST_2, val));
+    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
     float32x4_t num   = vsubq_f32(exp2x, CONST_1);
     float32x4_t den   = vaddq_f32(exp2x, CONST_1);
-    float32x4_t tanh  = vmulq_f32(num, vinv_f32(den));
+    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
     return tanh;
 }
 
@@ -185,7 +190,7 @@ inline float32x4_t vtanh_f32(const float32x4_t &val)
  */
 inline float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)
 {
-    return vexp_f32(vmulq_f32(n, vlog_f32(val)));
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
 }
 }
 
 
@@ -82,7 +82,6 @@ class NEColorConvertKernel : public INEKernel
     using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
     const void           *_input;
     void                 *_output;
-    unsigned int          _num_elems_processed_per_iteration;
     ColorConvertFunction *_func;
 };
 }
 
@@ -97,13 +97,13 @@ class NEHistogramKernel : public INEKernel
       *
      *  @param[in] win Region on which to execute the kernel
      */
-    void histogram_U8(const Window &win);
+    void histogram_U8(Window win);
     /** Function to perform histogram on the given window where histogram is
      *         of fixed size 256 without ranges and offsets.
      *
      *  @param[in] win Region on which to execute the kernel
      */
-    void histogram_fixed_U8(const Window &win);
+    void histogram_fixed_U8(Window win);
     /** Pre-calculate the pixel windowing for every possible pixel
      *
      * Calculate (V - offset) * numBins / range where V is every possible pixel value.
@@ -115,88 +115,15 @@ class NEHistogramKernel : public INEKernel
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    using HistogramFunction = void (NEHistogramKernel::*)(const Window &window);
-    /** Histogram function to use for the particular image types passed to configure() */
-    HistogramFunction _func;
+    using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window);
 
-private:
+    HistogramFunctionPtr          _func; ///< Histogram function to use for the particular image types passed to configure()
     const IImage                 *_input;
     IDistribution1D              *_output;
     uint32_t                     *_local_hist;
     uint32_t                     *_window_lut;
     std::mutex                    _hist_mtx;
     static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
 };
-
-/** Interface for the histogram border handling kernel.
- *
- * @note If the image width is not a multiple of the number of elements processed by @ref NEHistogramKernel
- * this kernel is used to handle the leftover columns.
- */
-class NEHistogramBorderKernel : public INEKernel
-{
-public:
-    /** Default constructor */
-    NEHistogramBorderKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHistogramBorderKernel(const NEHistogramBorderKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHistogramBorderKernel &operator=(const NEHistogramBorderKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEHistogramBorderKernel(NEHistogramBorderKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEHistogramBorderKernel &operator=(NEHistogramBorderKernel &&) = default;
-    /** Default destructor */
-    ~NEHistogramBorderKernel() = default;
-
-    /** Set the input image and the distribution output.
-     *
-     * @param[in]  input                    Source image. Data type supported: U8.
-     * @param[out] output                   Destination distribution.
-     * @param[in]  window_lut               LUT with precalculated possible window values.
-     * @param[in]  hist_elements_per_thread Pixels per thread that the histogram kernel computes.
-     */
-    void configure(const IImage *input, IDistribution1D *output, uint32_t *window_lut, const unsigned int hist_elements_per_thread);
-    /** Set the input image and the distribution output.
-     *
-     * @note Used for histogram of fixed size equal to 256
-     *
-     * @param[in]  input                    Source image. Data type supported: U8.
-     * @param[out] output                   Destination distribution.
-     * @param[in]  hist_elements_per_thread Pixels per thread that the histogram kernel computes.
-     */
-    void configure(const IImage *input, IDistribution1D *output, const unsigned int hist_elements_per_thread);
-
-    // Inherited methods overridden:
-    void run(const Window &window) override;
-    bool is_parallelisable() const override;
-
-private:
-    /** Function to perform histogram on the given window
-      *
-     *  @param[in] win Region on which to execute the kernel
-     */
-    void histogram_U8(const Window &win);
-    /** Function to perform histogram on the given window where histogram is
-     *  of fixed size 256 without ranges and offsets.
-     *
-     *  @param[in] win Region on which to execute the kernel
-     */
-    void histogram_fixed_U8(const Window &win);
-    /** Common signature for all the specialised Histogram functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using HistogramBorderFunction = void (NEHistogramBorderKernel::*)(const Window &window);
-    /** Histogram function to use for the particular image types passed to configure() */
-    HistogramBorderFunction _func;
-
-private:
-    const IImage                 *_input;
-    IDistribution1D              *_output;
-    uint32_t                     *_window_lut;
-    static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
-};
 }
-
 #endif /*__ARM_COMPUTE_NEHISTOGRAMKERNEL_H__ */
Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,9 @@ class ICLTensor : public ITensor`
`43`	`43`	`ICLTensor();`
`44`	`44`	`ICLTensor(const ICLTensor &) = delete;`
`45`	`45`	`ICLTensor &operator=(const ICLTensor &) = delete;`
	`46`	`+ ICLTensor(ICLTensor &&) = default;`
	`47`	`+ ICLTensor &operator=(ICLTensor &&) = default;`
	`48`	`+ virtual ~ICLTensor() = default;`
`46`	`49`
`47`	`50`	`/** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the image's data.`
`48`	`51`	`*`
Original file line number	Diff line number	Diff line change
`@@ -35,4 +35,9 @@ namespace cl`
`35`	`35`	`{`
`36`	`36`	`static const NDRange Range_128_1 = NDRange(128, 1);`
`37`	`37`	`}`
	`38`	`+`
	`39`	`+namespace arm_compute`
	`40`	`+{`
	`41`	`+bool opencl_is_available();`
	`42`	`+}`
`38`	`43`	`#endif /* __ARM_COMPUTE_OPENCL_H__ */`
Original file line number	Diff line number	Diff line change
`@@ -47,7 +47,7 @@ class ICLTensor;`
`47`	`47`	`* \end{array} \right)`
`48`	`48`	`* @f]`
`49`	`49`	`*`
`50`		`- * After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ]`
	`50`	`+ * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]`
`51`	`51`	`*/`
`52`	`52`	`class CLGEMMInterleave4x4Kernel : public ICLKernel`
`53`	`53`	`{`
`@@ -64,7 +64,7 @@ class CLGEMMInterleave4x4Kernel : public ICLKernel`
`64`	`64`	`CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default;`
`65`	`65`	`/** Initialise the kernel's input and output.`
`66`	`66`	`*`
`67`		`- * @param[in] input Input tensor. Data types supported: U8/F16/F32`
	`67`	`+ * @param[in] input Input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32`
`68`	`68`	`* @param[out] output Output tensor. Data type supported: same as @p input`
`69`	`69`	`*/`
`70`	`70`	`void configure(const ICLTensor *input, ICLTensor *output);`
Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ const std::array<float32x4_t, 8> log_tab =`
`64`	`64`	`*`
`65`	`65`	`* @return The calculated inverse square root.`
`66`	`66`	`*/`
`67`		`-inline float32x4_t vinvsqrt_f32(float32x4_t x)`
	`67`	`+inline float32x4_t vinvsqrtq_f32(float32x4_t x)`
`68`	`68`	`{`
`69`	`69`	`float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);`
`70`	`70`	`sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);`
`@@ -79,7 +79,7 @@ inline float32x4_t vinvsqrt_f32(float32x4_t x)`
`79`	`79`	`*`
`80`	`80`	`* @return The calculated reciprocal.`
`81`	`81`	`*/`
`82`		`-inline float32x4_t vinv_f32(const float32x4_t &x)`
	`82`	`+inline float32x4_t vinvq_f32(const float32x4_t &x)`
`83`	`83`	`{`
`84`	`84`	`float32x4_t recip = vrecpeq_f32(x);`
`85`	`85`	`recip = vmulq_f32(vrecpsq_f32(x, recip), recip);`
`@@ -94,7 +94,7 @@ inline float32x4_t vinv_f32(const float32x4_t &x)`
`94`	`94`	`*`
`95`	`95`	`* @return The calculated approximation.`
`96`	`96`	`*/`
`97`		`-inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)`
	`97`	`+inline float32x4_t vtaylor_polyq_f32(const float32x4_t &x, const std::array<float32x4_t, 8> &coeffs)`
`98`	`98`	`{`
`99`	`99`	`float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x);`
`100`	`100`	`float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x);`
`@@ -112,7 +112,7 @@ inline float32x4_t vtaylor_poly_f32(const float32x4_t &x, const std::array<float`
`112`	`112`	`*`
`113`	`113`	`* @return The calculated exponent.`
`114`	`114`	`*/`
`115`		`-inline float32x4_t vexp_f32(const float32x4_t &x)`
	`115`	`+inline float32x4_t vexpq_f32(const float32x4_t &x)`
`116`	`116`	`{`
`117`	`117`	`static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)`
`118`	`118`	`static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)`
`@@ -122,7 +122,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)`
`122`	`122`	`float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);`
`123`	`123`
`124`	`124`	`// Polynomial Approximation`
`125`		`- float32x4_t poly = vtaylor_poly_f32(val, exp_tab);`
	`125`	`+ float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);`
`126`	`126`
`127`	`127`	`// Reconstruct`
`128`	`128`	`poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));`
`@@ -136,7 +136,7 @@ inline float32x4_t vexp_f32(const float32x4_t &x)`
`136`	`136`	`*`
`137`	`137`	`* @return The calculated logarithm.`
`138`	`138`	`*/`
`139`		`-inline float32x4_t vlog_f32(const float32x4_t &x)`
	`139`	`+inline float32x4_t vlogq_f32(const float32x4_t &x)`
`140`	`140`	`{`
`141`	`141`	`static const int32x4_t CONST_127 = vdupq_n_s32(127); // 127`
`142`	`142`	`static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)`
`@@ -146,7 +146,7 @@ inline float32x4_t vlog_f32(const float32x4_t &x)`
`146`	`146`	`float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));`
`147`	`147`
`148`	`148`	`// Polynomial Approximation`
`149`		`- float32x4_t poly = vtaylor_poly_f32(val, log_tab);`
	`149`	`+ float32x4_t poly = vtaylor_polyq_f32(val, log_tab);`
`150`	`150`
`151`	`151`	`// Reconstruct`
`152`	`152`	`poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);`
`@@ -158,19 +158,24 @@ inline float32x4_t vlog_f32(const float32x4_t &x)`
`158`	`158`	`*`
`159`	`159`	`* tanh(x) = (e^2x - 1)/(e^2x + 1)`
`160`	`160`	`*`
	`161`	`+ * @note We clamp x to [-5,5] to avoid overflowing issues.`
	`162`	`+ *`
`161`	`163`	`* @param val Input vector value in F32 format.`
`162`	`164`	`*`
`163`	`165`	`* @return The calculated Hyperbolic Tangent.`
`164`	`166`	`*/`
`165`		`-inline float32x4_t vtanh_f32(const float32x4_t &val)`
	`167`	`+inline float32x4_t vtanhq_f32(const float32x4_t &val)`
`166`	`168`	`{`
`167`		`- static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f`
`168`		`- static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f`
	`169`	`+ static const float32x4_t CONST_1 = vdupq_n_f32(1.f); // 1.f`
	`170`	`+ static const float32x4_t CONST_2 = vdupq_n_f32(2.f); // 2.f`
	`171`	`+ static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-5.f); // -5.f`
	`172`	`+ static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(5.f); // 5.f`
`169`	`173`
`170`		`- float32x4_t exp2x = vexp_f32(vmulq_f32(CONST_2, val));`
	`174`	`+ float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);`
	`175`	`+ float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));`
`171`	`176`	`float32x4_t num = vsubq_f32(exp2x, CONST_1);`
`172`	`177`	`float32x4_t den = vaddq_f32(exp2x, CONST_1);`
`173`		`- float32x4_t tanh = vmulq_f32(num, vinv_f32(den));`
	`178`	`+ float32x4_t tanh = vmulq_f32(num, vinvq_f32(den));`
`174`	`179`	`return tanh;`
`175`	`180`	`}`
`176`	`181`
`@@ -185,7 +190,7 @@ inline float32x4_t vtanh_f32(const float32x4_t &val)`
`185`	`190`	`*/`
`186`	`191`	`inline float32x4_t vpowq_f32(const float32x4_t &val, const float32x4_t &n)`
`187`	`192`	`{`
`188`		`- return vexp_f32(vmulq_f32(n, vlog_f32(val)));`
	`193`	`+ return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));`
`189`	`194`	`}`
`190`	`195`	`}`
`191`	`196`
Original file line number	Diff line number	Diff line change
`@@ -82,7 +82,6 @@ class NEColorConvertKernel : public INEKernel`
`82`	`82`	`using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);`
`83`	`83`	`const void *_input;`
`84`	`84`	`void *_output;`
`85`		`- unsigned int _num_elems_processed_per_iteration;`
`86`	`85`	`ColorConvertFunction *_func;`
`87`	`86`	`};`
`88`	`87`	`}`