
Commit 1562897

perf: remove switch in Arm(R) Neon(TM) activations
Move activation-function selection to a new template, dispatch_activation_function(), which facilitates inlining of the selected code to increase performance. This commit addresses the Neon kernels.

The patch improves performance for ReLU by 4-7%, Logistic by 11%, and Hyperbolic tangent by 2-5%. Improvements vary by platform and problem size; in general the gains are largest for larger problem sizes.

Partially Resolves: COMPMID-8359
Change-Id: I661e5d51b9993cda9c0fef2f198b1a7438f8ce8c
Signed-off-by: Dennis Wildmark <[email protected]>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14722
Tested-by: Arm Jenkins <[email protected]>
Reviewed-by: Pablo Marquez Tello <[email protected]>
Benchmark: Arm Jenkins <[email protected]>
Comments-Addressed: Arm Jenkins <[email protected]>
Reviewed-by: Dongsung Kim <[email protected]>
1 parent 3e4b990 commit 1562897
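For illustration, a minimal C++ sketch of the idea the message describes, using hypothetical names rather than the library's actual code: with a switch inside the per-element loop the compiler must branch on every element, whereas selecting the operation once and handing it to the loop as a lambda lets the body be inlined into the hot loop.

#include <algorithm>
#include <cmath>
#include <cstddef>

enum class Act
{
    RELU,
    LOGISTIC
};

// Before: the activation is re-selected for every element.
void activate_switch_in_loop(const float *in, float *out, std::size_t n, Act act)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        switch (act)
        {
            case Act::RELU:
                out[i] = std::max(0.0f, in[i]);
                break;
            case Act::LOGISTIC:
                out[i] = 1.0f / (1.0f + std::exp(-in[i]));
                break;
        }
    }
}

// After: the activation is selected once, and the traversal loop is
// instantiated with it, so the lambda body can be inlined.
template <typename F>
void run_loop(const float *in, float *out, std::size_t n, F &&op)
{
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = op(in[i]);
    }
}

void activate_dispatch(const float *in, float *out, std::size_t n, Act act)
{
    switch (act)
    {
        case Act::RELU:
            run_loop(in, out, n, [](float x) { return std::max(0.0f, x); });
            break;
        case Act::LOGISTIC:
            run_loop(in, out, n, [](float x) { return 1.0f / (1.0f + std::exp(-x)); });
            break;
    }
}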

File tree: 11 files changed (+1036 / -824 lines)


src/cpu/kernels/activation/generic/neon/fp16.cpp

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2025 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -23,7 +23,7 @@
 */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)

-#include "src/cpu/kernels/activation/generic/neon/impl.h"
+#include "src/cpu/kernels/activation/generic/neon/fp_impl.h"

 namespace arm_compute
 {
src/cpu/kernels/activation/generic/neon/fp32.cpp

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2025 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -22,7 +22,7 @@
 * SOFTWARE.
 */

-#include "src/cpu/kernels/activation/generic/neon/impl.h"
+#include "src/cpu/kernels/activation/generic/neon/fp_impl.h"
 namespace arm_compute
 {
 namespace cpu
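The rest of fp32.cpp is not shown in this diff. As a hedged sketch only, the kernel presumably instantiates the shared templates now provided by fp_impl.h along these lines; the parameter values and the wrapper name neon_fp32_activation below are assumptions, not text from the commit.

#include "src/cpu/kernels/activation/generic/neon/fp_impl.h"

namespace arm_compute
{
namespace cpu
{
// Assumed values: a tiny delta for the non-aarch64 SQRT path and a step of
// four lanes per 128-bit float32 vector.
constexpr ActFpImplParams Fp32Params = {static_cast<float>(1e-24), 4};

void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
{
    fp_neon_activation_impl<float, Fp32Params>(src, dst, act_info, window);
}
} // namespace cpu
} // namespace arm_compute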
src/cpu/kernels/activation/generic/neon/fp_impl.h (new file)

Lines changed: 249 additions & 0 deletions

@@ -0,0 +1,249 @@
/*
 * Copyright (c) 2020-2023, 2025 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_GENERIC_NEON_FP_IMPL_H
#define ACL_SRC_CPU_KERNELS_ACTIVATION_GENERIC_NEON_FP_IMPL_H

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"

#include "src/core/NEON/wrapper/wrapper.h"
namespace arm_compute
{
namespace cpu
{
/** Constant parameters needed by the activation implementation.
 * These parameters differ for each floating type
 *
 * @note These are passed as a struct as C++ does not allow float as a template parameter until C++20
 **/
struct ActFpImplParams
{
    float delta;  /**< Minimum delta needed to avoid NaN on corner-cases of elementary functions */
    int   step_x; /**< Window step at the x dimension */
};
#ifndef __aarch64__
inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask)
{
    auto int_in = vreinterpretq_u32_f32(in);
    return vreinterpretq_f32_u32(wrapper::vand(int_in, mask));
}
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask)
{
    auto int_in = vreinterpretq_u16_f16(in);
    return vreinterpretq_f16_u16(wrapper::vand(int_in, mask));
}
#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
#endif /* __aarch64__ */
template <typename T, const ActFpImplParams &P, typename F>
void dispatch_fp_neon_activation_function(ActivationLayerInfo::ActivationFunction act,
                                          const ActivationLayerInfo              &act_info,
                                          F                                      &&fn)
{
    using ExactTagType =
        typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
#ifndef __aarch64__
    const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType{});
#else  /* #ifndef __aarch64__ */
    const auto const_inv_2      = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{});
    const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{});
#endif /* __aarch64__ */
    const auto      const_1           = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
    const auto      const_0           = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
    const auto      const_6           = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
    const auto      const_3           = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
    const auto      const_inv_6       = wrapper::vdup_n(static_cast<T>(0.166666667f), ExactTagType{});
    constexpr float soft_relu_thresh  = 12.f;
    const auto      vsoft_relu_thresh = wrapper::vdup_n(static_cast<T>(soft_relu_thresh), ExactTagType{});
    const auto      va                = wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{});
    const auto      vb                = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
    const auto      a                 = static_cast<T>(act_info.a());
    const auto      b                 = static_cast<T>(act_info.b());

    switch (act)
    {
        case ActivationLayerInfo::ActivationFunction::ABS:
            fn([](auto vin) { return wrapper::vabs(vin); }, [](auto in) { return std::abs(in); });
            break;
        case ActivationLayerInfo::ActivationFunction::LINEAR:
            fn([&](auto vin) { return wrapper::vmla(vb, va, vin); }, [&](auto in) { return a * in + b; });
            break;
        case ActivationLayerInfo::ActivationFunction::LOGISTIC:
            fn([&](auto vin) { return wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); },
               [](auto in) { return static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in)); });
            break;
        case ActivationLayerInfo::ActivationFunction::RELU:
            fn([&](auto vin) { return wrapper::vmax(const_0, vin); },
               [](auto in) { return std::max<T>(static_cast<T>(0), in); });
            break;
        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
            fn([&](auto vin) { return wrapper::vmin(va, wrapper::vmax(const_0, vin)); },
               [&](auto in) { return std::min<T>(a, std::max(static_cast<T>(0), in)); });
            break;
        case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
            fn([&](auto vin) { return wrapper::vmin(va, wrapper::vmax(vb, vin)); },
               [&](auto in) { return std::min<T>(a, std::max<T>(b, in)); });
            break;
        case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
            fn([&](auto vin) { return wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); },
               [&](auto in) { return (in > 0) ? in : a * in; });
            break;
        case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
            fn(
                [&](auto vin)
                {
                    return wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin,
                                         wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
                },
                [](auto in) { return (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in)); });
            break;
        case ActivationLayerInfo::ActivationFunction::ELU:
            fn(
                [&](auto vin)
                {
                    return wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
                                         wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
                },
                [&](auto in) { return (in >= 0) ? in : a * (std::exp(in) - 1); });
            break;
        case ActivationLayerInfo::ActivationFunction::SQRT:
            fn(
#ifdef __aarch64__
                [](auto vin) { return wrapper::vsqrt(vin); },
#else  /* __aarch64__ */
                [&](auto vin)
                {
                    const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
                    auto tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
                    return mask_float_vector(tmp, wrapper::vnot(bitmask));
                },
#endif /* __aarch64__ */
                [](auto in) { return std::sqrt(in); });
            break;
        case ActivationLayerInfo::ActivationFunction::SQUARE:
            fn([](auto vin) { return wrapper::vmul(vin, vin); }, [](auto in) { return in * in; });
            break;
        case ActivationLayerInfo::ActivationFunction::TANH:
            fn([&](auto vin) { return wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); },
               [&](auto in) { return a * std::tanh(b * in); });
            break;
        case ActivationLayerInfo::ActivationFunction::IDENTITY:
            fn([](auto vin) { return vin; }, [](auto in) { return in; });
            break;
        case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
            fn(
                [&](auto vin)
                {
                    return wrapper::vmul(
                        vin,
                        wrapper::vmul(const_inv_6,
                                      wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
                },
                [](auto in) { return in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); });
            break;
        case ActivationLayerInfo::ActivationFunction::SWISH:
            fn(
                [&](auto vin)
                {
                    return wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(
                                                  const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
                },
                [&](auto in) { return in / (static_cast<T>(1) + std::exp(-a * in)); });
            break;
#ifdef __aarch64__
        case ActivationLayerInfo::ActivationFunction::GELU:
            fn(
                [&](auto vin)
                {
                    return wrapper::vmul(
                        vin,
                        wrapper::vmul(const_inv_2,
                                      wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
                },
                [](auto in)
                { return in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f))); });
            break;
#endif /* __aarch64__ */
        default:
            ARM_COMPUTE_ERROR("Unsupported activation function");
    }
}
template <typename T, const ActFpImplParams &P>
void fp_neon_activation_impl(const ITensor             *src,
                             ITensor                   *dst,
                             const ActivationLayerInfo &act_info,
                             const Window              &window)
{
    /** SIMD vector tag type. */
    // using ExactTagType =
    //     typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
    constexpr int window_step_x  = P.step_x;
    const auto    window_start_x = static_cast<int>(window.x().start());
    const auto    window_end_x   = static_cast<int>(window.x().end());
    const ActivationLayerInfo::ActivationFunction act = act_info.activation();
    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
    Iterator input(src, win_collapsed);
    Iterator output(dst, win_collapsed);
    // In case of non-aarch64, a small delta value is added to the input
    // to prevent NaN values caused by zeros in inputs to SQRT.
    // In case of aarch64, we call vsqrt directly, so we don't use delta.
    dispatch_fp_neon_activation_function<T, P>(
        act, act_info,
        [&](auto activation_op_vec, auto activation_op_tail)
        {
            execute_window_loop(
                win_collapsed,
                [&](const Coordinates &)
                {
                    const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
                    const auto output_ptr = reinterpret_cast<T *>(output.ptr());
                    // Compute S elements per iteration
                    int x = window_start_x;
                    for (; x <= (window_end_x - window_step_x); x += window_step_x)
                    {
                        const auto vin = wrapper::vloadq(input_ptr + x);
                        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp =
                            activation_op_vec(vin);
                        wrapper::vstore(output_ptr + x, tmp);
                    }
                    // Compute left-over elements
                    for (; x < window_end_x; ++x)
                    {
                        const T in        = *(reinterpret_cast<const T *>(input_ptr + x));
                        *(output_ptr + x) = activation_op_tail(in);
                    }
                },
                input, output);
        });
}
} // namespace cpu
} // namespace arm_compute

#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_GENERIC_NEON_FP_IMPL_H
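A side note on the non-aarch64 SQRT branch above: wrapper::vinvsqrt is a reciprocal square-root estimate, so a zero input would otherwise produce an infinity. The following scalar sketch, for illustration only, mirrors the masking trick used there: a tiny delta is substituted for zero inputs before computing 1 / rsqrt(x), and those lanes are forced back to zero afterwards.

#include <cmath>

// Scalar equivalent of the masked Neon SQRT path (exact sqrt stands in for
// the reciprocal square-root estimate).
float sqrt_via_rsqrt(float in, float delta)
{
    const bool  is_zero = (in == 0.0f);
    const float x       = in + (is_zero ? delta : 0.0f); // vadd(vin, mask_float_vector(delta, bitmask))
    const float result  = 1.0f / (1.0f / std::sqrt(x));  // vinv(vinvsqrt(x))
    return is_zero ? 0.0f : result;                      // mask_float_vector(tmp, vnot(bitmask))
}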
