Skip to content

Commit 35cd5a7

Browse files
committed
perf: remove switch in SVE activation
Move activation-function selection to new template dispatch_activation_function(), which facilitates inlining of code to increase performance. This commit addresses the SVE kernels. Partially Resolves: COMPMID-8359 Signed-off-by: Dennis Wildmark <[email protected]> Change-Id: I797039bc87b3f14ec06ba838328e74c2ad9fa4a8 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14724 Tested-by: Arm Jenkins <[email protected]> Comments-Addressed: Arm Jenkins <[email protected]> Benchmark: Arm Jenkins <[email protected]> Reviewed-by: Dongsung Kim <[email protected]>
1 parent 353c410 commit 35cd5a7

File tree

4 files changed

+326
-176
lines changed

4 files changed

+326
-176
lines changed

src/cpu/kernels/activation/generic/sve/fp16.cpp

Lines changed: 27 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2024 Arm Limited.
2+
* Copyright (c) 2020-2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -32,6 +32,7 @@
3232
#include "src/core/NEON/SVEMath.h"
3333
#include "src/cpu/kernels/lut/list.h"
3434

35+
#include "fp16_impl.h"
3536
#include <arm_sve.h>
3637
#include <cmath>
3738
#include <cstddef>
@@ -52,95 +53,34 @@ void sve_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayer
5253
Iterator input(src, win_collapsed);
5354
Iterator output(dst, win_collapsed);
5455

55-
const auto const_1 = svdup_n_f16(1.f);
56-
const auto const_0 = svdup_n_f16(0.f);
57-
const auto const_6 = svdup_n_f16(6.f);
58-
const auto const_3 = svdup_n_f16(3.f);
59-
const auto const_inv_6 = svdup_n_f16(0.166666667f);
56+
dispatch_sve_fp16_activation_function(act, act_info,
57+
[&](auto activation_function)
58+
{
59+
execute_window_loop(
60+
win_collapsed,
61+
[&](const Coordinates &)
62+
{
63+
const auto input_ptr =
64+
reinterpret_cast<const float16_t *>(input.ptr());
65+
const auto output_ptr =
66+
reinterpret_cast<float16_t *>(output.ptr());
6067

61-
const auto va = svdup_n_f16(act_info.a());
62-
const auto vb = svdup_n_f16(act_info.b());
63-
execute_window_loop(
64-
win_collapsed,
65-
[&](const Coordinates &)
66-
{
67-
const auto input_ptr = reinterpret_cast<const float16_t *>(input.ptr());
68-
const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
68+
svfloat16_t tmp;
6969

70-
svfloat16_t tmp;
70+
int x = window_start_x;
71+
svbool_t pg = svwhilelt_b16(x, window_end_x);
72+
do
73+
{
74+
const auto vin = svld1_f16(pg, input_ptr + x);
75+
tmp = activation_function(vin, pg);
76+
svst1_f16(pg, output_ptr + x, tmp);
77+
x += svcnth();
78+
pg = svwhilelt_b16(x, window_end_x);
7179

72-
int x = window_start_x;
73-
svbool_t pg = svwhilelt_b16(x, window_end_x);
74-
do
75-
{
76-
const auto vin = svld1_f16(pg, input_ptr + x);
77-
switch (act)
78-
{
79-
case ActivationLayerInfo::ActivationFunction::ABS:
80-
tmp = svabs_f16_z(pg, vin);
81-
break;
82-
case ActivationLayerInfo::ActivationFunction::LINEAR:
83-
tmp = svmla_f16_z(pg, vb, va, vin);
84-
break;
85-
case ActivationLayerInfo::ActivationFunction::LOGISTIC:
86-
tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
87-
break;
88-
case ActivationLayerInfo::ActivationFunction::RELU:
89-
tmp = svmax_f16_z(pg, const_0, vin);
90-
break;
91-
case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
92-
tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
93-
break;
94-
case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
95-
tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
96-
break;
97-
case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
98-
tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va),
99-
svmax_f16_z(pg, vin, const_0));
100-
break;
101-
case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
102-
tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
103-
break;
104-
case ActivationLayerInfo::ActivationFunction::ELU:
105-
tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin,
106-
svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
107-
break;
108-
case ActivationLayerInfo::ActivationFunction::SQRT:
109-
tmp = svsqrt_f16_z(pg, vin);
110-
break;
111-
case ActivationLayerInfo::ActivationFunction::SQUARE:
112-
tmp = svmul_f16_z(pg, vin, vin);
113-
break;
114-
case ActivationLayerInfo::ActivationFunction::TANH:
115-
tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
116-
break;
117-
case ActivationLayerInfo::ActivationFunction::IDENTITY:
118-
tmp = vin;
119-
break;
120-
case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
121-
tmp = svmul_f16_z(
122-
pg, vin,
123-
svmul_f16_z(
124-
pg, const_inv_6,
125-
svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
126-
break;
127-
case ActivationLayerInfo::ActivationFunction::SWISH:
128-
tmp = svmul_f16_z(
129-
pg, vin,
130-
svinv_f16_z(pg, svadd_f16_z(pg, const_1,
131-
svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
132-
break;
133-
default:
134-
ARM_COMPUTE_ERROR("Unsupported activation function");
135-
}
136-
svst1_f16(pg, output_ptr + x, tmp);
137-
138-
x += svcnth();
139-
pg = svwhilelt_b16(x, window_end_x);
140-
141-
} while (svptest_any(svptrue_b16(), pg));
142-
},
143-
input, output);
80+
} while (svptest_any(svptrue_b16(), pg));
81+
},
82+
input, output);
83+
});
14484
}
14585

14686
void sve_fp16_activation_lut(const ITensor *src,
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
/*
2+
* Copyright (c) 2020-2025 Arm Limited.
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to
8+
* deal in the Software without restriction, including without limitation the
9+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10+
* sell copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
25+
#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_GENERIC_SVE_FP16_IMPL_H
26+
#define ACL_SRC_CPU_KERNELS_ACTIVATION_GENERIC_SVE_FP16_IMPL_H
27+
28+
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
29+
30+
#include "arm_compute/function_info/ActivationLayerInfo.h"
31+
32+
#include "src/core/NEON/SVEMath.h"
33+
34+
#include <arm_sve.h>
35+
#include <cstddef>
36+
37+
namespace arm_compute
38+
{
39+
namespace cpu
40+
{
41+
42+
template <typename F>
43+
void dispatch_sve_fp16_activation_function(ActivationLayerInfo::ActivationFunction act,
44+
const ActivationLayerInfo &act_info,
45+
F &&fn)
46+
{
47+
const auto const_1 = svdup_n_f16(1.f);
48+
const auto const_0 = svdup_n_f16(0.f);
49+
const auto const_6 = svdup_n_f16(6.f);
50+
const auto const_3 = svdup_n_f16(3.f);
51+
const auto const_inv_6 = svdup_n_f16(0.166666667f);
52+
53+
const auto va = svdup_n_f16(act_info.a());
54+
const auto vb = svdup_n_f16(act_info.b());
55+
56+
switch (act)
57+
{
58+
case ActivationLayerInfo::ActivationFunction::ABS:
59+
fn([](auto vin, svbool_t pg) { return svabs_f16_z(pg, vin); });
60+
break;
61+
case ActivationLayerInfo::ActivationFunction::LINEAR:
62+
fn([&](auto vin, svbool_t pg) { return svmla_f16_z(pg, vb, va, vin); });
63+
break;
64+
case ActivationLayerInfo::ActivationFunction::LOGISTIC:
65+
fn([&](auto vin, svbool_t pg)
66+
{ return svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); });
67+
break;
68+
case ActivationLayerInfo::ActivationFunction::RELU:
69+
fn([&](auto vin, svbool_t pg) { return svmax_f16_z(pg, const_0, vin); });
70+
break;
71+
case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
72+
fn([&](auto vin, svbool_t pg) { return svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); });
73+
break;
74+
case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
75+
fn([&](auto vin, svbool_t pg) { return svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); });
76+
break;
77+
case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
78+
fn(
79+
[&](auto vin, svbool_t pg) {
80+
return svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va),
81+
svmax_f16_z(pg, vin, const_0));
82+
});
83+
break;
84+
case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
85+
fn([&](auto vin, svbool_t pg) { return svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); });
86+
break;
87+
case ActivationLayerInfo::ActivationFunction::ELU:
88+
fn(
89+
[&](auto vin, svbool_t pg)
90+
{
91+
return svsel_f16(svcmpgt_f16(pg, vin, const_0), vin,
92+
svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
93+
});
94+
break;
95+
case ActivationLayerInfo::ActivationFunction::SQRT:
96+
fn([&](auto vin, svbool_t pg) { return svsqrt_f16_z(pg, vin); });
97+
break;
98+
case ActivationLayerInfo::ActivationFunction::SQUARE:
99+
fn([&](auto vin, svbool_t pg) { return svmul_f16_z(pg, vin, vin); });
100+
break;
101+
case ActivationLayerInfo::ActivationFunction::TANH:
102+
fn([&](auto vin, svbool_t pg) { return svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); });
103+
break;
104+
case ActivationLayerInfo::ActivationFunction::IDENTITY:
105+
fn([&](auto vin, svbool_t) { return vin; });
106+
break;
107+
case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
108+
fn(
109+
[&](auto vin, svbool_t pg)
110+
{
111+
return svmul_f16_z(
112+
pg, vin,
113+
svmul_f16_z(pg, const_inv_6,
114+
svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
115+
});
116+
break;
117+
case ActivationLayerInfo::ActivationFunction::SWISH:
118+
fn(
119+
[&](auto vin, svbool_t pg)
120+
{
121+
return svmul_f16_z(
122+
pg, vin,
123+
svinv_f16_z(
124+
pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
125+
});
126+
break;
127+
default:
128+
ARM_COMPUTE_ERROR("Unsupported activation function");
129+
}
130+
}
131+
132+
} // namespace cpu
133+
} // namespace arm_compute
134+
#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
135+
136+
#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_GENERIC_SVE_FP16_IMPL_H

0 commit comments

Comments
 (0)