Skip to content

Commit 30ba117

Browse files
mfarag13 authored and JaccovG committed
[prelu_opt]: Optimizing RELU/LeakyRelu
- Using mul_hi in case shift > 15. - Using unroll with factor 4. - Using Squash to one dim. - Split LeakyRelu and PRELU Code to two separate components.
1 parent af60572 commit 30ba117

14 files changed

+1047
-169
lines changed

include/mli_config.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@
7272

7373
#if defined(__clang__)
7474
#define MLI_FORCE_INLINE inline __attribute__((always_inline))
75-
#define MLI_NO_INLINE __attribute__((noinline))
75+
#define MLI_NO_INLINE __attribute__((used)) __attribute__((noinline))
7676
#define MLI_CODE_SECTION_START(x) code(x)
7777
#define MLI_CODE_SECTION_END() code()
7878

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Copyright 2021, Synopsys, Inc.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-3-Clause license found in
6+
* the LICENSE file in the root directory of this source tree.
7+
*
8+
*/
9+
10+
#ifndef _MLI_KRN_LEAKY_RELU_DSP_H_
11+
#define _MLI_KRN_LEAKY_RELU_DSP_H_
12+
13+
#include "mli_check.h"
14+
#include "mli_config.h"
15+
#include "mli_debug.h"
16+
#include "mli_helpers_api.h"
17+
#include "mli_math.h"
18+
#include "mli_prv_dsp.h"
19+
#include "mli_prv_tensor.h"
20+
#include "mli_types.h"
21+
22+
namespace mli {
23+
namespace krn {
24+
namespace dsp {
25+
26+
/*
 * Leaky ReLU on one v2q15_t vector:
 *     out = max(0, in) + alpha * min(0, in)
 *
 * input   - vector of input samples.
 * scale_v - alpha as a v2q15_t vector (callers in this file build it with
 *           mli_prv_init_v from a single scalar).
 * shift   - forwarded to mli_math_acc_cast_fx when the v2accum40_t product
 *           is converted back to v2q15_t (presumably a right shift -
 *           confirm against mli_math.h).
 *
 * Returns the sum of the untouched non-negative part and the scaled
 * non-positive part.
 */
static MLI_FORCE_INLINE v2q15_t calc_leaky_relu(
        const v2q15_t input,
        const v2q15_t scale_v,
        const int shift) {

    v2q15_t zeros = mli_prv_init_v<int16_t, v2q15_t>(0);

    /* max(0, in): values above zero pass through unchanged. */
    v2q15_t positive_part = mli_math_max_fx(zeros, input);

    /* alpha * min(0, in): multiply into the accumulator type, then cast back. */
    v2q15_t negative_in = mli_math_min_fx(zeros, input);
    v2accum40_t scaled_acc = mli_math_mul_fx<v2q15_t, v2accum40_t>(scale_v, negative_in);
    v2q15_t negative_part = mli_math_acc_cast_fx<v2q15_t, v2accum40_t>(scaled_acc, shift);

    return mli_math_add_fx(positive_part, negative_part);
}
38+
39+
template <typename io_T>
40+
static MLI_FORCE_INLINE void compute_leaky_relu(
41+
const MLI_PTR(io_T) vec_in,
42+
MLI_OUT_PTR(io_T) vec_out,
43+
const io_T scale,
44+
const int shift) {
45+
46+
v2q15_t input = mli_prv_load_1vec(vec_in);
47+
v2q15_t scale_v = mli_prv_init_v<io_T, v2q15_t>(scale);
48+
mli_prv_store_n_samples(vec_out, calc_leaky_relu(input, scale_v, shift));
49+
}
50+
51+
template <typename io_T>
52+
static MLI_FORCE_INLINE void compute_leaky_relu(
53+
const MLI_PTR(io_T) vec_in,
54+
MLI_OUT_PTR(io_T) vec_out,
55+
const io_T scale,
56+
const int shift,
57+
const int remaining_part) {
58+
59+
MLI_ASSERT(remaining_part == 1);
60+
v2q15_t input = mli_prv_load_1vec(vec_in);
61+
v2q15_t scale_v = mli_prv_init_v<io_T, v2q15_t>(scale);
62+
mli_prv_store_1_sample(vec_out, calc_leaky_relu(input, scale_v, shift));
63+
}
64+
65+
/*
 * Asymmetric int8 Leaky ReLU for two consecutive samples.
 *
 * No vector implementation is provided here: the work is delegated to
 * mli::krn::ref::compute_leaky_relu, called once for the element at
 * vec_in / vec_out and once for the element that follows it.
 *
 * vec_in          - source samples (two are read).
 * vec_out         - destination samples (two are written).
 * in_zp           - forwarded to the reference kernel (input zero point,
 *                   judging by the name - confirm against the ref kernel).
 * identity_params - forwarded unchanged to the reference kernel.
 * alpha_params    - forwarded unchanged to the reference kernel.
 */
static MLI_FORCE_INLINE void compute_leaky_relu(
        const MLI_PTR(int8_t) vec_in,
        MLI_OUT_PTR(int8_t) vec_out,
        const int16_t in_zp,
        const s8asym_quant_params *identity_params,
        const s8asym_quant_params *alpha_params) {

    namespace ref_krn = mli::krn::ref;

    /* First sample. */
    ref_krn::compute_leaky_relu(vec_in, vec_out, in_zp, identity_params, alpha_params);
    /* Second sample. */
    ref_krn::compute_leaky_relu(vec_in + 1, vec_out + 1, in_zp, identity_params, alpha_params);
}
75+
76+
/*
 * Asymmetric int8 Leaky ReLU, tail variant: processes exactly one sample by
 * delegating to mli::krn::ref::compute_leaky_relu.
 *
 * vec_in          - source sample.
 * vec_out         - destination sample.
 * in_zp           - forwarded unchanged to the reference kernel.
 * identity_params - forwarded unchanged to the reference kernel.
 * alpha_params    - forwarded unchanged to the reference kernel.
 * remaining_part  - number of samples left; asserted to be exactly 1.
 */
static MLI_FORCE_INLINE void compute_leaky_relu(
        const MLI_PTR(int8_t) vec_in,
        MLI_OUT_PTR(int8_t) vec_out,
        const int16_t in_zp,
        const s8asym_quant_params *identity_params,
        const s8asym_quant_params *alpha_params,
        const int remaining_part) {

    /* This path handles a one-sample tail only. */
    MLI_ASSERT(remaining_part == 1);

    namespace ref_krn = mli::krn::ref;
    ref_krn::compute_leaky_relu(vec_in, vec_out, in_zp, identity_params, alpha_params);
}
87+
88+
} // namespace dsp
89+
} // namespace krn
90+
} // namespace mli
91+
92+
#endif // _MLI_KRN_LEAKY_RELU_DSP_H_

0 commit comments

Comments
 (0)