Skip to content

Commit cb4274e

Browse files
authored
Merge pull request #2888 from Qiyu8/usimd-sum
Optimize the performance of sum by using universal intrinsics
2 parents bb74dd2 + 0ed1f07 commit cb4274e

File tree

1 file changed

+40
-8
lines changed

1 file changed

+40
-8
lines changed

kernel/arm/sum.c

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,23 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929
* trivial copy of asum.c with the ABS() removed *
3030
**************************************************************************************/
3131

32-
3332
#include "common.h"
33+
#include "../simd/intrin.h"
3434
#include <math.h>
3535

3636
/*
 * Plain sum of the elements of a strided vector (no absolute value is taken).
 *
 *   n      number of elements to accumulate
 *   x      pointer to the first element
 *   inc_x  distance between consecutive elements, counted in FLOATs
 *
 * Returns the accumulated sum, or 0.0 when n <= 0 or inc_x <= 0.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i = 0;
    FLOAT sumf = 0.0;

    if (n <= 0 || inc_x <= 0)
        return (sumf);

    /* From here on n is the exclusive upper bound for the index i. */
    n *= inc_x;

    if (inc_x == 1)
    {
        /*
         * The vector path below uses the f32 intrinsics only, so it must not
         * be taken when FLOAT is not a 32-bit float.
         * NOTE(review): DOUBLE is assumed to be the macro the build defines
         * for double-precision kernels - confirm against common.h. If it is
         * never defined this guard is equivalent to the plain `#if V_SIMD`.
         */
#if V_SIMD && !defined(DOUBLE)
        const int vstep = v_nlanes_f32;
        /*
         * Round n down to a multiple of 4*vstep and of vstep respectively.
         * The masks assume vstep is a power of two (presumably guaranteed by
         * intrin.h - confirm). Kept in BLASLONG so that large n is not
         * truncated to int.
         */
        const BLASLONG unrollx4 = n & -(BLASLONG)(vstep * 4);
        const BLASLONG unrollx = n & -(BLASLONG)vstep;
        v_f32 vsum0 = v_zero_f32();
        v_f32 vsum1 = v_zero_f32();
        v_f32 vsum2 = v_zero_f32();
        v_f32 vsum3 = v_zero_f32();

        /* Four independent accumulators, four vectors per iteration. */
        while (i < unrollx4)
        {
            /* Every load is offset by i so the loop advances through x. */
            vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
            vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep));
            vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2));
            vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3));
            i += vstep * 4;
        }
        vsum0 = v_add_f32(
            v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3));

        /* Remaining whole vectors, one at a time. */
        while (i < unrollx)
        {
            vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
            i += vstep;
        }

        /* Collapse the vector accumulator into the scalar result. */
        sumf = v_sum_f32(vsum0);
#else
        /* Scalar fallback: four elements per iteration. */
        const BLASLONG n1 = n & -(BLASLONG)4;
        for (; i < n1; i += 4)
        {
            sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3];
        }
#endif
    }

    /* Tail of the unit-stride case, or the whole vector when inc_x != 1. */
    while (i < n)
    {
        sumf += x[i];
        i += inc_x;
    }
    return (sumf);
}
50-
51-

0 commit comments

Comments
 (0)