Skip to content

Commit c4c591a

Browse files
committed
fix sum optimize issues
1 parent ff16329 commit c4c591a

File tree

1 file changed: 18 additions and 17 deletions.

kernel/arm/sum.c

Lines changed: 18 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -42,24 +42,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
     n *= inc_x;
     if (inc_x == 1)
     {
-#if V_SIMD
+#if V_SIMD && (!defined(DOUBLE) || (defined(DOUBLE) && V_SIMD_F64 && V_SIMD > 128))
 #ifdef DOUBLE
         const int vstep = v_nlanes_f64;
-        const int unrollx2 = n & (-vstep * 2);
+        const int unrollx4 = n & (-vstep * 4);
         const int unrollx = n & -vstep;
         v_f64 vsum0 = v_zero_f64();
         v_f64 vsum1 = v_zero_f64();
-        while (i < unrollx2)
+        v_f64 vsum2 = v_zero_f64();
+        v_f64 vsum3 = v_zero_f64();
+        for (; i < unrollx4; i += vstep * 4)
         {
-            vsum0 = v_add_f64(vsum0, v_loadu_f64(x));
-            vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep));
-            i += vstep * 2;
+            vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i));
+            vsum1 = v_add_f64(vsum1, v_loadu_f64(x + i + vstep));
+            vsum2 = v_add_f64(vsum2, v_loadu_f64(x + i + vstep * 2));
+            vsum3 = v_add_f64(vsum3, v_loadu_f64(x + i + vstep * 3));
         }
-        vsum0 = v_add_f64(vsum0, vsum1);
-        while (i < unrollx)
+        vsum0 = v_add_f64(
+            v_add_f64(vsum0, vsum1), v_add_f64(vsum2, vsum3));
+        for (; i < unrollx; i += vstep)
         {
             vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i));
-            i += vstep;
         }
         sumf = v_sum_f64(vsum0);
 #else
@@ -70,20 +73,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         v_f32 vsum1 = v_zero_f32();
         v_f32 vsum2 = v_zero_f32();
         v_f32 vsum3 = v_zero_f32();
-        while (i < unrollx4)
+        for (; i < unrollx4; i += vstep * 4)
         {
-            vsum0 = v_add_f32(vsum0, v_loadu_f32(x));
-            vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep));
-            vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2));
-            vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3));
-            i += vstep * 4;
+            vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
+            vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep));
+            vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2));
+            vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3));
         }
         vsum0 = v_add_f32(
             v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3));
-        while (i < unrollx)
+        for (; i < unrollx; i += vstep)
         {
             vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i));
-            i += vstep;
         }
         sumf = v_sum_f32(vsum0);
 #endif

0 commit comments

Comments
 (0)