Skip to content

Commit 346e30a

Browse files
author
Rajalakshmi Srinivasaraghavan
committed
POWER10: Improve axpy performance
This patch aligns the stores to a 32-byte boundary for saxpy and daxpy before entering the vector-pair loop. For caxpy, the stxvp store instructions were replaced with stxv to improve performance in unaligned cases.
1 parent 83de62c commit 346e30a

File tree

3 files changed

+38
-17
lines changed

3 files changed

+38
-17
lines changed

kernel/power/caxpy_microk_power10.c

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
112112
"xvmaddasp 38, 58, 33 \n\t"
113113
"xvmaddasp 39, 59, 33 \n\t"
114114

115-
"stxvp 48, 0(%4) \n\t"
116-
"stxvp 50, 32(%4) \n\t"
117-
"stxvp 34, 64(%4) \n\t"
118-
"stxvp 38, 96(%4) \n\t"
115+
"stxv 49, 0(%4) \n\t"
116+
"stxv 48, 16(%4) \n\t"
117+
"stxv 51, 32(%4) \n\t"
118+
"stxv 50, 48(%4) \n\t"
119+
"stxv 35, 64(%4) \n\t"
120+
"stxv 34, 80(%4) \n\t"
121+
"stxv 39, 96(%4) \n\t"
122+
"stxv 38, 112(%4) \n\t"
119123

120124
"addi %4, %4, 128 \n\t"
121125
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
@@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
159163
"xvmaddasp 38, 58, 33 \n\t"
160164
"xvmaddasp 39, 59, 33 \n\t"
161165

162-
"stxvp 48, 0(%4) \n\t"
163-
"stxvp 50, 32(%4) \n\t"
164-
"stxvp 34, 64(%4) \n\t"
165-
"stxvp 38, 96(%4) \n\t"
166+
"stxv 49, 0(%4) \n\t"
167+
"stxv 48, 16(%4) \n\t"
168+
"stxv 51, 32(%4) \n\t"
169+
"stxv 50, 48(%4) \n\t"
170+
"stxv 35, 64(%4) \n\t"
171+
"stxv 34, 80(%4) \n\t"
172+
"stxv 39, 96(%4) \n\t"
173+
"stxv 38, 112(%4) \n\t"
166174

167175
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
168176
:

kernel/power/daxpy_power10.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
6666
if ( (inc_x == 1) && (inc_y == 1) )
6767
{
6868

69-
BLASLONG n1 = n & -16;
69+
if ( n >= 16 )
70+
{
71+
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
72+
for (i = 0; i < align; i++) {
73+
y[i] += da * x[i] ;
74+
}
75+
}
76+
BLASLONG n1 = (n-i) & -16;
77+
if ( n1 )
78+
daxpy_kernel_8(n1, &x[i], &y[i], da);
79+
80+
i += n1;
7081

71-
if ( n1 )
72-
daxpy_kernel_8(n1, x, y, da);
73-
74-
i = n1;
7582
while(i < n)
7683
{
7784

kernel/power/saxpy_power10.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
6464
if ( (inc_x == 1) && (inc_y == 1) )
6565
{
6666

67-
BLASLONG n1 = n & -64;
68-
67+
if ( n >= 64 )
68+
{
69+
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
70+
for (i = 0; i < align; i++) {
71+
y[i] += da * x[i] ;
72+
}
73+
}
74+
BLASLONG n1 = (n-i) & -64;
6975
if ( n1 )
70-
saxpy_kernel_64(n1, x, y, da);
76+
saxpy_kernel_64(n1, &x[i], &y[i], da);
7177

72-
i = n1;
78+
i += n1;
7379
while(i < n)
7480
{
7581

0 commit comments

Comments
 (0)