Skip to content

Commit e2f9005

Browse files
authored
Merge pull request #2950 from RajalakshmiSR/saxpy
Optimize saxpy for POWER10
2 parents 76203e2 + c24ba8b commit e2f9005

File tree

3 files changed

+301
-1
lines changed

3 files changed

+301
-1
lines changed

kernel/power/KERNEL.POWER10

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ DASUMKERNEL = dasum.c
141141
CASUMKERNEL = casum.c
142142
ZASUMKERNEL = zasum.c
143143
#
144-
SAXPYKERNEL = saxpy.c
144+
SAXPYKERNEL = saxpy_power10.c
145145
DAXPYKERNEL = daxpy_power10.c
146146
ifneq ($(GCCVERSIONGTEQ9),1)
147147
CAXPYKERNEL = caxpy_power9.S

kernel/power/saxpy_microk_power10.c

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/***************************************************************************
2+
Copyright (c) 2020, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_8 1
29+
30+
static void saxpy_kernel_64(long n, float *x, float *y, float alpha)
31+
{
32+
__vector float t0 = {alpha, alpha,alpha, alpha};
33+
34+
__asm__
35+
(
36+
37+
"dcbt 0, %2 \n\t"
38+
"dcbt 0, %3 \n\t"
39+
40+
"lxvp 32, 0(%2) \n\t"
41+
"lxvp 34, 32(%2) \n\t"
42+
"lxvp 40, 64(%2) \n\t"
43+
"lxvp 42, 96(%2) \n\t"
44+
"lxvp 48, 128(%2) \n\t"
45+
"lxvp 50, 160(%2) \n\t"
46+
"lxvp 52, 192(%2) \n\t"
47+
"lxvp 54, 224(%2) \n\t"
48+
49+
"lxvp 36, 0(%3) \n\t"
50+
"lxvp 38, 32(%3) \n\t"
51+
"lxvp 44, 64(%3) \n\t"
52+
"lxvp 46, 96(%3) \n\t"
53+
"lxvp 56, 128(%3) \n\t"
54+
"lxvp 58, 160(%3) \n\t"
55+
"lxvp 60, 192(%3) \n\t"
56+
"lxvp 62, 224(%3) \n\t"
57+
58+
"addi %2, %2, 256 \n\t"
59+
60+
"addic. %1, %1, -64 \n\t"
61+
"ble two%= \n\t"
62+
63+
".align 5 \n"
64+
"one%=: \n\t"
65+
66+
"xvmaddasp 36, 32, %x4 \n\t"
67+
"xvmaddasp 37, 33, %x4 \n\t"
68+
69+
"lxvp 32, 0(%2) \n\t"
70+
"stxvp 36, 0(%3) \n\t"
71+
72+
"xvmaddasp 38, 34, %x4 \n\t"
73+
"xvmaddasp 39, 35, %x4 \n\t"
74+
75+
"lxvp 34, 32(%2) \n\t"
76+
"stxvp 38, 32(%3) \n\t"
77+
78+
"lxvp 36, 256(%3) \n\t"
79+
"lxvp 38, 288(%3) \n\t"
80+
81+
"xvmaddasp 44, 40, %x4 \n\t"
82+
"xvmaddasp 45, 41, %x4 \n\t"
83+
84+
"lxvp 40, 64(%2) \n\t"
85+
"stxvp 44, 64(%3) \n\t"
86+
87+
"xvmaddasp 46, 42, %x4 \n\t"
88+
"xvmaddasp 47, 43, %x4 \n\t"
89+
90+
"lxvp 42, 96(%2) \n\t"
91+
"stxvp 46, 96(%3) \n\t"
92+
93+
"lxvp 44, 320(%3) \n\t"
94+
"lxvp 46, 352(%3) \n\t"
95+
96+
"xvmaddasp 56, 48, %x4 \n\t"
97+
"xvmaddasp 57, 49, %x4 \n\t"
98+
99+
"lxvp 48, 128(%2) \n\t"
100+
"stxvp 56, 128(%3) \n\t"
101+
102+
"xvmaddasp 58, 50, %x4 \n\t"
103+
"xvmaddasp 59, 51, %x4 \n\t"
104+
105+
"lxvp 50, 160(%2) \n\t"
106+
"stxvp 58, 160(%3) \n\t"
107+
108+
"lxvp 56, 384(%3) \n\t"
109+
"lxvp 58, 416(%3) \n\t"
110+
111+
"xvmaddasp 60, 52, %x4 \n\t"
112+
"xvmaddasp 61, 53, %x4 \n\t"
113+
114+
"lxvp 52, 192(%2) \n\t"
115+
"stxvp 60, 192(%3) \n\t"
116+
117+
"xvmaddasp 62, 54, %x4 \n\t"
118+
"xvmaddasp 63, 55, %x4 \n\t"
119+
120+
"lxvp 54, 224(%2) \n\t"
121+
"stxvp 62, 224(%3) \n\t"
122+
123+
"lxvp 60, 448(%3) \n\t"
124+
"lxvp 62, 480(%3) \n\t"
125+
126+
"addi %2, %2, 256 \n\t"
127+
"addi %3, %3, 256 \n\t"
128+
129+
"addic. %1, %1, -64 \n\t"
130+
"bgt one%= \n"
131+
132+
"two%=: \n\t"
133+
134+
"xvmaddasp 36, 32, %x4 \n\t"
135+
"xvmaddasp 37, 33, %x4 \n\t"
136+
"xvmaddasp 38, 34, %x4 \n\t"
137+
"xvmaddasp 39, 35, %x4 \n\t"
138+
139+
"xvmaddasp 44, 40, %x4 \n\t"
140+
"xvmaddasp 45, 41, %x4 \n\t"
141+
"xvmaddasp 46, 42, %x4 \n\t"
142+
"xvmaddasp 47, 43, %x4 \n\t"
143+
144+
"xvmaddasp 56, 48, %x4 \n\t"
145+
"xvmaddasp 57, 49, %x4 \n\t"
146+
"xvmaddasp 58, 50, %x4 \n\t"
147+
"xvmaddasp 59, 51, %x4 \n\t"
148+
149+
"xvmaddasp 60, 52, %x4 \n\t"
150+
"xvmaddasp 61, 53, %x4 \n\t"
151+
"xvmaddasp 62, 54, %x4 \n\t"
152+
"xvmaddasp 63, 55, %x4 \n\t"
153+
"stxvp 36, 0(%3) \n\t"
154+
"stxvp 38, 32(%3) \n\t"
155+
"stxvp 44, 64(%3) \n\t"
156+
"stxvp 46, 96(%3) \n\t"
157+
"stxvp 56, 128(%3) \n\t"
158+
"stxvp 58, 160(%3) \n\t"
159+
"stxvp 60, 192(%3) \n\t"
160+
"stxvp 62, 224(%3) \n\t"
161+
162+
"#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n"
163+
:
164+
"+m" (*y),
165+
"+r" (n), // 1
166+
"+b" (x), // 2
167+
"+b" (y) // 3
168+
:
169+
"wa" (t0), // 4
170+
"m" (*x)
171+
:
172+
"cr0",
173+
"vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39",
174+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
175+
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
176+
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
177+
);
178+
179+
}
180+
181+

kernel/power/saxpy_power10.c

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
/***************************************************************************
2+
Copyright (c) 2020, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
29+
#include "common.h"
30+
31+
#if defined(__VEC__) || defined(__ALTIVEC__)
32+
#include "saxpy_microk_power10.c"
33+
#endif
34+
35+
#ifndef HAVE_KERNEL_8
36+
/*
 * Portable fallback kernel: y[i] += alpha * x[i] for i in [0, n).
 *
 * Elements are handled in groups of eight with no remainder handling, so
 * n is expected to be a multiple of 8 (the caller in this file passes a
 * multiple of 64). A non-positive n performs no work.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
  BLASLONG base;
  BLASLONG lane;

  for (base = 0; base < n; base += 8)
  {
    /* Same ascending element order as eight unrolled statements. */
    for (lane = 0; lane < 8; lane++)
    {
      y[base + lane] += alpha * x[base + lane];
    }
  }
}
55+
#endif
56+
57+
/*
 * SAXPY entry point: y := da * x + y over n elements.
 *
 * Parameters:
 *   n            element count; n <= 0 returns immediately.
 *   da           scalar multiplier.
 *   x, inc_x     source vector and its element stride.
 *   y, inc_y     destination vector (updated in place) and its stride.
 *   dummy0, dummy1, dummy, dummy2   not used by this function.
 *
 * Returns 0 in every case.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
  BLASLONG k;
  BLASLONG xpos = 0;
  BLASLONG ypos = 0;
  BLASLONG quads;

  if (n <= 0)
  {
    return 0;
  }

  if (inc_x == 1 && inc_y == 1)
  {
    /* Contiguous case: the leading multiple of 64 goes to the kernel,
       the remaining 0..63 elements are handled one at a time. */
    BLASLONG blocked = n & -64;

    if (blocked != 0)
    {
      saxpy_kernel_64(blocked, x, y, da);
    }

    for (k = blocked; k < n; k++)
    {
      y[k] += da * x[k];
    }

    return 0;
  }

  /* Strided case: four elements per iteration, then the leftover 0..3. */
  quads = n & -4;

  for (k = 0; k < quads; k += 4)
  {
    /* All four products are formed before any element of y is written. */
    FLOAT p0 = da * x[xpos];
    FLOAT p1 = da * x[xpos + inc_x];
    FLOAT p2 = da * x[xpos + 2 * inc_x];
    FLOAT p3 = da * x[xpos + 3 * inc_x];

    y[ypos] += p0;
    y[ypos + inc_y] += p1;
    y[ypos + 2 * inc_y] += p2;
    y[ypos + 3 * inc_y] += p3;

    xpos += inc_x * 4;
    ypos += inc_y * 4;
  }

  for (; k < n; k++)
  {
    y[ypos] += da * x[xpos];
    xpos += inc_x;
    ypos += inc_y;
  }

  return 0;
}
118+
119+

0 commit comments

Comments
 (0)