Skip to content

Commit 4927251

Browse files
authored
Merge pull request #2750 from RajalakshmiSR/dgemv_p10
dgemv optimization for POWER10
2 parents cb097be + f77b6a8 commit 4927251

File tree

4 files changed

+1675
-2
lines changed

4 files changed

+1675
-2
lines changed

kernel/power/KERNEL.POWER10

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,12 @@ ZSWAPKERNEL = zswap.c
187187
#
188188

189189
SGEMVNKERNEL = sgemv_n.c
190-
DGEMVNKERNEL = dgemv_n.c
190+
DGEMVNKERNEL = dgemv_n_power10.c
191191
CGEMVNKERNEL = cgemv_n.c
192192
ZGEMVNKERNEL = zgemv_n_4.c
193193
#
194194
SGEMVTKERNEL = sgemv_t.c
195-
DGEMVTKERNEL = dgemv_t.c
195+
DGEMVTKERNEL = dgemv_t_power10.c
196196
CGEMVTKERNEL = cgemv_t.c
197197
ZGEMVTKERNEL = zgemv_t_4.c
198198

kernel/power/dgemv_n_microk_power10.c

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
/***************************************************************************
2+
Copyright (c) 2013-2016, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
/**************************************************************************************
29+
* 2016/03/30 Werner Saar ([email protected])
30+
* BLASTEST : OK
31+
* CTEST : OK
32+
* TEST : OK
33+
* LAPACK-TEST : OK
34+
**************************************************************************************/
35+
36+
/* Advertises that this file supplies dgemv_kernel_4x4 (defined below).
   NOTE(review): presumably tested by the file that includes this one so that
   a generic fallback kernel is not compiled as well -- confirm against the
   includer. */
#define HAVE_KERNEL_4x4 1
37+
38+
/*
 * dgemv_kernel_4x4 - accumulate four columns of A, scaled by alpha * x, into y.
 *
 * For each index i handled, the assembly below performs
 *
 *     y[i] += a0[i] * (alpha * x[0]) + a1[i] * (alpha * x[1])
 *           + a2[i] * (alpha * x[2]) + a3[i] * (alpha * x[3])
 *
 * where a0 = ap, a1 = ap + lda, a2 = ap + 2 * lda and a3 = ap + 3 * lda
 * (lda is counted in doubles; it is shifted left by 3 to obtain bytes).
 *
 * Parameters:
 *   n     - element count; decremented by 4 per block of work.  The block at
 *           label two is executed unconditionally, so at least 4 elements are
 *           always processed.  NOTE(review): callers presumably pass a
 *           positive multiple of 4 -- confirm.
 *   ap    - address of the first of the four columns.
 *   lda   - distance between consecutive columns, in doubles.
 *   x     - four multipliers x[0..3]; read once before the loop.
 *   y     - accumulator vector, updated in place.
 *   alpha - scalar multiplied into x[0..3] before the loop.
 *
 * Structure: the first block of each column is loaded ahead of the loop; each
 * of the four unrolled copies in the loop body consumes the previously loaded
 * blocks while loading the next ones; the code at label two consumes the last
 * loaded blocks without issuing further column loads.  Every lxvp/stxvp moves
 * a register pair, and the pointers advance by 32 bytes (4 doubles) per block.
 *
 * Register use (VSX): vs32-vs35 hold the four splatted alpha*x[k] values,
 * vs36/vs37 the current y block, vs40/41, vs42/43, vs44/45, vs46/47 the
 * current blocks of a0, a1, a2, a3.  All are listed as clobbers, as is cr0
 * (written by the record-form addic. instructions).
 *
 * NOTE(review): operand %11 (the constant 16) is referenced only in the
 * trailing "#n=..." debug comment string, not by any instruction.
 * NOTE(review): the "m" operands name only *x, *ap and *y (the first element
 * of each array) although the assembly touches more; confirm this is enough
 * to order surrounding accesses.
 */
static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
39+
{
40+
double *a0; // column pointers; produced and advanced inside the asm (operands %3..%6)
41+
double *a1;
42+
double *a2;
43+
double *a3;
44+
45+
__asm__
46+
(
47+
"lxvp 40, 0(%10) \n\t" // x[0..3] -> vs41 = {x0, x1}, vs40 = {x2, x3} (as used by the xvmuldp below)
48+
XXSPLTD_S(32,%x9,0) // alpha, alpha
49+
50+
"sldi %6, %13, 3 \n\t" // %6 = lda * sizeof (double); %6 is scratch until it becomes a3
51+
52+
"xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
53+
"xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
54+
55+
"add %4, %3, %6 \n\t" // a0 = ap (tied input), a1 = a0 + lda; overwrites lda held in %4 (== %13)
56+
"add %6, %6, %6 \n\t" // 2 * lda (in bytes)
57+
58+
XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
59+
XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
60+
XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
61+
XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
62+
63+
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
64+
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
65+
66+
"dcbt 0, %3 \n\t" // cache-touch hints on the four column pointers
67+
"dcbt 0, %4 \n\t"
68+
"dcbt 0, %5 \n\t"
69+
"dcbt 0, %6 \n\t"
70+
71+
"lxvp 40, 0(%3) \n\t" // first block of a0 -> vs40, vs41
72+
73+
"lxvp 42, 0(%4) \n\t" // first block of a1 -> vs42, vs43
74+
75+
"lxvp 44, 0(%5) \n\t" // first block of a2 -> vs44, vs45
76+
77+
"lxvp 46, 0(%6) \n\t" // first block of a3 -> vs46, vs47
78+
79+
"dcbt 0, %2 \n\t" // cache-touch hint on y
80+
81+
"addi %3, %3, 32 \n\t" // advance all four column pointers past the block just loaded
82+
"addi %4, %4, 32 \n\t"
83+
"addi %5, %5, 32 \n\t"
84+
"addi %6, %6, 32 \n\t"
85+
86+
"addic. %1, %1, -4 \n\t" // n -= 4; sets cr0
87+
"ble two%= \n\t" // nothing left beyond the loaded block: go straight to the tail
88+
89+
".align 5 \n"
90+
"one%=: \n\t"
91+
92+
"lxvp 36, 0(%2) \n\t" // unrolled copy 1 of 4: current y block -> vs36, vs37
93+
94+
"xvmaddadp 36, 40, 32 \n\t" // y += a0 block * (x0 * alpha)
95+
"xvmaddadp 37, 41, 32 \n\t"
96+
97+
"lxvp 40, 0(%3) \n\t" // load next a0 block
98+
99+
"xvmaddadp 36, 42, 33 \n\t" // y += a1 block * (x1 * alpha)
100+
"addi %3, %3, 32 \n\t"
101+
"xvmaddadp 37, 43, 33 \n\t"
102+
103+
"lxvp 42, 0(%4) \n\t" // load next a1 block
104+
105+
"xvmaddadp 36, 44, 34 \n\t" // y += a2 block * (x2 * alpha)
106+
"addi %4, %4, 32 \n\t"
107+
"xvmaddadp 37, 45, 34 \n\t"
108+
109+
"lxvp 44, 0(%5) \n\t" // load next a2 block
110+
111+
"xvmaddadp 36, 46, 35 \n\t" // y += a3 block * (x3 * alpha)
112+
"addi %5, %5, 32 \n\t"
113+
"xvmaddadp 37, 47, 35 \n\t"
114+
115+
"stxvp 36, 0(%2) \n\t" // store updated y block
116+
117+
"lxvp 46, 0(%6) \n\t" // load next a3 block
118+
119+
"addi %6, %6, 32 \n\t"
120+
"addi %2, %2, 32 \n\t" // advance y
121+
122+
"addic. %1, %1, -4 \n\t"
123+
"ble two%= \n\t" // last loaded block is finished by the tail
124+
125+
126+
"lxvp 36, 0(%2) \n\t" // unrolled copy 2 of 4: current y block -> vs36, vs37
127+
128+
"xvmaddadp 36, 40, 32 \n\t"
129+
"xvmaddadp 37, 41, 32 \n\t"
130+
131+
"lxvp 40, 0(%3) \n\t" // load next a0 block
132+
133+
"xvmaddadp 36, 42, 33 \n\t"
134+
"addi %3, %3, 32 \n\t"
135+
"xvmaddadp 37, 43, 33 \n\t"
136+
137+
"lxvp 42, 0(%4) \n\t" // load next a1 block
138+
139+
"xvmaddadp 36, 44, 34 \n\t"
140+
"addi %4, %4, 32 \n\t"
141+
"xvmaddadp 37, 45, 34 \n\t"
142+
143+
"lxvp 44, 0(%5) \n\t" // load next a2 block
144+
145+
"xvmaddadp 36, 46, 35 \n\t"
146+
"addi %5, %5, 32 \n\t"
147+
"xvmaddadp 37, 47, 35 \n\t"
148+
149+
"stxvp 36, 0(%2) \n\t" // store updated y block
150+
151+
"lxvp 46, 0(%6) \n\t" // load next a3 block
152+
153+
"addi %6, %6, 32 \n\t"
154+
"addi %2, %2, 32 \n\t"
155+
156+
"addic. %1, %1, -4 \n\t"
157+
"ble two%= \n\t"
158+
159+
160+
"lxvp 36, 0(%2) \n\t" // unrolled copy 3 of 4: current y block -> vs36, vs37
161+
162+
"xvmaddadp 36, 40, 32 \n\t"
163+
"xvmaddadp 37, 41, 32 \n\t"
164+
165+
"lxvp 40, 0(%3) \n\t" // load next a0 block
166+
167+
"xvmaddadp 36, 42, 33 \n\t"
168+
"addi %3, %3, 32 \n\t"
169+
"xvmaddadp 37, 43, 33 \n\t"
170+
171+
"lxvp 42, 0(%4) \n\t" // load next a1 block
172+
173+
"xvmaddadp 36, 44, 34 \n\t"
174+
"addi %4, %4, 32 \n\t"
175+
"xvmaddadp 37, 45, 34 \n\t"
176+
177+
"lxvp 44, 0(%5) \n\t" // load next a2 block
178+
179+
"xvmaddadp 36, 46, 35 \n\t"
180+
"addi %5, %5, 32 \n\t"
181+
"xvmaddadp 37, 47, 35 \n\t"
182+
183+
"stxvp 36, 0(%2) \n\t" // store updated y block
184+
185+
"lxvp 46, 0(%6) \n\t" // load next a3 block
186+
187+
"addi %6, %6, 32 \n\t"
188+
"addi %2, %2, 32 \n\t"
189+
190+
"addic. %1, %1, -4 \n\t"
191+
"ble two%= \n\t"
192+
193+
194+
"lxvp 36, 0(%2) \n\t" // unrolled copy 4 of 4: current y block -> vs36, vs37
195+
196+
"xvmaddadp 36, 40, 32 \n\t"
197+
"xvmaddadp 37, 41, 32 \n\t"
198+
199+
"lxvp 40, 0(%3) \n\t" // load next a0 block
200+
201+
"xvmaddadp 36, 42, 33 \n\t"
202+
"addi %3, %3, 32 \n\t"
203+
"xvmaddadp 37, 43, 33 \n\t"
204+
205+
"lxvp 42, 0(%4) \n\t" // load next a1 block
206+
207+
"xvmaddadp 36, 44, 34 \n\t"
208+
"addi %4, %4, 32 \n\t"
209+
"xvmaddadp 37, 45, 34 \n\t"
210+
211+
"lxvp 44, 0(%5) \n\t" // load next a2 block
212+
213+
"xvmaddadp 36, 46, 35 \n\t"
214+
"addi %5, %5, 32 \n\t"
215+
"xvmaddadp 37, 47, 35 \n\t"
216+
217+
"stxvp 36, 0(%2) \n\t" // store updated y block
218+
219+
"lxvp 46, 0(%6) \n\t" // load next a3 block
220+
221+
"addi %6, %6, 32 \n\t"
222+
"addi %2, %2, 32 \n\t"
223+
224+
"addic. %1, %1, -4 \n\t"
225+
"bgt one%= \n" // more than one block still outstanding: run the unrolled body again
226+
227+
"two%=: \n\t" // tail: consume the already-loaded column blocks; no further column loads
228+
229+
"lxvp 36, 0(%2) \n\t" // last y block -> vs36, vs37
230+
231+
"xvmaddadp 36, 40, 32 \n\t"
232+
"xvmaddadp 37, 41, 32 \n\t"
233+
234+
"xvmaddadp 36, 42, 33 \n\t"
235+
"xvmaddadp 37, 43, 33 \n\t"
236+
237+
"xvmaddadp 36, 44, 34 \n\t"
238+
"xvmaddadp 37, 45, 34 \n\t"
239+
240+
"xvmaddadp 36, 46, 35 \n\t"
241+
"xvmaddadp 37, 47, 35 \n\t"
242+
243+
"stxvp 36, 0(%2) \n\t" // store last y block
244+
245+
"#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n" // assembler-comment text listing the operand mapping
246+
"#a0=%3 a1=%4 a2=%5 a3=%6"
247+
: // outputs
248+
"+m" (*y), // 0: names only *y as read and written
249+
"+r" (n), // 1
250+
"+b" (y), // 2
251+
"=b" (a0), // 3: receives ap through the tied input operand 12
252+
"=b" (a1), // 4: receives lda through the tied input operand 13, then becomes a1
253+
"=&b" (a2), // 5
254+
"=&b" (a3) // 6: early-clobber; written before all inputs have been read
255+
: // inputs
256+
"m" (*x), // 7
257+
"m" (*ap), // 8
258+
"d" (alpha), // 9
259+
"r" (x), // 10
260+
"b" (16), // 11: only referenced in the "#n=..." comment string above
261+
"3" (ap), // 12
262+
"4" (lda) // 13
263+
: // clobbers
264+
"cr0",
265+
"vs32","vs33","vs34","vs35","vs36","vs37",
266+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
267+
);
268+
}

0 commit comments

Comments
 (0)