Skip to content

Commit 35086cb

Browse files
authored
Merge pull request #3092 from RajalakshmiSR/cscal_p10
Optimize cscal function for POWER10
2 parents 7745439 + 2056ffc commit 35086cb

File tree

2 files changed

+187
-1
lines changed

2 files changed

+187
-1
lines changed

kernel/power/cscal_microk_power10.c

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
/***************************************************************************
2+
Copyright (c) 2021, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_8 1
/* NOTE(review): presumably tested by the including file (zscal.c) to decide
 * whether a generic C kernel is still needed -- confirm against zscal.c. */
29+
30+
/*
 * zscal_kernel_8 -- in-place scaling of a single-precision complex vector by
 * the complex scalar (alpha_r + i*alpha_i), written as POWER10 inline asm.
 *
 * The name keeps the "zscal" prefix even though the data type is float: the
 * caller in zscal.c invokes zscal_kernel_8() for both precisions and selects
 * this file when DOUBLE is not defined.
 *
 * Parameters:
 *   n        element count; decremented by 16 per processed block. One block
 *            is always processed, even when n <= 16, so the caller must pass
 *            a positive multiple of 16 (zscal.c passes n & -16 and only calls
 *            when that value is > 0).
 *   x        pointer to the data; each block covers 128 bytes starting at x
 *            and the pointer is advanced by 128 bytes per loop iteration.
 *            Results are stored back to the same addresses.
 *   alpha_r  real part of the scalar, broadcast into vs32.
 *   alpha_i  imaginary part of the scalar, used through the sign-alternating
 *            vector t0.
 *
 * Per the inline comments in the asm, every output vector is
 *   (x_r * alpha_r, x_i * alpha_r) + (x_i * -alpha_i, x_r * alpha_i).
 *
 * Structure: the first block is loaded before the loop; each loop iteration
 * computes the current block while loading the following one (offsets
 * 128..224), and the code after label "two" finishes the last loaded block
 * without issuing further loads.
 */
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
31+
{
32+
/* Multiplier for the permuted operand: alternating -alpha_i / +alpha_i. */
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
33+
/* Byte-permute control for xxperm. According to the inline comments below,
 * the permuted vector supplies x_i where the result needs the real part and
 * x_r where it needs the imaginary part.
 * NOTE(review): these byte indices look specific to a little-endian element
 * layout -- confirm behaviour before using this kernel on big-endian builds. */
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
34+
__asm__
35+
(
36+
// Prefetch hint for the start of x.
"dcbt 0, %2 \n\t"
37+
// Convert alpha_r to single-precision word format in vs32 ...
"xscvdpspn 32, %x3 \n\t"
38+
// ... and replicate that word across all four lanes of vs32.
"xxspltw 32, 32, 0 \n\t"
39+
40+
// Preload the first 128-byte block into vs40..vs47 (each lxvp fills a register pair).
"lxvp 40, 0(%2) \n\t"
41+
"lxvp 42, 32(%2) \n\t"
42+
"lxvp 44, 64(%2) \n\t"
43+
"lxvp 46, 96(%2) \n\t"
44+
45+
// n -= 16; if nothing remains beyond the preloaded block, skip the loop.
"addic. %1, %1, -16 \n\t"
46+
"ble two%= \n\t"
47+
48+
".align 5 \n"
49+
"one%=: \n\t"
50+
51+
"xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
52+
"xvmulsp 49, 41, 32 \n\t"
53+
"xvmulsp 50, 42, 32 \n\t"
54+
"xvmulsp 51, 43, 32 \n\t"
55+
"xvmulsp 52, 44, 32 \n\t"
56+
"xvmulsp 53, 45, 32 \n\t"
57+
"xvmulsp 54, 46, 32 \n\t"
58+
"xvmulsp 55, 47, 32 \n\t"
59+
60+
// Permuted copies of the inputs (control vector = mask) go to vs34..vs39, vs56, vs57.
"xxperm 34, 40, %x5 \n\t"
61+
"xxperm 35, 41, %x5 \n\t"
62+
"xxperm 36, 42, %x5 \n\t"
63+
"xxperm 37, 43, %x5 \n\t"
64+
"xxperm 38, 44, %x5 \n\t"
65+
"xxperm 39, 45, %x5 \n\t"
66+
"xxperm 56, 46, %x5 \n\t"
67+
"xxperm 57, 47, %x5 \n\t"
68+
69+
// Multiply the permuted copies by t0; the loads of the NEXT block
// (offsets 128..224) are interleaved now that vs40..vs47 have been consumed.
"xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
70+
"xvmulsp 35, 35, %x4 \n\t"
71+
72+
"lxvp 40, 128(%2) \n\t"
73+
74+
"xvmulsp 36, 36, %x4 \n\t"
75+
"xvmulsp 37, 37, %x4 \n\t"
76+
77+
"lxvp 42, 160(%2) \n\t"
78+
79+
"xvmulsp 38, 38, %x4 \n\t"
80+
"xvmulsp 39, 39, %x4 \n\t"
81+
82+
"lxvp 44, 192(%2) \n\t"
83+
84+
"xvmulsp 56, 56, %x4 \n\t"
85+
"xvmulsp 57, 57, %x4 \n\t"
86+
87+
"lxvp 46, 224(%2) \n\t"
88+
89+
// Sum the two partial products and store the current block back in place.
"xvaddsp 48, 48, 34 \n\t"
90+
"xvaddsp 49, 49, 35 \n\t"
91+
"xvaddsp 50, 50, 36 \n\t"
92+
"xvaddsp 51, 51, 37 \n\t"
93+
94+
"stxvp 48, 0(%2) \n\t"
95+
96+
"xvaddsp 52, 52, 38 \n\t"
97+
"xvaddsp 53, 53, 39 \n\t"
98+
99+
"stxvp 50, 32(%2) \n\t"
100+
101+
"xvaddsp 54, 54, 56 \n\t"
102+
"xvaddsp 55, 55, 57 \n\t"
103+
104+
"stxvp 52, 64(%2) \n\t"
105+
"stxvp 54, 96(%2) \n\t"
106+
107+
// Advance x by one 128-byte block.
"addi %2, %2, 128 \n\t"
108+
109+
// n -= 16; keep looping while more than the already-loaded block remains.
"addic. %1, %1, -16 \n\t"
110+
"bgt one%= \n"
111+
112+
// Tail: same computation for the last loaded block, with no further loads.
"two%=: \n\t"
113+
114+
"xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
115+
"xvmulsp 49, 41, 32 \n\t"
116+
"xvmulsp 50, 42, 32 \n\t"
117+
"xvmulsp 51, 43, 32 \n\t"
118+
"xvmulsp 52, 44, 32 \n\t"
119+
"xvmulsp 53, 45, 32 \n\t"
120+
"xvmulsp 54, 46, 32 \n\t"
121+
"xvmulsp 55, 47, 32 \n\t"
122+
123+
"xxperm 34, 40, %x5 \n\t"
124+
"xxperm 35, 41, %x5 \n\t"
125+
"xxperm 36, 42, %x5 \n\t"
126+
"xxperm 37, 43, %x5 \n\t"
127+
"xxperm 38, 44, %x5 \n\t"
128+
"xxperm 39, 45, %x5 \n\t"
129+
"xxperm 56, 46, %x5 \n\t"
130+
"xxperm 57, 47, %x5 \n\t"
131+
132+
133+
"xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
134+
"xvmulsp 35, 35, %x4 \n\t"
135+
"xvmulsp 36, 36, %x4 \n\t"
136+
"xvmulsp 37, 37, %x4 \n\t"
137+
"xvmulsp 38, 38, %x4 \n\t"
138+
"xvmulsp 39, 39, %x4 \n\t"
139+
"xvmulsp 56, 56, %x4 \n\t"
140+
"xvmulsp 57, 57, %x4 \n\t"
141+
142+
"xvaddsp 48, 48, 34 \n\t"
143+
"xvaddsp 49, 49, 35 \n\t"
144+
"xvaddsp 50, 50, 36 \n\t"
145+
"xvaddsp 51, 51, 37 \n\t"
146+
147+
"stxvp 48, 0(%2) \n\t"
148+
149+
"xvaddsp 52, 52, 38 \n\t"
150+
"xvaddsp 53, 53, 39 \n\t"
151+
152+
"stxvp 50, 32(%2) \n\t"
153+
154+
"xvaddsp 54, 54, 56 \n\t"
155+
"xvaddsp 55, 55, 57 \n\t"
156+
157+
"stxvp 52, 64(%2) \n\t"
158+
"stxvp 54, 96(%2) \n\t"
159+
160+
// The next line starts with '#': it appears intended as an assembler comment
// recording the operand assignment in the generated assembly.
"#n=%1 x=%0=%2 alpha=(%3,%4)\n"
161+
:
// Outputs (read-write).
// NOTE(review): the "+m" operand names only *x (a single float) while the asm
// stores to the whole range -- confirm this is sufficient for the compiler's
// view of memory at the optimisation levels used.
162+
"+m" (*x),
163+
"+r" (n), // 1
164+
"+b" (x) // 2
165+
:
// Inputs.
166+
"f" (alpha_r), // 3
167+
"wa" (t0), // 4
168+
"wa" (mask) // 5
169+
:
// Clobbers: condition register field 0 (written by addic.) and the VSX
// registers named explicitly in the template.
170+
"cr0",
171+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
172+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
173+
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
174+
"vs56","vs57"
175+
);
176+
}

kernel/power/zscal.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3838

3939
#pragma GCC optimize "O1"
4040

41-
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
4241
#if defined(__VEC__) || defined(__ALTIVEC__)
42+
#if defined(POWER8) || defined(POWER9)
4343
#if defined(DOUBLE)
4444
#include "zscal_microk_power8.c"
4545
#endif
46+
#elif defined(POWER10)
47+
#if defined(DOUBLE)
48+
#include "zscal_microk_power8.c"
49+
#else
50+
#include "cscal_microk_power10.c"
51+
#endif
4652
#endif
4753
#endif
4854

@@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
145151
{
146152

147153

154+
#if defined(DOUBLE)
148155
n1 = n & -8;
156+
#else
157+
n1 = n & -16;
158+
#endif
149159
if ( n1 > 0 )
150160
{
151161
zscal_kernel_8(n1, x, da_r, da_i);

0 commit comments

Comments
 (0)