Skip to content

Commit 3ede843

Browse files
author
Rajalakshmi Srinivasaraghavan
committed
Optimize s/dscal function for POWER10
This patch makes use of new POWER10 vector pair instructions for loads and stores.
1 parent 69a5558 commit 3ede843

File tree

4 files changed

+339
-2
lines changed

4 files changed

+339
-2
lines changed

kernel/power/dscal.c

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535

3636
#include "common.h"
3737

38-
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
3938
#if defined(__VEC__) || defined(__ALTIVEC__)
39+
#if defined(POWER8) || defined(POWER9)
4040
#include "dscal_microk_power8.c"
41+
#elif defined(POWER10)
42+
#include "dscal_microk_power10.c"
4143
#endif
4244
#endif
4345

@@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
100102
if ( da == 0.0 )
101103
{
102104

105+
#if defined(POWER10)
106+
if ( n >= 16 )
107+
{
108+
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
109+
for (j = 0; j < align; j++) {
110+
x[j] = 0.0;
111+
}
112+
}
113+
BLASLONG n1 = (n-j) & -16;
114+
if ( n1 > 0 )
115+
{
116+
dscal_kernel_8_zero(n1, &x[j]);
117+
j+=n1;
118+
}
119+
#else
103120
BLASLONG n1 = n & -16;
104121
if ( n1 > 0 )
105122
{
106123
dscal_kernel_8_zero(n1, x);
107124
j=n1;
108125
}
126+
#endif
109127

110128
while(j < n)
111129
{
@@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
118136
else
119137
{
120138

139+
#if defined(POWER10)
140+
if ( n >= 16 )
141+
{
142+
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
143+
for (j = 0; j < align; j++) {
144+
x[j] = da * x[j];
145+
}
146+
}
147+
BLASLONG n1 = (n-j) & -16;
148+
if ( n1 > 0 )
149+
{
150+
dscal_kernel_8(n1, &x[j], da);
151+
j+=n1;
152+
}
153+
#else
121154
BLASLONG n1 = n & -16;
122155
if ( n1 > 0 )
123156
{
124157
dscal_kernel_8(n1, x, da);
125158
j=n1;
126159
}
160+
#endif
127161
while(j < n)
128162
{
129163

kernel/power/dscal_microk_power10.c

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/***************************************************************************
2+
Copyright (c) 2021, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#define HAVE_KERNEL_8 1
29+
30+
static void dscal_kernel_8 (long n, double *x, double alpha)
31+
{
32+
__asm__
33+
(
34+
"dcbt 0, %2 \n\t"
35+
36+
XXSPLTD_S(48,%x3,0)
37+
38+
"lxvp 32, 0(%2) \n\t"
39+
"lxvp 34, 32(%2) \n\t"
40+
"lxvp 36, 64(%2) \n\t"
41+
"lxvp 38, 96(%2) \n\t"
42+
43+
"addic. %1, %1, -16 \n\t"
44+
"ble two%= \n\t"
45+
46+
".align 5 \n"
47+
"one%=: \n\t"
48+
49+
"xvmuldp 40, 32, 48 \n\t"
50+
"xvmuldp 41, 33, 48 \n\t"
51+
"xvmuldp 42, 34, 48 \n\t"
52+
"xvmuldp 43, 35, 48 \n\t"
53+
"lxvp 32, 128(%2) \n\t"
54+
"lxvp 34, 160(%2) \n\t"
55+
"xvmuldp 44, 36, 48 \n\t"
56+
"xvmuldp 45, 37, 48 \n\t"
57+
"xvmuldp 46, 38, 48 \n\t"
58+
"xvmuldp 47, 39, 48 \n\t"
59+
"lxvp 36, 192(%2) \n\t"
60+
"lxvp 38, 224(%2) \n\t"
61+
62+
"stxvp 40, 0(%2) \n\t"
63+
"stxvp 42, 32(%2) \n\t"
64+
"stxvp 44, 64(%2) \n\t"
65+
"stxvp 46, 96(%2) \n\t"
66+
67+
"addi %2, %2, 128 \n\t"
68+
69+
"addic. %1, %1, -16 \n\t"
70+
"bgt one%= \n"
71+
72+
"two%=: \n\t"
73+
74+
"xvmuldp 40, 32, 48 \n\t"
75+
"xvmuldp 41, 33, 48 \n\t"
76+
"xvmuldp 42, 34, 48 \n\t"
77+
"xvmuldp 43, 35, 48 \n\t"
78+
79+
"xvmuldp 44, 36, 48 \n\t"
80+
"xvmuldp 45, 37, 48 \n\t"
81+
"xvmuldp 46, 38, 48 \n\t"
82+
"xvmuldp 47, 39, 48 \n\t"
83+
84+
"stxvp 40, 0(%2) \n\t"
85+
"stxvp 42, 32(%2) \n\t"
86+
"stxvp 44, 64(%2) \n\t"
87+
"stxvp 46, 96(%2) \n\t"
88+
89+
"#n=%1 alpha=%3 x=%0=%2"
90+
:
91+
"+m" (*x),
92+
"+r" (n), // 1
93+
"+b" (x) // 2
94+
:
95+
"d" (alpha) // 3
96+
:
97+
"cr0",
98+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
99+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48"
100+
);
101+
}
102+
103+
104+
static void dscal_kernel_8_zero (long n, double *x)
105+
{
106+
107+
__asm__
108+
(
109+
"xxlxor 32, 32, 32 \n\t"
110+
"xxlxor 33, 33, 33 \n\t"
111+
112+
".align 5 \n"
113+
"one%=: \n\t"
114+
115+
"stxvp 32, 0(%2) \n\t"
116+
"stxvp 32, 32(%2) \n\t"
117+
"stxvp 32, 64(%2) \n\t"
118+
"stxvp 32, 96(%2) \n\t"
119+
120+
"addi %2, %2, 128 \n\t"
121+
122+
"addic. %1, %1, -16 \n\t"
123+
"bgt one%= \n"
124+
125+
"#n=%1 x=%0=%2 "
126+
:
127+
"=m" (*x),
128+
"+r" (n), // 1
129+
"+b" (x) // 2
130+
:
131+
:
132+
"cr0","vs32","vs33"
133+
);
134+
}

kernel/power/sscal.c

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3535

3636
#include "common.h"
3737

38-
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
3938
#if defined(__VEC__) || defined(__ALTIVEC__)
39+
#if defined(POWER8) || defined(POWER9)
4040
#include "sscal_microk_power8.c"
41+
#elif defined(POWER10)
42+
#include "sscal_microk_power10.c"
4143
#endif
4244
#endif
4345

@@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
102104
if ( da == 0.0 )
103105
{
104106

107+
#if defined(POWER10)
108+
if ( n >= 32 )
109+
{
110+
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
111+
for (j = 0; j < align; j++) {
112+
x[j] = 0.0;
113+
}
114+
}
115+
BLASLONG n1 = (n-j) & -32;
116+
if ( n1 > 0 )
117+
{
118+
sscal_kernel_16_zero(n1, &x[j]);
119+
j+=n1;
120+
}
121+
#else
105122
BLASLONG n1 = n & -32;
106123
if ( n1 > 0 )
107124
{
108125
sscal_kernel_16_zero(n1, x);
109126
j=n1;
110127
}
128+
#endif
111129

112130
while(j < n)
113131
{
@@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
120138
else
121139
{
122140

141+
#if defined(POWER10)
142+
if ( n >= 32 )
143+
{
144+
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
145+
for (j = 0; j < align; j++) {
146+
x[j] = da * x[j];
147+
}
148+
}
149+
BLASLONG n1 = (n-j) & -32;
150+
if ( n1 > 0 )
151+
{
152+
sscal_kernel_16(n1, &x[j], da);
153+
j+=n1;
154+
}
155+
#else
123156
BLASLONG n1 = n & -32;
124157
if ( n1 > 0 )
125158
{
126159
sscal_kernel_16(n1, x, da);
127160
j=n1;
128161
}
162+
#endif
129163
while(j < n)
130164
{
131165

0 commit comments

Comments
 (0)