Skip to content

Commit 46c9357

Browse files
authored
Merge pull request #1288 from quickwritereader/develop
Optimized standard Blas Level-1,2 (excluding nrm2 functions) for z13 (double precision). Issue 884
2 parents 1c3e2d3 + 1cfdb22 commit 46c9357

26 files changed

+7230
-26
lines changed

CONTRIBUTORS.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,5 +166,5 @@ In chronological order:
166166
* [2017-01-01] dgemm and dtrmm kernels for IBM z13
167167
* [2017-02-26] ztrmm kernel for IBM z13
168168
* [2017-03-13] strmm and ctrmm kernel for IBM z13
169-
169+
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
170170

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ Please read GotoBLAS_01Readme.txt
107107
- **ARM Cortex-A57**: Experimental
108108

109109
#### IBM zEnterprise System:
110-
- **Z13**: Optimized Level-3 BLAS
110+
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
111111

112112

113113
### Support OS:

interface/axpy.c

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,12 @@
4040
#include "common.h"
4141
#ifdef FUNCTION_PROFILE
4242
#include "functable.h"
43+
#endif
44+
#if defined(Z13)
45+
#define MULTI_THREAD_MINIMAL 200000
46+
#else
47+
#define MULTI_THREAD_MINIMAL 10000
4348
#endif
44-
4549
#ifndef CBLAS
4650

4751
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@@ -88,7 +92,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
8892

8993
//Temporarily work-around the low performance issue with small input size &
9094
//multithreads.
91-
if (n <= 10000)
95+
if (n <= MULTI_THREAD_MINIMAL)
9296
nthreads = 1;
9397

9498
if (nthreads == 1) {

kernel/zarch/KERNEL.Z13

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,14 @@ SMINKERNEL = ../arm/min.c
1515
DMINKERNEL = ../arm/min.c
1616

1717
ISAMAXKERNEL = ../arm/iamax.c
18-
IDAMAXKERNEL = ../arm/iamax.c
18+
IDAMAXKERNEL = idamax.c
1919
ICAMAXKERNEL = ../arm/izamax.c
20-
IZAMAXKERNEL = ../arm/izamax.c
20+
IZAMAXKERNEL = izamax.c
2121

2222
ISAMINKERNEL = ../arm/iamin.c
23-
IDAMINKERNEL = ../arm/iamin.c
23+
IDAMINKERNEL = idamin.c
2424
ICAMINKERNEL = ../arm/izamin.c
25-
IZAMINKERNEL = ../arm/izamin.c
25+
IZAMINKERNEL = izamin.c
2626

2727
ISMAXKERNEL = ../arm/imax.c
2828
IDMAXKERNEL = ../arm/imax.c
@@ -31,54 +31,54 @@ ISMINKERNEL = ../arm/imin.c
3131
IDMINKERNEL = ../arm/imin.c
3232

3333
SASUMKERNEL = ../arm/asum.c
34-
DASUMKERNEL = ../arm/asum.c
34+
DASUMKERNEL = dasum.c
3535
CASUMKERNEL = ../arm/zasum.c
36-
ZASUMKERNEL = ../arm/zasum.c
36+
ZASUMKERNEL = zasum.c
3737

3838
SAXPYKERNEL = ../arm/axpy.c
39-
DAXPYKERNEL = ../arm/axpy.c
39+
DAXPYKERNEL = daxpy.c
4040
CAXPYKERNEL = ../arm/zaxpy.c
41-
ZAXPYKERNEL = ../arm/zaxpy.c
41+
ZAXPYKERNEL = zaxpy.c
4242

4343
SCOPYKERNEL = ../arm/copy.c
44-
DCOPYKERNEL = ../arm/copy.c
44+
DCOPYKERNEL = dcopy.c
4545
CCOPYKERNEL = ../arm/zcopy.c
46-
ZCOPYKERNEL = ../arm/zcopy.c
46+
ZCOPYKERNEL = zcopy.c
4747

4848
SDOTKERNEL = ../arm/dot.c
49-
DDOTKERNEL = ../arm/dot.c
49+
DDOTKERNEL = ddot.c
5050
CDOTKERNEL = ../arm/zdot.c
51-
ZDOTKERNEL = ../arm/zdot.c
51+
ZDOTKERNEL = zdot.c
5252

5353
SNRM2KERNEL = ../arm/nrm2.c
5454
DNRM2KERNEL = ../arm/nrm2.c
5555
CNRM2KERNEL = ../arm/znrm2.c
5656
ZNRM2KERNEL = ../arm/znrm2.c
5757

5858
SROTKERNEL = ../arm/rot.c
59-
DROTKERNEL = ../arm/rot.c
59+
DROTKERNEL = drot.c
6060
CROTKERNEL = ../arm/zrot.c
61-
ZROTKERNEL = ../arm/zrot.c
61+
ZROTKERNEL = zrot.c
6262

6363
SSCALKERNEL = ../arm/scal.c
64-
DSCALKERNEL = ../arm/scal.c
64+
DSCALKERNEL = dscal.c
6565
CSCALKERNEL = ../arm/zscal.c
66-
ZSCALKERNEL = ../arm/zscal.c
66+
ZSCALKERNEL = zscal.c
6767

6868
SSWAPKERNEL = ../arm/swap.c
69-
DSWAPKERNEL = ../arm/swap.c
69+
DSWAPKERNEL = dswap.c
7070
CSWAPKERNEL = ../arm/zswap.c
71-
ZSWAPKERNEL = ../arm/zswap.c
71+
ZSWAPKERNEL = zswap.c
7272

7373
SGEMVNKERNEL = ../arm/gemv_n.c
74-
DGEMVNKERNEL = ../arm/gemv_n.c
74+
DGEMVNKERNEL = dgemv_n_4.c
7575
CGEMVNKERNEL = ../arm/zgemv_n.c
76-
ZGEMVNKERNEL = ../arm/zgemv_n.c
76+
ZGEMVNKERNEL = zgemv_n_4.c
7777

7878
SGEMVTKERNEL = ../arm/gemv_t.c
79-
DGEMVTKERNEL = ../arm/gemv_t.c
79+
DGEMVTKERNEL = dgemv_t_4.c
8080
CGEMVTKERNEL = ../arm/zgemv_t.c
81-
ZGEMVTKERNEL = ../arm/zgemv_t.c
81+
ZGEMVTKERNEL = zgemv_t_4.c
8282

8383
STRMMKERNEL = strmm8x4V.S
8484
DTRMMKERNEL = trmm8x4V.S

kernel/zarch/dasum.c

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
/***************************************************************************
2+
Copyright (c) 2013-2017, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
29+
#include "common.h"
30+
#include <math.h>
31+
32+
/* Select the libm absolute-value routine: fabs when DOUBLE is defined,
   fabsf otherwise. */
#ifdef DOUBLE
#define ABS fabs
#else
#define ABS fabsf
#endif
37+
38+
39+
/*
 * Vectorised sum of absolute values over a contiguous run of doubles.
 *
 * n : number of elements to process. The loop body always executes at least
 *     once and consumes 32 elements (256 bytes) per iteration, so n must be
 *     a positive multiple of 32.
 * x : address of the first element; the data is only read.
 *
 * Returns the sum of |x[0]| .. |x[n-1]|.
 *
 * The result is handed back through an explicit "=f" output operand and a
 * normal return statement instead of relying on the value being left behind
 * in %f0, and the pointer is declared read-write ("+&a") because the loop
 * advances it. x is a by-value parameter, so the caller's pointer is not
 * affected.
 */
static FLOAT __attribute__ ((noinline)) dasum_kernel_32(BLASLONG n, FLOAT *x) {
    FLOAT asum;

    __asm__ (
        "pfd    1, 0(%[ptr_x])              \n\t"
        /* r0 = ptr_x + n * 8 : address one past the last element */
        "sllg   %%r0,%[n],3                 \n\t"
        "agr    %%r0,%[ptr_x]               \n\t"
        /* v0..v3 are four independent accumulators (two doubles each) */
        "vzero  %%v0                        \n\t"
        "vzero  %%v1                        \n\t"
        "vzero  %%v2                        \n\t"
        "vzero  %%v3                        \n\t"
        ".align 16                          \n\t"
        "1:                                 \n\t"
        "pfd    1, 256(%[ptr_x])            \n\t"
        /* first 128 bytes of the 256-byte chunk */
        "vlm    %%v24,%%v31, 0(%[ptr_x])    \n\t"

        "vflpdb %%v24, %%v24                \n\t"
        "vflpdb %%v25, %%v25                \n\t"
        "vflpdb %%v26, %%v26                \n\t"
        "vflpdb %%v27, %%v27                \n\t"
        "vflpdb %%v28, %%v28                \n\t"
        "vflpdb %%v29, %%v29                \n\t"
        "vflpdb %%v30, %%v30                \n\t"
        "vflpdb %%v31, %%v31                \n\t"

        "vfadb  %%v0,%%v0,%%v24             \n\t"
        "vfadb  %%v1,%%v1,%%v25             \n\t"
        "vfadb  %%v2,%%v2,%%v26             \n\t"
        "vfadb  %%v3,%%v3,%%v27             \n\t"
        "vfadb  %%v0,%%v0,%%v28             \n\t"
        "vfadb  %%v1,%%v1,%%v29             \n\t"
        "vfadb  %%v2,%%v2,%%v30             \n\t"
        "vfadb  %%v3,%%v3,%%v31             \n\t"

        /* second 128 bytes of the chunk */
        "vlm    %%v24,%%v31, 128(%[ptr_x])  \n\t"

        "vflpdb %%v24, %%v24                \n\t"
        "vflpdb %%v25, %%v25                \n\t"
        "vflpdb %%v26, %%v26                \n\t"
        "vflpdb %%v27, %%v27                \n\t"
        "vflpdb %%v28, %%v28                \n\t"
        "vflpdb %%v29, %%v29                \n\t"
        "vflpdb %%v30, %%v30                \n\t"
        "vflpdb %%v31, %%v31                \n\t"
        /* advance to the next 256-byte chunk */
        "la     %[ptr_x],256(%[ptr_x])      \n\t"
        "vfadb  %%v0,%%v0,%%v24             \n\t"
        "vfadb  %%v1,%%v1,%%v25             \n\t"
        "vfadb  %%v2,%%v2,%%v26             \n\t"
        "vfadb  %%v3,%%v3,%%v27             \n\t"
        "vfadb  %%v0,%%v0,%%v28             \n\t"
        "vfadb  %%v1,%%v1,%%v29             \n\t"
        "vfadb  %%v2,%%v2,%%v30             \n\t"
        "vfadb  %%v3,%%v3,%%v31             \n\t"

        /* loop while ptr_x < end address (unsigned compare) */
        "clgrjl %[ptr_x],%%r0,1b            \n\t"
        /* fold the four accumulators into v0, then add its two lanes */
        "vfadb  %%v24,%%v0,%%v1             \n\t"
        "vfadb  %%v25,%%v2,%%v3             \n\t"
        "vfadb  %%v0,%%v25,%%v24            \n\t"
        "vrepg  %%v1,%%v0,1                 \n\t"
        "adbr   %%f0,%%f1                   \n\t"
        /* copy the scalar result into the compiler-chosen output register */
        "ldr    %[asum],%%f0                \n\t"
        : [asum] "=f"(asum), [ptr_x] "+&a"(x)
        : [n] "r"(n)
        : "cc", "memory","r0","f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31"
    );

    return asum;
}
104+
105+
106+
107+
/*
 * Sum of absolute values of n elements of x taken with stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0.
 * For unit stride the largest leading multiple of 32 elements is handed to
 * dasum_kernel_32 and the remainder is added one element at a time.
 * For any other stride the elements are accumulated four at a time into two
 * alternating partial sums, followed by a scalar tail.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    FLOAT total = 0.0;
    BLASLONG pos = 0;      /* index into x */
    BLASLONG seen = 0;     /* elements consumed so far (strided path) */
    BLASLONG bulk;

    if (n <= 0 || inc_x <= 0) {
        return total;
    }

    if (inc_x == 1) {
        /* Contiguous data: vector kernel on whole 32-element groups. */
        bulk = n & -32;
        if (bulk > 0) {
            total = dasum_kernel_32(bulk, x);
            pos = bulk;
        }

        /* Leftover elements that do not fill a 32-element group. */
        for (; pos < n; pos++) {
            total += ABS(x[pos]);
        }

        return total;
    }

    /* Strided data: unroll by four with two independent accumulators. */
    {
        FLOAT acc_a = 0.0;
        FLOAT acc_b = 0.0;

        bulk = n & -4;
        for (; seen < bulk; seen += 4) {
            acc_a += ABS(x[pos]);
            acc_b += ABS(x[pos + inc_x]);
            acc_a += ABS(x[pos + 2 * inc_x]);
            acc_b += ABS(x[pos + 3 * inc_x]);
            pos += inc_x * 4;
        }
        total = acc_a + acc_b;
    }

    /* Up to three trailing elements. */
    for (; seen < n; seen++) {
        total += ABS(x[pos]);
        pos += inc_x;
    }

    return total;
}
158+
159+

0 commit comments

Comments
 (0)