Skip to content

Commit 64629cb

Browse files
authored
Merge pull request #91 from xianyi/develop
rebase
2 parents caf7a12 + 0d98ce2 commit 64629cb

File tree

14 files changed

+764
-27
lines changed

14 files changed

+764
-27
lines changed

README.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,10 @@ Building OpenBLAS requires the following to be installed:
4646

4747
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
4848
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
49-
The full target list is in the file `TargetList.txt`.
49+
The full target list is in the file `TargetList.txt`. For building with `cmake`, the
50+
usual conventions apply, i.e. create a build directory either underneath the top-level
51+
OpenBLAS source directory or separate from it, and invoke `cmake` there with the path
52+
to the source tree and any build options you plan to set.
5053

5154
### Cross compile
5255

@@ -152,13 +155,17 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
152155
- **Falkor**: same as A57 (different cpu specifications)
153156
- **ThunderX**: Optimized some Level-1 functions
154157
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
158+
- **ThunderX3T110**
155159
- **TSV110**: Optimized some Level-3 helper functions
156160
- **EMAG 8180**: preliminary support based on A57
161+
- **Neoverse N1**: (AWS Graviton2) preliminary support
162+
- **Apple Vortex**: preliminary support based on ARMV8
157163

158164
#### PPC/PPC64
159165

160166
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
161167
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
168+
- **POWER10**
162169

163170
#### IBM zEnterprise System
164171

@@ -226,7 +233,8 @@ We provide the following functions to control the number of threads at runtime:
226233
void goto_set_num_threads(int num_threads);
227234
void openblas_set_num_threads(int num_threads);
228235
```
229-
236+
Note that these functions are only used once at library initialization, and are not available for
237+
fine-tuning thread numbers in individual BLAS calls.
230238
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
231239
232240
## Reporting bugs

getarch.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
492492
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
493493
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
494494
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
495-
"-DHAVE_AVX -DHAVE_FMA4"
495+
"-DHAVE_AVX"
496496
#define LIBNAME "bulldozer"
497497
#define CORENAME "BULLDOZER"
498498
#endif
@@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
508508
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
509509
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
510510
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
511-
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
511+
"-DHAVE_AVX -DHAVE_FMA3"
512512
#define LIBNAME "piledriver"
513513
#define CORENAME "PILEDRIVER"
514514
#endif
@@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
524524
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
525525
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
526526
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
527-
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
527+
"-DHAVE_AVX -DHAVE_FMA3"
528528
#define LIBNAME "steamroller"
529529
#define CORENAME "STEAMROLLER"
530530
#endif
@@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
540540
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
541541
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
542542
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
543-
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
543+
"-DHAVE_AVX -DHAVE_FMA3"
544544
#define LIBNAME "excavator"
545545
#define CORENAME "EXCAVATOR"
546546
#endif

kernel/power/KERNEL.POWER10

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,9 @@ endif
151151
ZAXPYKERNEL = zaxpy_power10.c
152152
#
153153
SCOPYKERNEL = scopy.c
154-
DCOPYKERNEL = dcopy.c
154+
DCOPYKERNEL = dcopy_power10.c
155155
CCOPYKERNEL = ccopy.c
156-
ZCOPYKERNEL = zcopy.c
156+
ZCOPYKERNEL = zcopy_power10.c
157157
#
158158
SDOTKERNEL = sdot.c
159159
DDOTKERNEL = ddot.c

kernel/power/dcopy_microk_power10.c

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/***************************************************************************
2+
Copyright (c) 2020, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
/* Tells the including file (dcopy_power10.c) that an optimized
 * dcopy_kernel_64 is provided here, so its portable C fallback, which is
 * guarded by "#ifndef HAVE_KERNEL_64", is not compiled. */
#define HAVE_KERNEL_64 1
29+
30+
/*
 * Copy n doubles from x to y with paired VSX loads/stores (lxvp/stxvp).
 *
 * Data moves in 512-byte blocks: 16 lxvp/stxvp instructions at 32-byte
 * steps (offsets 0..480), i.e. 64 doubles per block, held in vs32..vs63.
 * The code is software pipelined:
 *   - prologue: load the first block from x;
 *   - loop "one": store the block currently in registers to y while
 *     reloading each register pair from the next block of x;
 *   - epilogue "two": store the last loaded block to y.
 *
 * One full block is always loaded and stored, and the counter is only
 * ever decremented by 64, so n is expected to be a positive multiple of
 * 64.  The caller visible in dcopy_power10.c passes (n & -64) and only
 * when that value is > 0.
 *
 * NOTE(review): the memory operands below name only *x and *y (a single
 * double each) and there is no "memory" clobber, although the asm reads
 * and writes n elements.  Confirm this is sufficient for every call site.
 */
static void dcopy_kernel_64 (long n, double *x, double *y)
31+
{
32+
__asm__
33+
(
34+
/* Prologue: load the first 512-byte block (x[0..63]) into vs32..vs63. */
"lxvp 32, 0(%2) \n\t"
35+
"lxvp 34, 32(%2) \n\t"
36+
"lxvp 36, 64(%2) \n\t"
37+
"lxvp 38, 96(%2) \n\t"
38+
"lxvp 40, 128(%2) \n\t"
39+
"lxvp 42, 160(%2) \n\t"
40+
"lxvp 44, 192(%2) \n\t"
41+
"lxvp 46, 224(%2) \n\t"
42+
43+
"lxvp 48, 256(%2) \n\t"
44+
"lxvp 50, 288(%2) \n\t"
45+
"lxvp 52, 320(%2) \n\t"
46+
"lxvp 54, 352(%2) \n\t"
47+
"lxvp 56, 384(%2) \n\t"
48+
"lxvp 58, 416(%2) \n\t"
49+
"lxvp 60, 448(%2) \n\t"
50+
"lxvp 62, 480(%2) \n\t"
51+
/* Advance the source pointer past the block just loaded. */
"addi %2, %2, 512 \n\t"
52+
53+
/* n -= 64; if nothing remains (n <= 0) go straight to the final store. */
"addic. %1, %1, -64 \n\t"
54+
"ble two%= \n\t"
55+
56+
/* Presumably aligns the loop entry (power-of-two .align) -- TODO confirm. */
".align 5 \n"
57+
/* Steady state: each stxvp writes a pair from the previous block to y,
 * then the lxvp right after it refills that same pair from the next
 * block of x. */
"one%=: \n\t"
58+
59+
"stxvp 32, 0(%3) \n\t"
60+
"lxvp 32, 0(%2) \n\t"
61+
"stxvp 34, 32(%3) \n\t"
62+
"lxvp 34, 32(%2) \n\t"
63+
"stxvp 36, 64(%3) \n\t"
64+
"lxvp 36, 64(%2) \n\t"
65+
"stxvp 38, 96(%3) \n\t"
66+
"lxvp 38, 96(%2) \n\t"
67+
68+
"stxvp 40, 128(%3) \n\t"
69+
"lxvp 40, 128(%2) \n\t"
70+
"stxvp 42, 160(%3) \n\t"
71+
"lxvp 42, 160(%2) \n\t"
72+
"stxvp 44, 192(%3) \n\t"
73+
"lxvp 44, 192(%2) \n\t"
74+
"stxvp 46, 224(%3) \n\t"
75+
"lxvp 46, 224(%2) \n\t"
76+
77+
"stxvp 48, 256(%3) \n\t"
78+
"lxvp 48, 256(%2) \n\t"
79+
"stxvp 50, 288(%3) \n\t"
80+
"lxvp 50, 288(%2) \n\t"
81+
"stxvp 52, 320(%3) \n\t"
82+
"lxvp 52, 320(%2) \n\t"
83+
"stxvp 54, 352(%3) \n\t"
84+
"lxvp 54, 352(%2) \n\t"
85+
"stxvp 56, 384(%3) \n\t"
86+
"lxvp 56, 384(%2) \n\t"
87+
"stxvp 58, 416(%3) \n\t"
88+
"lxvp 58, 416(%2) \n\t"
89+
"stxvp 60, 448(%3) \n\t"
90+
"lxvp 60, 448(%2) \n\t"
91+
"stxvp 62, 480(%3) \n\t"
92+
"lxvp 62, 480(%2) \n\t"
93+
94+
/* Advance destination and source by one 512-byte block. */
"addi %3, %3, 512 \n\t"
95+
"addi %2, %2, 512 \n\t"
96+
97+
/* n -= 64; keep looping while elements remain beyond the loaded block. */
"addic. %1, %1, -64 \n\t"
98+
"bgt one%= \n"
99+
100+
/* Epilogue: store the last block still held in vs32..vs63. */
"two%=: \n\t"
101+
102+
"stxvp 32, 0(%3) \n\t"
103+
"stxvp 34, 32(%3) \n\t"
104+
"stxvp 36, 64(%3) \n\t"
105+
"stxvp 38, 96(%3) \n\t"
106+
"stxvp 40, 128(%3) \n\t"
107+
"stxvp 42, 160(%3) \n\t"
108+
"stxvp 44, 192(%3) \n\t"
109+
"stxvp 46, 224(%3) \n\t"
110+
"stxvp 48, 256(%3) \n\t"
111+
"stxvp 50, 288(%3) \n\t"
112+
"stxvp 52, 320(%3) \n\t"
113+
"stxvp 54, 352(%3) \n\t"
114+
"stxvp 56, 384(%3) \n\t"
115+
"stxvp 58, 416(%3) \n\t"
116+
"stxvp 60, 448(%3) \n\t"
117+
"stxvp 62, 480(%3) \n\t"
118+
119+
/* Presumably an assembler-level comment recording the operand mapping in
 * the generated assembly; it also references %0 and %4 -- TODO confirm. */
"#n=%1 x=%4=%2 y=%0=%3"
120+
/* Outputs: %0 is the memory operand for *y; %1..%3 are read-write. */
:
121+
"=m" (*y),
122+
"+r" (n), // 1: remaining element count, decremented by 64 per block
123+
"+b" (x), // 2: source pointer, advanced by 512 bytes per block
124+
"+b" (y) // 3: destination pointer, advanced by 512 bytes per loop pass
125+
/* Inputs: %4 is the memory operand for *x. */
:
126+
"m" (*x)
127+
/* Clobbers: cr0 is written by the record-form addic.; vs32..vs63 hold
 * the data in flight. */
:
128+
"cr0",
129+
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
130+
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
131+
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
132+
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
133+
);
134+
}

kernel/power/dcopy_power10.c

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
/***************************************************************************
2+
Copyright (c) 2020, The OpenBLAS Project
3+
All rights reserved.
4+
Redistribution and use in source and binary forms, with or without
5+
modification, are permitted provided that the following conditions are
6+
met:
7+
1. Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
2. Redistributions in binary form must reproduce the above copyright
10+
notice, this list of conditions and the following disclaimer in
11+
the documentation and/or other materials provided with the
12+
distribution.
13+
3. Neither the name of the OpenBLAS project nor the names of
14+
its contributors may be used to endorse or promote products
15+
derived from this software without specific prior written permission.
16+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25+
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
*****************************************************************************/
27+
28+
#include "common.h"
29+
30+
#if defined(__VEC__) || defined(__ALTIVEC__)
31+
#include "dcopy_microk_power10.c"
32+
#endif
33+
34+
#ifndef HAVE_KERNEL_64
35+
36+
/*
 * Portable fallback copy kernel, compiled only when no optimized
 * micro-kernel has defined HAVE_KERNEL_64.
 *
 * Copies n elements from x to y in groups of eight.  Each group is read
 * completely into temporaries before any element of it is written.
 * There is no tail handling: the loop always moves whole groups, so n
 * must be a multiple of 8 (the caller passes a multiple of 64).
 */
static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y)
{
    FLOAT *src = x;
    FLOAT *dst = y;
    BLASLONG done;

    for (done = 0; done < n; done += 8) {
        FLOAT group[8];
        int k;

        /* Load the whole group first ... */
        for (k = 0; k < 8; k++) {
            group[k] = src[k];
        }
        /* ... then write it out. */
        for (k = 0; k < 8; k++) {
            dst[k] = group[k];
        }

        src += 8;
        dst += 8;
    }
}
73+
74+
75+
#endif
76+
77+
78+
79+
/*
 * Copy n elements of vector x into vector y.
 *
 *   n      number of elements; nothing is done when n <= 0
 *   x      source vector
 *   inc_x  stride between consecutive source elements
 *   y      destination vector
 *   inc_y  stride between consecutive destination elements
 *
 * Always returns 0.
 *
 * When both strides are 1, the largest leading part whose length is a
 * multiple of 64 is handed to dcopy_kernel_64 and the remaining elements
 * are copied one by one.  Any other stride combination uses a plain
 * element-by-element strided loop.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
    BLASLONG k;
    BLASLONG bulk;

    if (n <= 0) {
        return 0;
    }

    /* General strided case. */
    if (inc_x != 1 || inc_y != 1) {
        BLASLONG src = 0;
        BLASLONG dst = 0;

        for (k = 0; k < n; k++) {
            y[dst] = x[src];
            src += inc_x;
            dst += inc_y;
        }
        return 0;
    }

    /* Contiguous case: round n down to a multiple of 64 for the kernel. */
    bulk = n & -64;
    if (bulk > 0) {
        dcopy_kernel_64(bulk, x, y);
    }

    /* Scalar tail: at most 63 elements remain. */
    for (k = bulk; k < n; k++) {
        y[k] = x[k];
    }

    return 0;
}
122+
123+

0 commit comments

Comments
 (0)