Skip to content

Commit 61fae59

Browse files
authored
Merge pull request #88 from xianyi/develop
rebase
2 parents 8d12027 + 33d22f9 commit 61fae59

File tree

14 files changed, with 226 additions and 203 deletions.

.travis.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,17 @@ matrix:
204204
env:
205205
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
206206

207+
- <<: *test-macos
208+
osx_image: xcode12
209+
before_script:
210+
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
211+
- brew update
212+
- brew install gcc@10 # for gfortran
213+
script:
214+
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
215+
env:
216+
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
217+
207218
- <<: *test-macos
208219
osx_image: xcode10.0
209220
env:

Makefile.zarch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ endif
1212
# Enable floating-point expression contraction for clang, since it is the
1313
# default for gcc
1414
ifeq ($(C_COMPILER), CLANG)
15-
CCOMMON_OPT += -ffp-contract=fast
15+
CCOMMON_OPT += -ffp-contract=on
1616
endif

kernel/zarch/cscal.c

Lines changed: 30 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -25,67 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
2525
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2626
*****************************************************************************/
2727

28+
/*
29+
* Avoid contraction of floating point operations, specifically fused
30+
* multiply-add, because they can cause unexpected results in complex
31+
* multiplication.
32+
*/
33+
#if defined(__GNUC__) && !defined(__clang__)
34+
#pragma GCC optimize ("fp-contract=off")
35+
#endif
36+
37+
#if defined(__clang__)
38+
#pragma clang fp contract(off)
39+
#endif
40+
2841
#include "common.h"
42+
#include "vector-common.h"
2943

30-
static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) {
31-
__asm__("vlrepf %%v0,0(%[alpha])\n\t"
32-
"vlef %%v1,4(%[alpha]),0\n\t"
33-
"vlef %%v1,4(%[alpha]),2\n\t"
34-
"vflcsb %%v1,%%v1\n\t"
35-
"vlef %%v1,4(%[alpha]),1\n\t"
36-
"vlef %%v1,4(%[alpha]),3\n\t"
37-
"srlg %[n],%[n],4\n\t"
38-
"xgr %%r1,%%r1\n\t"
39-
"0:\n\t"
40-
"pfd 2, 1024(%%r1,%[x])\n\t"
41-
"vl %%v16,0(%%r1,%[x])\n\t"
42-
"vl %%v17,16(%%r1,%[x])\n\t"
43-
"vl %%v18,32(%%r1,%[x])\n\t"
44-
"vl %%v19,48(%%r1,%[x])\n\t"
45-
"vl %%v20,64(%%r1,%[x])\n\t"
46-
"vl %%v21,80(%%r1,%[x])\n\t"
47-
"vl %%v22,96(%%r1,%[x])\n\t"
48-
"vl %%v23,112(%%r1,%[x])\n\t"
49-
"verllg %%v24,%%v16,32\n\t"
50-
"verllg %%v25,%%v17,32\n\t"
51-
"verllg %%v26,%%v18,32\n\t"
52-
"verllg %%v27,%%v19,32\n\t"
53-
"verllg %%v28,%%v20,32\n\t"
54-
"verllg %%v29,%%v21,32\n\t"
55-
"verllg %%v30,%%v22,32\n\t"
56-
"verllg %%v31,%%v23,32\n\t"
57-
"vfmsb %%v16,%%v16,%%v0\n\t"
58-
"vfmsb %%v17,%%v17,%%v0\n\t"
59-
"vfmsb %%v18,%%v18,%%v0\n\t"
60-
"vfmsb %%v19,%%v19,%%v0\n\t"
61-
"vfmsb %%v20,%%v20,%%v0\n\t"
62-
"vfmsb %%v21,%%v21,%%v0\n\t"
63-
"vfmsb %%v22,%%v22,%%v0\n\t"
64-
"vfmsb %%v23,%%v23,%%v0\n\t"
65-
"vfmasb %%v16,%%v24,%%v1,%%v16\n\t"
66-
"vfmasb %%v17,%%v25,%%v1,%%v17\n\t"
67-
"vfmasb %%v18,%%v26,%%v1,%%v18\n\t"
68-
"vfmasb %%v19,%%v27,%%v1,%%v19\n\t"
69-
"vfmasb %%v20,%%v28,%%v1,%%v20\n\t"
70-
"vfmasb %%v21,%%v29,%%v1,%%v21\n\t"
71-
"vfmasb %%v22,%%v30,%%v1,%%v22\n\t"
72-
"vfmasb %%v23,%%v31,%%v1,%%v23\n\t"
73-
"vst %%v16,0(%%r1,%[x])\n\t"
74-
"vst %%v17,16(%%r1,%[x])\n\t"
75-
"vst %%v18,32(%%r1,%[x])\n\t"
76-
"vst %%v19,48(%%r1,%[x])\n\t"
77-
"vst %%v20,64(%%r1,%[x])\n\t"
78-
"vst %%v21,80(%%r1,%[x])\n\t"
79-
"vst %%v22,96(%%r1,%[x])\n\t"
80-
"vst %%v23,112(%%r1,%[x])\n\t"
81-
"agfi %%r1,128\n\t"
82-
"brctg %[n],0b"
83-
: "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n)
84-
: [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),
85-
[alpha] "a"(alpha)
86-
: "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21",
87-
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
88-
"v31");
44+
/*
 * Scale a contiguous run of complex numbers in place by the complex factor
 * (da_r + i*da_i), using compiler vector extensions instead of inline
 * assembly.
 *
 * x holds interleaved (real, imaginary) pairs. Each 4-lane vector_float
 * covers two complex elements, so the loop below runs n/2 times to process
 * n complex elements.
 *
 * n    - number of complex elements to scale (presumably a multiple of 16,
 *        as the kernel name suggests -- TODO confirm against the caller).
 * da_r - real part of the scale factor.
 * da_i - imaginary part of the scale factor.
 * x    - data to scale; read and overwritten in place.
 */
static void cscal_kernel_16(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) {
45+
/* da_r replicated into every lane. */
vector_float da_r_vec = vec_splats(da_r);
46+
/* da_i with alternating sign: negated in the real lanes (0, 2), unchanged
 * in the imaginary lanes (1, 3). */
vector_float da_i_vec = { -da_i, da_i, -da_i, da_i };
47+
48+
/* Vector-typed view of x, used for the in-place stores below. */
vector_float *x_vec_ptr = (vector_float *)x;
49+
50+
/* Ask the compiler to unroll the loop by a factor of 16. */
#pragma GCC unroll 16
51+
for (size_t i = 0; i < n/2; i++) {
52+
/* Load two complex elements: { re0, im0, re1, im1 }. */
vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS);
53+
/* Same elements with real and imaginary parts exchanged:
 * { im0, re0, im1, re1 }. */
vector_float x_swapped = {x_vec[1], x_vec[0], x_vec[3], x_vec[2]};
54+
55+
/*
 * Complex multiply, lane by lane:
 *   real lanes: re * da_r - im * da_i
 *   imag lanes: im * da_r + re * da_i
 * The fp-contract pragmas at the top of this file are there so that these
 * multiplies and the add are not contracted into fused multiply-adds.
 */
x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec;
56+
}
8957
}
9058

9159
static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) {
@@ -199,14 +167,12 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) {
199167
: "cc", "r1", "v0");
200168
}
201169

202-
static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x,
170+
static void cscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x,
203171
BLASLONG inc_x) {
204172
BLASLONG i;
205173
BLASLONG inc_x2 = 2 * inc_x;
206174
BLASLONG inc_x3 = inc_x2 + inc_x;
207175
FLOAT t0, t1, t2, t3;
208-
FLOAT da_r = alpha[0];
209-
FLOAT da_i = alpha[1];
210176

211177
for (i = 0; i < n; i += 4) {
212178
t0 = da_r * x[0] - da_i * x[1];
@@ -324,9 +290,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
324290

325291
BLASLONG n1 = n & -8;
326292
if (n1 > 0) {
327-
alpha[0] = da_r;
328-
alpha[1] = da_i;
329-
cscal_kernel_inc_8(n1, alpha, x, inc_x);
293+
cscal_kernel_inc_8(n1, da_r, da_i, x, inc_x);
330294
j = n1;
331295
i = n1 * inc_x;
332296
}
@@ -362,7 +326,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
362326
else if (da_i == 0)
363327
cscal_kernel_16_zero_i(n1, alpha, x);
364328
else
365-
cscal_kernel_16(n1, alpha, x);
329+
cscal_kernel_16(n1, da_r, da_i, x);
366330

367331
i = n1 << 1;
368332
j = n1;

kernel/zarch/gemm_vec.c

Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@
3030
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3131
*/
3232
#include "common.h"
33-
#include <vecintrin.h>
33+
#include "vector-common.h"
3434

3535
#include <stdbool.h>
3636
#include <stdio.h>
3737
#include <stdlib.h>
3838

39+
3940
#ifdef COMPLEX
4041
#error "Handling for complex numbers is not supported in this kernel"
4142
#endif
@@ -153,37 +154,6 @@ static const bool backwards = false;
153154
* 3, May 2008.
154155
*/
155156

156-
#define VLEN_BYTES 16
157-
#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
158-
159-
typedef FLOAT vector_float __attribute__ ((vector_size (16)));
160-
161-
/**
162-
* Load a vector into register, and hint on 8-byte alignment to improve
163-
* performance. gcc-9 and newer will create these hints by itself. For older
164-
* compiler versions, use inline assembly to explicitly express the hint.
165-
* Provide explicit hex encoding to cater for binutils versions that do not know
166-
* about vector-load with alignment hints yet.
167-
*
168-
* Note that, for block sizes where we apply vectorization, vectors in A will
169-
* always be 8-byte aligned.
170-
*/
171-
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
172-
vector_float const *restrict addr = (vector_float const *restrict)a;
173-
vector_float y;
174-
175-
#if __GNUC__ < 9 && !defined(__clang__)
176-
// hex-encode vl %[out],%[addr],3
177-
asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
178-
: [ out ] "=v"(y)
179-
: [ addr ] "R"(*addr));
180-
#else
181-
y = *addr;
182-
#endif
183-
184-
return y;
185-
}
186-
187157
/**
188158
* Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
189159
*

kernel/zarch/vector-common.h

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* Copyright (c) IBM Corporation 2020.
3+
* All rights reserved.
4+
*
5+
* Redistribution and use in source and binary forms, with or without
6+
* modification, are permitted provided that the following conditions are
7+
* met:
8+
*
9+
* 1. Redistributions of source code must retain the above copyright
10+
* notice, this list of conditions and the following disclaimer.
11+
*
12+
* 2. Redistributions in binary form must reproduce the above copyright
13+
* notice, this list of conditions and the following disclaimer in
14+
* the documentation and/or other materials provided with the
15+
* distribution.
16+
* 3. Neither the name of the OpenBLAS project nor the names of
17+
* its contributors may be used to endorse or promote products
18+
* derived from this software without specific prior written
19+
* permission.
20+
*
21+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24+
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30+
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31+
*/
32+
33+
#include <vecintrin.h>
34+
35+
#define VLEN_BYTES 16
36+
#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
37+
38+
typedef FLOAT vector_float __attribute__ ((vector_size (VLEN_BYTES)));
39+
40+
/**
41+
* Load a vector into register, and hint on 8-byte alignment to improve
42+
* performance. gcc-9 and newer will create these hints by themselves. For older
43+
* compiler versions, use inline assembly to explicitly express the hint.
44+
* Provide explicit hex encoding to cater for binutils versions that do not know
45+
* about vector-load with alignment hints yet.
46+
*
47+
* Note that, for block sizes where we apply vectorization, vectors in A will
48+
* always be 8-byte aligned.
49+
*
* NOTE(review): the remark about "vectors in A" refers to the GEMM kernel this
* helper was moved out of. Other users of this header (e.g. the cscal kernel,
* which passes its x array) should confirm that their data satisfies the
* 8-byte alignment hint.
*
* @param a  address of the first of VLEN_FLOATS consecutive FLOAT values
*           (VLEN_BYTES bytes in total) to load.
* @return   the loaded values as a vector_float.
*/
50+
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
51+
/* Reinterpret the scalar pointer as a pointer to one whole vector. */
vector_float const *restrict addr = (vector_float const *restrict)a;
52+
vector_float y;
53+
54+
/* Only GCC older than version 9 (and not clang) takes the hand-encoded
 * path; everything else uses the plain dereference below. */
#if __GNUC__ < 9 && !defined(__clang__)
55+
// hex-encode vl %[out],%[addr],3
56+
asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
57+
: [ out ] "=v"(y)
58+
: [ addr ] "R"(*addr));
59+
#else
60+
/* Plain vector load; per the header comment, gcc-9 and newer emit the
 * alignment hint on their own. */
y = *addr;
61+
#endif
62+
63+
return y;
64+
}

0 commit comments

Comments
 (0)