Skip to content

Commit fbda20c

Browse files
authored
Merge pull request #94 from xianyi/develop
rebase
2 parents cccd143 + e1b7123 commit fbda20c

File tree

15 files changed

+217
-46
lines changed

15 files changed

+217
-46
lines changed

Makefile.rule

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,22 @@ COMMON_PROF = -pg
279279

280280
# If you want to enable the experimental BFLOAT16 support
281281
# BUILD_HALF = 1
282-
#
282+
283+
284+
# Set the thread number threshold beyond which the job array for the threaded level3 BLAS
285+
# will be allocated on the heap rather than the stack. (This array alone requires
286+
# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu
287+
# counts, but obviously it is not the only item that ends up on the stack.
288+
# The default value of 32 ensures that the overall requirement is compatible
289+
# with the default 1MB stacksize imposed by having the Java VM loaded without use
290+
# of its -Xss parameter.
291+
# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible
292+
# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java
293+
# VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code
294+
# BLAS3_MEM_ALLOC_THRESHOLD = 160
295+
296+
297+
283298
# the below is not yet configurable, use cmake if you need to build only select types
284299
BUILD_SINGLE = 1
285300
BUILD_DOUBLE = 1

Makefile.x86_64

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ endif
88
endif
99
endif
1010

11+
ifdef HAVE_SSE3
12+
CCOMMON_OPT += -msse3
13+
FCOMMON_OPT += -msse3
14+
endif
15+
1116
ifeq ($(CORE), SKYLAKEX)
1217
ifndef DYNAMIC_ARCH
1318
ifndef NO_AVX512

cmake/system.cmake

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ if (DEFINED TARGET)
7070
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
7171
endif()
7272
endif()
73+
if (DEFINED HAVE_SSE3)
74+
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
75+
endif()
7376
endif()
7477

7578
if (DEFINED TARGET)
@@ -323,7 +326,13 @@ else ()
323326
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048")
324327
endif ()
325328
endif ()
326-
329+
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
330+
if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD)
331+
if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32)
332+
set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}")
333+
endif()
334+
endif()
335+
endif()
327336
if (DEFINED LIBNAMESUFFIX)
328337
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
329338
else ()

common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -402,7 +402,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
402402
#endif
403403

404404
#ifndef BLAS3_MEM_ALLOC_THRESHOLD
405-
#define BLAS3_MEM_ALLOC_THRESHOLD 160
405+
#define BLAS3_MEM_ALLOC_THRESHOLD 32
406406
#endif
407407

408408
#ifdef QUAD_PRECISION

kernel/Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ endif
55
TOPDIR = ..
66
include $(TOPDIR)/Makefile.system
77

8+
ifdef HAVE_SSE3
9+
CFLAGS += -msse3
10+
endif
811

912
ifeq ($(C_COMPILER), GCC)
1013
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)

kernel/arm64/KERNEL.ARMV8

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S
9797
ZNRM2KERNEL = znrm2.S
9898

9999
DDOTKERNEL = dot.S
100-
SDOTKERNEL = dot.S
100+
SDOTKERNEL = ../generic/dot.c
101101
CDOTKERNEL = zdot.S
102102
ZDOTKERNEL = zdot.S
103103
DSDOTKERNEL = dot.S

kernel/arm64/KERNEL.CORTEXA53

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S
9797
ZNRM2KERNEL = znrm2.S
9898

9999
DDOTKERNEL = dot.S
100-
SDOTKERNEL = dot.S
100+
SDOTKERNEL = ../generic/dot.c
101101
CDOTKERNEL = zdot.S
102102
ZDOTKERNEL = zdot.S
103103
DSDOTKERNEL = dot.S

kernel/arm64/KERNEL.CORTEXA57

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ DCOPYKERNEL = copy.S
7070
CCOPYKERNEL = copy.S
7171
ZCOPYKERNEL = copy.S
7272

73-
SDOTKERNEL = dot.S
73+
SDOTKERNEL = ../generic/dot.c
7474
DDOTKERNEL = dot.S
7575
CDOTKERNEL = zdot.S
7676
ZDOTKERNEL = zdot.S

kernel/generic/dot.c

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727

2828

2929
#include "common.h"
30-
30+
#include "../simd/intrin.h"
3131
#if defined(DSDOT)
3232
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
3333
#else
@@ -47,27 +47,59 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
4747

4848
if ( (inc_x == 1) && (inc_y == 1) )
4949
{
50-
51-
int n1 = n & -4;
52-
53-
while(i < n1)
50+
int n1 = n & -4;
51+
#if V_SIMD && !defined(DSDOT)
52+
const int vstep = v_nlanes_f32;
53+
const int unrollx4 = n & (-vstep * 4);
54+
const int unrollx = n & -vstep;
55+
v_f32 vsum0 = v_zero_f32();
56+
v_f32 vsum1 = v_zero_f32();
57+
v_f32 vsum2 = v_zero_f32();
58+
v_f32 vsum3 = v_zero_f32();
59+
while(i < unrollx4)
60+
{
61+
vsum0 = v_muladd_f32(
62+
v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0
63+
);
64+
vsum1 = v_muladd_f32(
65+
v_loadu_f32(x + i + vstep), v_loadu_f32(y + i + vstep), vsum1
66+
);
67+
vsum2 = v_muladd_f32(
68+
v_loadu_f32(x + i + vstep*2), v_loadu_f32(y + i + vstep*2), vsum2
69+
);
70+
vsum3 = v_muladd_f32(
71+
v_loadu_f32(x + i + vstep*3), v_loadu_f32(y + i + vstep*3), vsum3
72+
);
73+
i += vstep*4;
74+
}
75+
vsum0 = v_add_f32(
76+
v_add_f32(vsum0, vsum1), v_add_f32(vsum2 , vsum3)
77+
);
78+
while(i < unrollx)
79+
{
80+
vsum0 = v_muladd_f32(
81+
v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0
82+
);
83+
i += vstep;
84+
}
85+
dot = v_sum_f32(vsum0);
86+
#elif defined(DSDOT)
87+
for (; i < n1; i += 4)
5488
{
55-
56-
#if defined(DSDOT)
5789
dot += (double) y[i] * (double) x[i]
5890
+ (double) y[i+1] * (double) x[i+1]
5991
+ (double) y[i+2] * (double) x[i+2]
6092
+ (double) y[i+3] * (double) x[i+3] ;
93+
}
6194
#else
95+
for (; i < n1; i += 4)
96+
{
6297
dot += y[i] * x[i]
6398
+ y[i+1] * x[i+1]
6499
+ y[i+2] * x[i+2]
65100
+ y[i+3] * x[i+3] ;
66-
#endif
67-
i+=4 ;
68-
69101
}
70-
102+
#endif
71103
while(i < n)
72104
{
73105

kernel/simd/intrin.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ extern "C" {
5151
#include <immintrin.h>
5252
#endif
5353

54+
/** NEON **/
55+
#ifdef HAVE_NEON
56+
#include <arm_neon.h>
57+
#endif
58+
5459
// distribute
5560
#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16)
5661
#include "intrin_avx512.h"
@@ -60,6 +65,10 @@ extern "C" {
6065
#include "intrin_sse.h"
6166
#endif
6267

68+
#ifdef HAVE_NEON
69+
#include "intrin_neon.h"
70+
#endif
71+
6372
#ifndef V_SIMD
6473
#define V_SIMD 0
6574
#define V_SIMD_F64 0

0 commit comments

Comments
 (0)