Skip to content

Commit 913cc9a

Browse files
committed
Merge branch 'develop' into risc-v
2 parents d7ba767 + ff16329 commit 913cc9a

File tree

162 files changed

+8794
-4744
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

162 files changed

+8794
-4744
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 10.dev)
9+
set(OpenBLAS_PATCH_VERSION 12.dev)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions

Changelog.txt

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,102 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.12
4+
24-Oct-2020
5+
6+
common:
7+
* Fixed missing BLAS/LAPACK functions (inadvertently dropped during
8+
the build system restructuring)
9+
* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
10+
11+
POWER:
12+
* Added optimized SCOPY/CCOPY kernels for POWER10
13+
* Increased and unified the default size of the GEMM BUFFER
14+
* Fixed building for POWER10 in DYNAMIC_ARCH mode
15+
* POWER10 compatibility test now checks binutils version as well
16+
* Cleaned up compiler warnings
17+
18+
x86_64:
19+
* corrected compiler version checks for AVX2 compatibility
20+
* added compiler option -mavx2 for building with flang
21+
* fixed direct SGEMM pathway for small matrix sizes (broken by
22+
the code refactoring in 0.3.11)
23+
* fixed unhandled partial register clobbers in several kernels
24+
for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
25+
26+
ARMV8:
27+
* improved Apple Vortex support to include cross-compiling
28+
29+
====================================================================
30+
Version 0.3.11
31+
17-Oct-2020
32+
33+
common:
34+
* API change:
35+
the newly added BFLOAT16 functions were renamed to use the
36+
letter "B" instead of "H" to avoid potential confusion with
37+
the IEEE "half precision float" type, i.e. the 0.3.10
38+
SHGEMM is now SBGEMM and the corresponding build option
39+
was changed from "BUILD_HALF" to "BUILD_BFLOAT16".
40+
* Reduced the default BLAS3_MEM_ALLOC_THRESHOLD (used as an upper
41+
limit for placing temporary arrays on the stack) to be compatible
42+
with a stack size of 1 MB (as imposed by the Java runtime library)
43+
* Added mixed-precision dot function SBDOT and utility functions
44+
sbstobf16, sbdtobf16, sbf16tos and dbf16tod to convert between
45+
single or double precision float arrays and bfloat16 arrays
46+
* Fixed prototypes of LAPACK_?ggsvp and LAPACK_?ggsvd functions
47+
in lapack.h
48+
* Fixed underflow and rounding errors in LAPACK SLANV2 and DLANV2
49+
(causing miscalculations in e.g. SHSEQR/DHSEQR, LAPACK issue #263)
50+
* Fixed workspace calculation in LAPACK ?GELQ (LAPACK issue #415)
51+
* Fixed several bugs in the LAPACK testsuite
52+
* Improved performance of TRMM and TRSM for certain problem sizes
53+
* Fixed infinite recursions and workspace miscalculations in ReLAPACK
54+
* CMAKE builds no longer require pkg-config for creating the .pc file
55+
* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
56+
enabling these options
57+
* Fixed detection of gfortran when invoked through an mpi wrapper
58+
* Improved thread reinitialization performance with OpenMP after a fork
59+
* Added support for building only the subset of the library required
60+
for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
61+
* Optional function name prefixes and suffixes are now correctly
62+
reflected in the generated cblas.h
63+
* Added CMAKE build support for the LAPACK and multithreading tests
64+
65+
POWER:
66+
* Added optimized support for POWER10
67+
* Added support for compiling for POWER8 in 32bit mode
68+
* Added support for compilation with LLVM/clang
69+
* Added support for compilation with NVIDIA/PGI compilers
70+
* Fixed building on big-endian POWER8
71+
* Fixed miscompilation of ZDOTC by gcc10
72+
* Fixed alignment errors in the POWER8 SAXPY kernel
73+
* Improved CPU detection on AIX
74+
* Supported building with older compilers on POWER9
75+
76+
x86_64:
77+
* Added support for Intel Cooperlake
78+
* Added autodetection of AMD Renoir/Matisse/Zen3 cpus
79+
* Added autodetection of Intel Comet Lake cpus
80+
* Reimplemented ?sum, ?dot and daxpy using universal intrinsics
81+
* Reset the fpu state before using the fpu on Windows as a workaround
82+
for a problem introduced in Windows 10 build 19041 (a.k.a. SDK 2004)
83+
* Fixed potentially undefined behaviour in the dot and gemv_t kernels
84+
* Fixed a potential segmentation fault in DYNAMIC_ARCH builds
85+
* Fixed building for ZEN with PGI/NVIDIA and AMD AOCC compilers
86+
87+
ARMV7:
88+
* Fixed cpu detection on BSD-like systems
89+
90+
ARMV8:
91+
* Added preliminary support for Apple Vortex cpus
92+
* Added support for the Cavium ThunderX3T110 cpu
93+
* Fixed cpu detection on BSD-like systems
94+
* Fixed compilation in -std=C18 mode
95+
96+
IBM Z:
97+
* Added support for compiling with the clang compiler
98+
* Improved GEMM performance on Z14
99+
2100
====================================================================
3101
Version 0.3.10
4102
14-Jun-2020

Makefile.arm

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6)
1212
CCOMMON_OPT += -mfpu=vfp
1313
FCOMMON_OPT += -mfpu=vfp
1414
endif
15+
16+
ifdef HAVE_NEON
17+
CCOMMON_OPT += -mfpu=neon
18+
FCOMMON_OPT += -mfpu=neon
19+
endif

Makefile.power

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ USE_OPENMP = 1
1010
endif
1111

1212
ifeq ($(CORE), POWER10)
13-
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
13+
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
1414
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
1515
endif
1616

Makefile.rule

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.10.dev
6+
VERSION = 0.3.12.dev
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -295,10 +295,13 @@ COMMON_PROF = -pg
295295

296296

297297

298-
# the below is not yet configurable, use cmake if you need to build only select types
299-
BUILD_SINGLE = 1
300-
BUILD_DOUBLE = 1
301-
BUILD_COMPLEX = 1
302-
BUILD_COMPLEX16 = 1
298+
# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
299+
# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
300+
# the functions for complex numbers, uncomment the desired type(s) below
301+
# BUILD_SINGLE = 1
302+
# BUILD_DOUBLE = 1
303+
# BUILD_COMPLEX = 1
304+
# BUILD_COMPLEX16 = 1
305+
#
303306
# End of user configuration
304307
#

Makefile.system

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
INCLUDED = 1
77

88
ifndef TOPDIR
9-
TOPDIR = .
9+
TOPDIR = .
1010
endif
1111

1212
# If ARCH is not set, we use the host system's architecture for getarch compile options.
@@ -93,6 +93,12 @@ endif
9393
ifdef TARGET
9494
GETARCH_FLAGS := -DFORCE_$(TARGET)
9595
GETARCH_FLAGS += -DUSER_TARGET
96+
ifeq ($(TARGET), GENERIC)
97+
ifeq ($(DYNAMIC_ARCH), 1)
98+
override NO_EXPRECISION=1
99+
export NO_EXPRECISION
100+
endif
101+
endif
96102
endif
97103

98104
# Force fallbacks for 32bit
@@ -246,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
246252
ifndef TARGET_CORE
247253
include $(TOPDIR)/Makefile.conf
248254
else
255+
HAVE_NEON=
256+
HAVE_VFP=
257+
HAVE_VFPV3=
258+
HAVE_VFPV4=
259+
HAVE_MMX=
260+
HAVE_SSE=
261+
HAVE_SSE2=
262+
HAVE_SSE3=
263+
HAVE_SSSE3=
264+
HAVE_SSE4_1=
265+
HAVE_SSE4_2=
266+
HAVE_SSE4A=
267+
HAVE_SSE5=
268+
HAVE_AVX=
269+
HAVE_AVX2=
270+
HAVE_FMA3=
249271
include $(TOPDIR)/Makefile_kernel.conf
250272
endif
251273

@@ -319,6 +341,7 @@ ifeq ($(GCCVERSIONGTEQ7),1)
319341
else
320342
GCCDUMPVERSION_PARAM := -dumpversion
321343
endif
344+
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
322345
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
323346
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
324347
endif
@@ -641,18 +664,22 @@ DYNAMIC_CORE += POWER8
641664
ifneq ($(C_COMPILER), GCC)
642665
DYNAMIC_CORE += POWER9
643666
DYNAMIC_CORE += POWER10
667+
CCOMMON_OPT += -DHAVE_P10_SUPPORT
644668
endif
645669
ifeq ($(C_COMPILER), GCC)
646670
ifeq ($(GCCVERSIONGT5), 1)
647671
DYNAMIC_CORE += POWER9
648672
else
649673
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
650674
endif
651-
ifeq ($(GCCVERSIONGTEQ11), 1)
675+
# The '>' must be backslash-escaped: unescaped, the shell treats it as an output redirection instead of passing '>=' to expr
LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
676+
ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
652677
DYNAMIC_CORE += POWER10
678+
CCOMMON_OPT += -DHAVE_P10_SUPPORT
653679
else ifeq ($(GCCVERSIONGTEQ10), 1)
654-
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
680+
ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
655681
DYNAMIC_CORE += POWER10
682+
CCOMMON_OPT += -DHAVE_P10_SUPPORT
656683
endif
657684
else
658685
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
@@ -853,7 +880,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG
853880
FCOMMON_OPT += -Mrecursive -Kieee
854881
ifeq ($(OSNAME), Linux)
855882
ifeq ($(ARCH), x86_64)
856-
FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
883+
# Run the pipeline directly: wrapped in backticks, the shell would execute the compiler's version string as a command
FLANG_VENDOR := $(shell $(FC) --version|cut -f 1 -d "."|head -1)
857884
ifeq ($(FLANG_VENDOR),AOCC)
858885
FCOMMON_OPT += -fno-unroll-loops
859886
endif
@@ -1515,6 +1542,8 @@ export HAVE_SSE4_2
15151542
export HAVE_SSE4A
15161543
export HAVE_SSE5
15171544
export HAVE_AVX
1545+
export HAVE_AVX2
1546+
export HAVE_FMA3
15181547
export HAVE_VFP
15191548
export HAVE_VFPV3
15201549
export HAVE_VFPV4

Makefile.x86_64

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ endif
99
endif
1010

1111
ifdef HAVE_SSE3
12-
ifndef DYNAMIC_ARCH
1312
CCOMMON_OPT += -msse3
1413
FCOMMON_OPT += -msse3
14+
endif
1515
ifdef HAVE_SSSE3
1616
CCOMMON_OPT += -mssse3
1717
FCOMMON_OPT += -mssse3
@@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1
2020
CCOMMON_OPT += -msse4.1
2121
FCOMMON_OPT += -msse4.1
2222
endif
23+
ifdef HAVE_AVX
24+
CCOMMON_OPT += -mavx
25+
FCOMMON_OPT += -mavx
26+
endif
27+
ifdef HAVE_AVX2
28+
CCOMMON_OPT += -mavx2
29+
FCOMMON_OPT += -mavx2
2330
endif
31+
ifdef HAVE_FMA3
32+
CCOMMON_OPT += -mfma
33+
FCOMMON_OPT += -mfma
2434
endif
2535

2636
ifeq ($(CORE), SKYLAKEX)
@@ -47,8 +57,6 @@ ifndef DYNAMIC_ARCH
4757
ifndef NO_AVX512
4858
ifeq ($(C_COMPILER), GCC)
4959
# cooperlake support was added in 10.1
50-
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
51-
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
5260
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
5361
CCOMMON_OPT += -march=cooperlake
5462
FCOMMON_OPT += -march=cooperlake
@@ -68,24 +76,31 @@ endif
6876
endif
6977
endif
7078

71-
ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE))
72-
ifndef DYNAMIC_ARCH
79+
ifdef HAVE_AVX2
7380
ifndef NO_AVX2
7481
ifeq ($(C_COMPILER), GCC)
7582
# AVX2 support was added in 4.7.0
76-
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
77-
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
78-
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
83+
GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
84+
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
85+
CCOMMON_OPT += -mavx2
86+
endif
87+
else
88+
ifeq ($(C_COMPILER), CLANG)
7989
CCOMMON_OPT += -mavx2
8090
endif
8191
endif
8292
ifeq ($(F_COMPILER), GFORTRAN)
8393
# AVX2 support was added in 4.7.0
8494
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
95+
GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
8596
GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
86-
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
97+
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
98+
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
8799
FCOMMON_OPT += -mavx2
88100
endif
101+
else
102+
ifeq ($(F_COMPILER), FLANG)
103+
FCOMMON_OPT += -mavx2
89104
endif
90105
endif
91106
endif

0 commit comments

Comments
 (0)