Skip to content

Commit 6e3a05f

Browse files
authored
Merge pull request #2943 from xianyi/develop
Merge from develop for 0.3.12 release
2 parents 51c2261 + 89db735 commit 6e3a05f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+540
-269
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 11)
9+
set(OpenBLAS_PATCH_VERSION 12)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions

Changelog.txt

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,36 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.12
4+
24-Oct-2020
5+
6+
common:
7+
* Fixed missing LAPACK functions (inadvertently dropped during
8+
the build system restructuring)
9+
* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
10+
11+
POWER:
12+
* Added optimized SCOPY/CCOPY kernels for POWER10
13+
* Increased and unified the default size of the GEMM BUFFER
14+
* Fixed building for POWER10 in DYNAMIC_ARCH mode
15+
* POWER10 compatibility test now checks binutils version as well
16+
* Cleaned up compiler warnings
17+
18+
x86_64:
19+
* corrected compiler version checks for AVX2 compatibility
20+
* added compiler option -mavx2 for building with flang
21+
* fixed direct SGEMM pathway for small matrix sizes (broken by
22+
the code refactoring in 0.3.11)
23+
* fixed unhandled partial register clobbers in several kernels
24+
for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
25+
26+
ARMV8:
27+
* improved Apple Vortex support to include cross-compiling
28+
229
====================================================================
330
Version 0.3.11
431
17-Oct-2020
532

6-
common:
33+
common:
734
* API change:
835
the newly added BFLOAT16 functions were renamed to use the
936
letter "B" instead of "H" to avoid potential confusion with
@@ -28,7 +55,7 @@ Version 0.3.11
2855
* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
2956
enabling these options
3057
* Fixed detection of gfortran when invoked through an mpi wrapper
31-
* Improve thread reinitialization performance with OpenMP xafter a fork
58+
* Improve thread reinitialization performance with OpenMP after a fork
3259
* Added support for building only the subset of the library required
3360
for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
3461
* Optional function name prefixes and suffixes are now correctly
@@ -66,7 +93,6 @@ ARMV8:
6693
* Fixed cpu detection on BSD-like systems
6794
* Fixed compilation in -std=C18 mode
6895

69-
7096
IBM Z:
7197
* Added support for compiling with the clang compiler
7298
* Improved GEMM performance on Z14

Makefile.power

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ USE_OPENMP = 1
1010
endif
1111

1212
ifeq ($(CORE), POWER10)
13-
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
13+
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
1414
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
1515
endif
1616

Makefile.rule

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.11
6+
VERSION = 0.3.12
77

88
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
99
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -295,10 +295,13 @@ COMMON_PROF = -pg
295295

296296

297297

298-
# the below is not yet configurable, use cmake if you need to build only select types
299-
BUILD_SINGLE = 1
300-
BUILD_DOUBLE = 1
301-
BUILD_COMPLEX = 1
302-
BUILD_COMPLEX16 = 1
298+
# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
299+
# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
300+
# the functions for complex numbers, uncomment the desired type(s) below
301+
# BUILD_SINGLE = 1
302+
# BUILD_DOUBLE = 1
303+
# BUILD_COMPLEX = 1
304+
# BUILD_COMPLEX16 = 1
305+
#
303306
# End of user configuration
304307
#

Makefile.system

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -641,18 +641,22 @@ DYNAMIC_CORE += POWER8
641641
ifneq ($(C_COMPILER), GCC)
642642
DYNAMIC_CORE += POWER9
643643
DYNAMIC_CORE += POWER10
644+
CCOMMON_OPT += -DHAVE_P10_SUPPORT
644645
endif
645646
ifeq ($(C_COMPILER), GCC)
646647
ifeq ($(GCCVERSIONGT5), 1)
647648
DYNAMIC_CORE += POWER9
648649
else
649650
# Explain why POWER9 is not added to DYNAMIC_CORE. There must be no comma after
# "info": with the comma, make does not recognize the function and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
650651
endif
651-
ifeq ($(GCCVERSIONGTEQ11), 1)
652+
# Compare the second dot-separated field of the first line of `ld --version`
# (with any "-suffix" stripped) against 35. The ">" is backslash-escaped so the
# shell hands ">=" to expr instead of redirecting output into a file named "=".
LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
653+
ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
652654
DYNAMIC_CORE += POWER10
655+
CCOMMON_OPT += -DHAVE_P10_SUPPORT
653656
else ifeq ($(GCCVERSIONGTEQ10), 1)
654-
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
657+
ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
655658
DYNAMIC_CORE += POWER10
659+
CCOMMON_OPT += -DHAVE_P10_SUPPORT
656660
endif
657661
else
658662
# Explain why POWER10 is not added to DYNAMIC_CORE. There must be no comma after
# "info": with the comma, make does not recognize the function and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)

Makefile.x86_64

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,10 @@ ifndef NO_AVX2
7474
ifeq ($(C_COMPILER), GCC)
7575
# AVX2 support was added in 4.7.0
7676
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
77+
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
7778
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
78-
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
79+
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
80+
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
7981
CCOMMON_OPT += -mavx2
8082
endif
8183
else
@@ -86,8 +88,14 @@ endif
8688
ifeq ($(F_COMPILER), GFORTRAN)
8789
# AVX2 support was added in 4.7.0
8890
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
91+
GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
8992
GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
90-
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
93+
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
94+
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
95+
FCOMMON_OPT += -mavx2
96+
endif
97+
else
98+
ifeq ($(F_COMPILER), FLANG)
9199
FCOMMON_OPT += -mavx2
92100
endif
93101
endif

cmake/arch.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ if (DYNAMIC_ARCH)
4949

5050
if (POWER)
5151
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
52+
set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
5253
endif ()
5354

5455
if (X86)

cmake/prebuild.cmake

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,29 @@ endif ()
416416
set(ZGEMM_UNROLL_M 4)
417417
set(ZGEMM_UNROLL_N 4)
418418
set(SYMV_P 16)
419+
elseif ("${TCORE}" STREQUAL "VORTEX")
420+
file(APPEND ${TARGET_CONF_TEMP}
421+
"#define ARMV8\n"
422+
"#define L1_CODE_SIZE\t32768\n"
423+
"#define L1_CODE_LINESIZE\t64\n"
424+
"#define L1_CODE_ASSOCIATIVE\t4\n"
425+
"#define L1_DATA_SIZE\t32768\n"
426+
"#define L1_DATA_LINESIZE\t64\n"
427+
"#define L1_DATA_ASSOCIATIVE\t4\n"
428+
"#define L2_SIZE\t5262144\n"
429+
"#define L2_LINESIZE\t64\n"
430+
"#define L2_ASSOCIATIVE\t8\n"
431+
"#define DTB_DEFAULT_ENTRIES\t64\n"
432+
"#define DTB_SIZE\t4096\n")
433+
set(SGEMM_UNROLL_M 16)
434+
set(SGEMM_UNROLL_N 4)
435+
set(DGEMM_UNROLL_M 8)
436+
set(DGEMM_UNROLL_N 4)
437+
set(CGEMM_UNROLL_M 8)
438+
set(CGEMM_UNROLL_N 4)
439+
set(ZGEMM_UNROLL_M 4)
440+
set(ZGEMM_UNROLL_N 4)
441+
set(SYMV_P 16)
419442
elseif ("${TCORE}" STREQUAL "POWER6")
420443
file(APPEND ${TARGET_CONF_TEMP}
421444
"#define L1_DATA_SIZE 32768\n"

common_power.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -844,8 +844,8 @@ Lmcount$lazy_ptr:
844844
#define BUFFER_SIZE ( 2 << 20)
845845
#elif defined(PPC440FP2)
846846
#define BUFFER_SIZE ( 16 << 20)
847-
#elif defined(POWER8) || defined(POWER9) || defined(POWER10)
848-
#define BUFFER_SIZE ( 64 << 20)
847+
#elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
848+
#define BUFFER_SIZE ( 64 << 22)
849849
#else
850850
#define BUFFER_SIZE ( 16 << 20)
851851
#endif

cpuid_arm64.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@ void get_cpuconfig(void)
424424
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
425425
printf("#define L1_DATA_SIZE %d \n",value);
426426
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
427-
printf("#define L2_DATA_SIZE %d \n",value);
427+
printf("#define L2_SIZE %d \n",value);
428428
break;
429429
#endif
430430
}

0 commit comments

Comments
 (0)