Skip to content

Commit b1b743f

Browse files
authored
Merge branch 'develop' into interim033
2 parents 2caa221 + 52d3f7a commit b1b743f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

51 files changed

+1130
-57
lines changed

CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 1.dev)
9+
set(OpenBLAS_PATCH_VERSION 3.dev)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions
@@ -150,6 +150,7 @@ endif()
150150

151151
# add objects to the openblas lib
152152
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
153+
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
153154

154155
# Android needs to explicitly link against libm
155156
if(ANDROID)
@@ -169,6 +170,7 @@ endif()
169170
# Set output for libopenblas
170171
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
171172
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
173+
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
172174

173175
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
174176
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )

Changelog.txt

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,115 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.2
4+
30-Jul-2018
5+
6+
common:
7+
* fixes for regressions caused by the rewrite of the thread
8+
initialization code in 0.3.1
9+
10+
POWER:
11+
* fixed cpu autodetection for the BSDs
12+
13+
MIPS64:
14+
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
15+
16+
x86_64:
17+
* added autodetection of AMD Ryzen 2
18+
* fixed build with older versions of MSVC
19+
20+
====================================================================
21+
Version 0.3.1
22+
01-Jul-2018
23+
24+
common:
25+
* rewritten thread initialization code with significantly reduced overhead
26+
* added CBLAS interfaces to the IxAMIN BLAS extension functions
27+
* fixed the lapack-test target
28+
* CMAKE builds now create an OpenBLASConfig.cmake file
29+
* ZAXPY now uses a single thread for small input sizes
30+
* the LAPACK code was updated from Reference-LAPACK/lapack#253
31+
(fixing LAPACKE interfaces to Aasen's functions)
32+
33+
POWER:
34+
* corrected CROT and ZROT behaviour with zero INC_X
35+
36+
ARMV7:
37+
* corrected xDOT behaviour with zero INC_X or INC_Y
38+
39+
x86_64:
40+
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
41+
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
42+
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
43+
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows you to
44+
specify the list of x86_64 targets to include. Any target not on the list will be supported
45+
by the Sandybridge or Nehalem kernels if available, or by Prescott.
46+
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
47+
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
48+
* added autodetection of Intel Cannon Lake series as Skylake X
49+
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
50+
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
51+
recent mingw from MSYS2
52+
* fixed a link error in mixed clang/gfortran builds with OpenMP
53+
* updated the OSX deployment target to 10.8
54+
* switched on parallel make for builds on MS Windows by default
55+
56+
x86:
57+
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
58+
59+
====================================================================
60+
Version 0.3.0
61+
23-May-2018
62+
63+
common:
64+
* fixed some more thread race and locking bugs
65+
* added preliminary support for calling an OpenMP build of the library from multiple threads
66+
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
67+
* general code cleanup
68+
* optimized DSDOT implementation
69+
* improved thread distribution for GEMM
70+
* corrected IMATCOPY/OMATCOPY implementation
71+
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
72+
* cmake build improvements
73+
* pkgconfig file now contains build options
74+
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
75+
* corrections and improvements for systems with more than 64 cpus
76+
* LAPACK code updated to 3.8.0 including later fixes
77+
* added ReLAPACK, a recursive implementation of several LAPACK functions
78+
* Rewrote ROTMG to handle cases that the netlib code failed to address
79+
* Disabled (broken) multithreading code for xTRMV
80+
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
81+
* shared memory access failures on startup are now handled more gracefully
82+
* restored utests from earlier releases (and made them pass on all affected systems)
83+
84+
SPARC:
85+
* several fixes for cpu autodetection
86+
87+
POWER:
88+
* corrected vector register overwriting in several Power8 kernels
89+
* optimized additional BLAS functions
90+
91+
ARM:
92+
* added support for CortexA53 and A72
93+
* added autodetection for ThunderX2T99
94+
* made most optimized kernels the default for generic ARMv8 targets
95+
96+
x86_64:
97+
* parallelized DDOT kernel for Haswell
98+
* changed alignment directives in assembly kernels to boost performance on OSX
99+
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
100+
* added support for building on OpenBSD and Dragonfly
101+
* updated compiler options to work with Intel release 2018
102+
* support fully optimized build with clang/flang on Microsoft Windows
103+
* fixed building on AIX
104+
105+
IBM Z:
106+
* added optimized BLAS 1/2 functions
107+
108+
MIPS:
109+
* fixed cpu autodetection helper code
110+
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
111+
* added mips64 I6500 cpu
112+
2113
====================================================================
3114
Version 0.2.20
4115
24-Jul-2017

Makefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ endif
9797

9898
shared :
9999
ifndef NO_SHARED
100-
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
100+
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
101101
@$(MAKE) -C exports so
102102
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
103103
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
267267
ifdef SMP
268268
ifeq ($(OSNAME), WINNT)
269269
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
270+
else ifeq ($(OSNAME), Haiku)
271+
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
270272
else
271273
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
272274
endif

Makefile.install

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ endif
6666
#for install shared library
6767
ifndef NO_SHARED
6868
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
69-
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
69+
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
7070
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
7171
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
7272
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \

Makefile.x86_64

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
1212
ifndef NO_AVX512
1313
CCOMMON_OPT += -march=skylake-avx512
1414
FCOMMON_OPT += -march=skylake-avx512
15+
ifeq ($(OSNAME), CYGWIN_NT)
16+
CCOMMON_OPT += -fno-asynchronous-unwind-tables
17+
endif
1518
endif
1619
endif
1720

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
110110
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
111111
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
112112
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
113+
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
113114
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
114115
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
115116
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
200201
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
201202
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
202203
Clang 3.0 will generate the wrong AVX binary code.
204+
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
203205
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
204206
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
205207
the library with `BIGNUMA=1`.

benchmark/gemv.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
122122

123123
FLOAT *a, *x, *y;
124124
FLOAT alpha[] = {1.0, 1.0};
125-
FLOAT beta [] = {1.0, 1.0};
125+
FLOAT beta [] = {1.0, 0.0};
126126
char trans='N';
127127
blasint m, i, j;
128128
blasint inc_x=1,inc_y=1;

c_check

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
6464
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
6565
$os = Interix if ($data =~ /OS_INTERIX/);
6666
$os = Android if ($data =~ /OS_ANDROID/);
67+
$os = Haiku if ($data =~ /OS_HAIKU/);
6768

6869
$architecture = x86 if ($data =~ /ARCH_X86/);
6970
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);

cmake/prebuild.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
8585
endif ()
8686

8787
# Cannot run getarch on target if we are cross-compiling
88-
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
88+
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
8989
# Write to config as getarch would
9090

9191
# TODO: Set up defines that getarch sets up based on every other target

cmake/system_check.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ endif()
6868

6969
if (X86_64 OR X86)
7070
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
71-
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
71+
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
7272
if (NO_AVX512 EQUAL 1)
7373
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
7474
endif()

0 commit comments

Comments
 (0)