OpenMathLib
diff --git a/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎Changelog.txt
Lines changed: 98 additions & 0 deletions b/‎Changelog.txt
Lines changed: 98 additions & 0 deletions
diff --git a/‎Makefile.arm
Lines changed: 5 additions & 0 deletions b/‎Makefile.arm
Lines changed: 5 additions & 0 deletions
diff --git a/‎Makefile.power
Lines changed: 1 addition & 1 deletion b/‎Makefile.power
Lines changed: 1 addition & 1 deletion
diff --git a/‎Makefile.rule
Lines changed: 9 additions & 6 deletions b/‎Makefile.rule
Lines changed: 9 additions & 6 deletions
diff --git a/‎Makefile.system
Lines changed: 33 additions & 4 deletions b/‎Makefile.system
Lines changed: 33 additions & 4 deletions
diff --git a/‎Makefile.x86_64
Lines changed: 24 additions & 9 deletions b/‎Makefile.x86_64
Lines changed: 24 additions & 9 deletions
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 10.dev)
+set(OpenBLAS_PATCH_VERSION 12.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions
 
@@ -1,4 +1,102 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.12
+ 24-Oct-2020
+
+common:
+	* Fixed missing BLAS/LAPACK functions (inadvertently dropped during
+	  the build system restructuring)
+	* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
+
+POWER:
+	* Added optimized SCOPY/CCOPY kernels for POWER10
+	* Increased and unified the default size of the GEMM BUFFER
+	* Fixed building for POWER10 in DYNAMIC_ARCH mode
+	* POWER10 compatibility test now checks binutils version as well
+	* Cleaned up compiler warnings
+
+x86_64:
+	* corrected compiler version checks for AVX2 compatibility
+	* added compiler option -mavx2 for building with flang
+	* fixed direct SGEMM pathway for small matrix sizes (broken by
+	  the code refactoring in 0.3.11)
+	* fixed unhandled partial register clobbers in several kernels
+	  for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
+
+ARMV8:
+	* improved Apple Vortex support to include cross-compiling
+
+====================================================================
+Version 0.3.11
+ 17-Oct-2020
+
+common:
+ 	* API change:
+	  the newly added BFLOAT16 functions were renamed to use the
+	  letter "B" instead of "H" to avoid potential confusion with
+	  the IEEE "half precision float" type, i.e. the 0.3.10
+	  SHGEMM is now SBGEMM and the corresponding build option
+	  was changed from "BUILD_HALF" to "BUILD_BFLOAT16".
+	* Reduced the default BLAS3_MEM_ALLOC_THRESHOLD (used as an upper
+	  limit for placing temporary arrays on the stack) to be compatible
+	  with a stack size of 1 MB (as imposed by the Java runtime library)
+	* Added mixed-precision dot function SBDOT and utility functions
+	  sbstobf16, sbdtobf16, sbf16tos and dbf16tod to convert between
+	  single or double precision float arrays and bfloat16 arrays
+	* Fixed prototypes of LAPACK_?ggsvp and LAPACK_?ggsvd functions
+	  in lapack.h
+	* Fixed underflow and rounding errors in LAPACK SLANV2 and DLANV2
+	  (causing miscalculations in e.g. SHSEQR/DHSEQR, LAPACK issue #263)
+	* Fixed workspace calculation in LAPACK ?GELQ (LAPACK issue #415)
+	* Fixed several bugs in the LAPACK testsuite
+	* Improved performance of TRMM and TRSM for certain problem sizes
+	* Fixed infinite recursions and workspace miscalculations in ReLAPACK
+	* CMAKE builds no longer require pkg-config for creating the .pc file
+	* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as 
+	  enabling these options
+	* Fixed detection of gfortran when invoked through an mpi wrapper
+	* Improved thread reinitialization performance with OpenMP after a fork
+	* Added support for building only the subset of the library required
+	  for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
+	* Optional function name prefixes and suffixes are now correctly
+	  reflected in the generated cblas.h
+	* Added CMAKE build support for the LAPACK and multithreading tests
+
+POWER:
+	* Added optimized support for POWER10
+	* Added support for compiling for POWER8 in 32bit mode
+	* Added support for compilation with LLVM/clang
+	* Added support for compilation with NVIDIA/PGI compilers
+	* Fixed building on big-endian POWER8
+	* Fixed miscompilation of ZDOTC by gcc10
+	* Fixed alignment errors in the POWER8 SAXPY kernel
+	* Improved CPU detection on AIX
+	* Supported building with older compilers on POWER9
+
+x86_64:
+	* Added support for Intel Cooperlake
+	* Added autodetection of AMD Renoir/Matisse/Zen3 cpus
+	* Added autodetection of Intel Comet Lake cpus
+	* Reimplemented ?sum, ?dot and daxpy using universal intrinsics
+	* Reset the fpu state before using the fpu on Windows as a workaround
+	  for a problem introduced in Windows 10 build 19041 (a.k.a. SDK 2004)
+	* Fixed potentially undefined behaviour in the dot and gemv_t kernels
+	* Fixed a potential segmentation fault in DYNAMIC_ARCH builds
+	* Fixed building for ZEN with PGI/NVIDIA and AMD AOCC compilers
+	
+ARMV7:
+	* Fixed cpu detection on BSD-like systems
+
+ARMV8:
+	* Added preliminary support for Apple Vortex cpus
+	* Added support for the Cavium ThunderX3T110 cpu
+	* Fixed cpu detection on BSD-like systems
+	* Fixed compilation in -std=c18 mode
+
+IBM Z:
+	* Added support for compiling with the clang compiler
+	* Improved GEMM performance on Z14
+
 ====================================================================
 Version 0.3.10
  14-Jun-2020
 
@@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6)
 CCOMMON_OPT += -mfpu=vfp
 FCOMMON_OPT += -mfpu=vfp
 endif
+
+ifdef HAVE_NEON
+CCOMMON_OPT += -mfpu=neon
+FCOMMON_OPT += -mfpu=neon
+endif
@@ -10,7 +10,7 @@ USE_OPENMP = 1
 endif
 
 ifeq ($(CORE), POWER10)
-COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx  -fno-fast-math
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
 endif
 
 
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.10.dev
+VERSION = 0.3.12.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -295,10 +295,13 @@ COMMON_PROF = -pg
 
 
 
-# the below is not yet configurable, use cmake if you need to build only select types
-BUILD_SINGLE = 1
-BUILD_DOUBLE = 1
-BUILD_COMPLEX = 1
-BUILD_COMPLEX16 = 1
+# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
+# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
+# the functions for complex numbers, uncomment the desired type(s) below
+# BUILD_SINGLE = 1
+# BUILD_DOUBLE = 1
+# BUILD_COMPLEX = 1
+# BUILD_COMPLEX16 = 1
+#
 #  End of user configuration
 #
@@ -6,7 +6,7 @@
 INCLUDED = 1
 
 ifndef TOPDIR
-TOPDIR = .
+TOPDIR = .
 endif
 
  # If ARCH is not set, we use the host system's architecture for getarch compile options.
@@ -93,6 +93,12 @@ endif
 ifdef TARGET
 GETARCH_FLAGS := -DFORCE_$(TARGET)
 GETARCH_FLAGS += -DUSER_TARGET
+ifeq ($(TARGET), GENERIC)
+ifeq ($(DYNAMIC_ARCH), 1)
+override NO_EXPRECISION=1
+export NO_EXPRECISION
+endif
+endif
 endif
 
 # Force fallbacks for 32bit
@@ -246,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)"
 ifndef TARGET_CORE
 include $(TOPDIR)/Makefile.conf
 else
+HAVE_NEON=
+HAVE_VFP=
+HAVE_VFPV3=
+HAVE_VFPV4=
+HAVE_MMX=
+HAVE_SSE=
+HAVE_SSE2=
+HAVE_SSE3=
+HAVE_SSSE3=
+HAVE_SSE4_1=
+HAVE_SSE4_2=
+HAVE_SSE4A=
+HAVE_SSE5=
+HAVE_AVX=
+HAVE_AVX2=
+HAVE_FMA3=
 include $(TOPDIR)/Makefile_kernel.conf
 endif
 
@@ -319,6 +341,7 @@ ifeq ($(GCCVERSIONGTEQ7),1)
 else
 	GCCDUMPVERSION_PARAM := -dumpversion
 endif
+GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
 GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
 endif
@@ -641,18 +664,22 @@ DYNAMIC_CORE += POWER8
 ifneq ($(C_COMPILER), GCC)
 DYNAMIC_CORE += POWER9
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 endif
 ifeq ($(C_COMPILER), GCC)
 ifeq ($(GCCVERSIONGT5), 1)
 DYNAMIC_CORE += POWER9
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
 endif
-ifeq ($(GCCVERSIONGTEQ11), 1)
+LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
+ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 else ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq ($(GCCMINORVERSIONGTEQ2), 1)
+ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 endif
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
@@ -853,7 +880,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG
 FCOMMON_OPT += -Mrecursive -Kieee
 ifeq ($(OSNAME), Linux)
 ifeq ($(ARCH), x86_64)
-FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
+FLANG_VENDOR := $(shell $(FC) --version|cut -f 1 -d "."|head -1)
 ifeq ($(FLANG_VENDOR),AOCC)
 FCOMMON_OPT += -fno-unroll-loops
 endif
@@ -1515,6 +1542,8 @@ export HAVE_SSE4_2
 export HAVE_SSE4A
 export HAVE_SSE5
 export HAVE_AVX
+export HAVE_AVX2
+export HAVE_FMA3
 export HAVE_VFP
 export HAVE_VFPV3
 export HAVE_VFPV4
 
@@ -9,9 +9,9 @@ endif
 endif
 
 ifdef HAVE_SSE3
-ifndef DYNAMIC_ARCH
 CCOMMON_OPT += -msse3
 FCOMMON_OPT += -msse3
+endif
 ifdef HAVE_SSSE3
 CCOMMON_OPT += -mssse3
 FCOMMON_OPT += -mssse3
@@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1
 CCOMMON_OPT += -msse4.1
 FCOMMON_OPT += -msse4.1
 endif
+ifdef HAVE_AVX
+CCOMMON_OPT += -mavx
+FCOMMON_OPT += -mavx
+endif
+ifdef HAVE_AVX2
+CCOMMON_OPT += -mavx2
+FCOMMON_OPT += -mavx2
 endif
+ifdef HAVE_FMA3
+CCOMMON_OPT += -mfma
+FCOMMON_OPT += -mfma
 endif
 
 ifeq ($(CORE), SKYLAKEX)
@@ -47,8 +57,6 @@ ifndef DYNAMIC_ARCH
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
 # cooperlake support was added in 10.1
-GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
-GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
 ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
 CCOMMON_OPT += -march=cooperlake
 FCOMMON_OPT += -march=cooperlake
@@ -68,24 +76,31 @@ endif
 endif
 endif
 
-ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE))
-ifndef DYNAMIC_ARCH
+ifdef HAVE_AVX2
 ifndef NO_AVX2
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
-GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
-GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
+CCOMMON_OPT += -mavx2
+endif
+else 
+ifeq ($(C_COMPILER), CLANG)
 CCOMMON_OPT += -mavx2
 endif
 endif
 ifeq ($(F_COMPILER), GFORTRAN)
 # AVX2 support was added in 4.7.0
 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
+GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
-ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
 FCOMMON_OPT += -mavx2
 endif
+else
+ifeq ($(F_COMPILER), FLANG)
+FCOMMON_OPT += -mavx2
 endif
 endif
 endif