OpenMathLib
diff --git a/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Changelog.txt‎
Lines changed: 29 additions & 3 deletions b/‎Changelog.txt‎
Lines changed: 29 additions & 3 deletions
diff --git a/‎Makefile.power‎
Lines changed: 1 addition & 1 deletion b/‎Makefile.power‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎Makefile.rule‎
Lines changed: 9 additions & 6 deletions b/‎Makefile.rule‎
Lines changed: 9 additions & 6 deletions
diff --git a/‎Makefile.system‎
Lines changed: 6 additions & 2 deletions b/‎Makefile.system‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎Makefile.x86_64‎
Lines changed: 10 additions & 2 deletions b/‎Makefile.x86_64‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎cmake/arch.cmake‎
Lines changed: 1 addition & 0 deletions b/‎cmake/arch.cmake‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cmake/prebuild.cmake‎
Lines changed: 23 additions & 0 deletions b/‎cmake/prebuild.cmake‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎common_power.h‎
Lines changed: 2 additions & 2 deletions b/‎common_power.h‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎cpuid_arm64.c‎
Lines changed: 1 addition & 1 deletion b/‎cpuid_arm64.c‎
Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 11)
+set(OpenBLAS_PATCH_VERSION 12)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions
 
@@ -1,9 +1,36 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.12
+ 24-Oct-2020
+
+common:
+	* Fixed missing LAPACK functions (inadvertently dropped during
+	  the build system restructuring)
+	* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
+
+POWER:
+	* Added optimized SCOPY/CCOPY kernels for POWER10
+	* Increased and unified the default size of the GEMM BUFFER
+	* Fixed building for POWER10 in DYNAMIC_ARCH mode
+	* POWER10 compatibility test now checks binutils version as well
+	* Cleaned up compiler warnings
+
+x86_64:
+	* corrected compiler version checks for AVX2 compatibility
+	* added compiler option -mavx2 for building with flang
+	* fixed direct SGEMM pathway for small matrix sizes (broken by
+	  the code refactoring in 0.3.11)
+	* fixed unhandled partial register clobbers in several kernels
+	  for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
+
+ARMV8:
+	* improved Apple Vortex support to include cross-compiling
+
 ====================================================================
 Version 0.3.11
  17-Oct-2020
 
- common:
+common:
  	* API change:
 	  the newly added BFLOAT16 functions were renamed to use the
 	  letter "B" instead of "H" to avoid potential confusion with
@@ -28,7 +55,7 @@ Version 0.3.11
 	* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as 
 	  enabling these options
 	* Fixed detection of gfortran when invoked through an mpi wrapper
-	* Improve thread reinitialization performance with OpenMP xafter a fork 
+	* Improve thread reinitialization performance with OpenMP after a fork 
 	* Added support for building only the subset of the library required
 	  for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
 	* Optional function name prefixes and suffixes are now correctly
@@ -66,7 +93,6 @@ ARMV8:
 	* Fixed cpu detection on BSD-like systems
 	* Fixed compilation in -std=C18 mode
 
-
 IBM Z:
 	* Added support for compiling with the clang compiler
 	* Improved GEMM performance on Z14
 
@@ -10,7 +10,7 @@ USE_OPENMP = 1
 endif
 
 ifeq ($(CORE), POWER10)
-COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx  -fno-fast-math
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
 endif
 
 
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.11
+VERSION = 0.3.12
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -295,10 +295,13 @@ COMMON_PROF = -pg
 
 
 
-# the below is not yet configurable, use cmake if you need to build only select types
-BUILD_SINGLE = 1
-BUILD_DOUBLE = 1
-BUILD_COMPLEX = 1
-BUILD_COMPLEX16 = 1
+# By default the library contains BLAS functions (and LAPACK if selected) for all input types.
+# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
+# the functions for complex numbers, uncomment the desired type(s) below
+# BUILD_SINGLE = 1
+# BUILD_DOUBLE = 1
+# BUILD_COMPLEX = 1
+# BUILD_COMPLEX16 = 1
+#
 #  End of user configuration
 #
@@ -641,18 +641,22 @@ DYNAMIC_CORE += POWER8
 ifneq ($(C_COMPILER), GCC)
 DYNAMIC_CORE += POWER9
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 endif
 ifeq ($(C_COMPILER), GCC)
 ifeq ($(GCCVERSIONGT5), 1)
 DYNAMIC_CORE += POWER9
 else
 $(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
 endif
-ifeq ($(GCCVERSIONGTEQ11), 1)
+# 1 if the binutils ld minor version is >= 35, else 0; ">=" is escaped so the shell passes it to expr instead of treating ">" as a redirection
+LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
+ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 else ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq ($(GCCMINORVERSIONGTEQ2), 1)
+ifeq ($(GCCMINORVERSIONGTEQ2)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
+CCOMMON_OPT += -DHAVE_P10_SUPPORT
 endif
 else
 $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
 
@@ -74,8 +74,10 @@ ifndef NO_AVX2
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
 GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
+GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
-ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
 CCOMMON_OPT += -mavx2
 endif
 else 
@@ -86,8 +88,14 @@ endif
 ifeq ($(F_COMPILER), GFORTRAN)
 # AVX2 support was added in 4.7.0
 GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
+GCCVERSIONGTEQ5 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 5)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
-ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
+GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
+ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
+FCOMMON_OPT += -mavx2
+endif
+else
+ifeq ($(F_COMPILER), FLANG)
 FCOMMON_OPT += -mavx2
 endif
 endif
 
@@ -49,6 +49,7 @@ if (DYNAMIC_ARCH)
 
   if (POWER)
 	  set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
   endif ()
 
   if (X86)
 
@@ -416,6 +416,29 @@ endif ()
     set(ZGEMM_UNROLL_M 4)
     set(ZGEMM_UNROLL_N 4)
     set(SYMV_P 16)
+elseif ("${TCORE}" STREQUAL "VORTEX")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define ARMV8\n"
+      "#define L1_CODE_SIZE\t32768\n"
+      "#define L1_CODE_LINESIZE\t64\n"
+      "#define L1_CODE_ASSOCIATIVE\t4\n"
+      "#define L1_DATA_SIZE\t32768\n"
+      "#define L1_DATA_LINESIZE\t64\n"
+      "#define L1_DATA_ASSOCIATIVE\t4\n"
+      "#define L2_SIZE\t5262144\n"
+      "#define L2_LINESIZE\t64\n"
+      "#define L2_ASSOCIATIVE\t8\n"
+      "#define DTB_DEFAULT_ENTRIES\t64\n"
+      "#define DTB_SIZE\t4096\n")
+    set(SGEMM_UNROLL_M 16)
+    set(SGEMM_UNROLL_N 4)
+    set(DGEMM_UNROLL_M 8)
+    set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 8)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 4)
+    set(ZGEMM_UNROLL_N 4)
+    set(SYMV_P 16)
   elseif ("${TCORE}" STREQUAL "POWER6")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE 32768\n"
 
@@ -844,8 +844,8 @@ Lmcount$lazy_ptr:
 #define BUFFER_SIZE     (  2 << 20)
 #elif defined(PPC440FP2)
 #define BUFFER_SIZE     ( 16 << 20)
-#elif defined(POWER8) || defined(POWER9) || defined(POWER10)
-#define BUFFER_SIZE     ( 64 << 20)
+#elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
+#define BUFFER_SIZE     ( 64 << 22)
 #else
 #define BUFFER_SIZE     ( 16 << 20)
 #endif
 
@@ -424,7 +424,7 @@ void get_cpuconfig(void)
 			sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
 			printf("#define L1_DATA_SIZE	     %d       \n",value);
 			sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
-			printf("#define L2_DATA_SIZE	     %d       \n",value);
+			printf("#define L2_SIZE	     %d       \n",value);
 			break;
 #endif			
 	}
Original file line number	Diff line number	Diff line change
`@@ -424,7 +424,7 @@ void get_cpuconfig(void)`
`424`	`424`	`sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);`
`425`	`425`	`printf("#define L1_DATA_SIZE %d \n",value);`
`426`	`426`	`sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);`
`427`		`- printf("#define L2_DATA_SIZE %d \n",value);`
	`427`	`+ printf("#define L2_SIZE %d \n",value);`
`428`	`428`	`break;`
`429`	`429`	`#endif`
`430`	`430`	`}`