OpenMathLib
diff --git a/‎.github/workflows/arm64_graviton.yml‎
Lines changed: 2 additions & 1 deletion b/‎.github/workflows/arm64_graviton.yml‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 1 deletion b/‎.gitignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 8 additions & 5 deletions b/‎CMakeLists.txt‎
Lines changed: 8 additions & 5 deletions
diff --git a/‎CONTRIBUTORS.md‎
Lines changed: 9 additions & 1 deletion b/‎CONTRIBUTORS.md‎
Lines changed: 9 additions & 1 deletion
diff --git a/‎Changelog.txt‎
Lines changed: 134 additions & 0 deletions b/‎Changelog.txt‎
Lines changed: 134 additions & 0 deletions
diff --git a/‎Makefile.arm64‎
Lines changed: 10 additions & 0 deletions b/‎Makefile.arm64‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎Makefile.power‎
Lines changed: 6 additions & 6 deletions b/‎Makefile.power‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎Makefile.prebuild‎
Lines changed: 2 additions & 2 deletions b/‎Makefile.prebuild‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎Makefile.riscv64‎
Lines changed: 4 additions & 4 deletions b/‎Makefile.riscv64‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎Makefile.rule‎
Lines changed: 3 additions & 1 deletion b/‎Makefile.rule‎
Lines changed: 3 additions & 1 deletion
@@ -88,13 +88,14 @@ jobs:
         run: |
           case "${{ matrix.build }}" in
             "make")
-              make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
+              make -j$(nproc) DYNAMIC_ARCH=1 BUILD_BFLOAT16=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
               ;;
             "cmake")
               mkdir build && cd build
               cmake -DDYNAMIC_ARCH=1 \
                     -DNOFORTRAN=0 \
                     -DBUILD_WITHOUT_LAPACK=0 \
+                    -DBUILD_BFLOAT16=1 \
                     -DCMAKE_VERBOSE_MAKEFILE=ON \
                     -DCMAKE_BUILD_TYPE=Release \
                     -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
 
@@ -13,8 +13,8 @@ lapack-3.4.1.tgz
 lapack-3.4.2
 lapack-3.4.2.tgz
 lapack-netlib/make.inc
-lapack-netlib/lapacke/include/lapacke_mangling.h
 lapack-netlib/SRC/la_constants.mod
+lapack-netlib/SRC/la_xisnan.mod
 lapack-netlib/TESTING/testing_results.txt
 lapack-netlib/INSTALL/test*
 lapack-netlib/TESTING/xeigtstc
 
@@ -9,7 +9,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 29.dev)
+set(OpenBLAS_PATCH_VERSION 30.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
@@ -152,6 +152,9 @@ endif ()
 if (NOT DEFINED BUILD_BFLOAT16)
  set (BUILD_BFLOAT16 false)
 endif ()
+if (NOT DEFINED BUILD_HFLOAT16)
+ set (BUILD_HFLOAT16 false)
+endif ()
 # set which float types we want to build for
 if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
   # if none are defined, build for all
@@ -305,8 +308,8 @@ if (USE_OPENMP)
   endif()
 endif()
 
-# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on
-if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
+# Fix "Argument list too long" for macOS with POWERPC or Intel CPUs 
+if(APPLE AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
   # Use response files
   set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   # Always build static library first
@@ -541,13 +544,13 @@ message(STATUS "adding postbuild instruction to rename syms")
   if (NOT USE_PERL)
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
 	  COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
-    COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
+    COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )
   else()
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
     COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
-    COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
+    COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )
   endif()
 
@@ -256,4 +256,12 @@ In chronological order:
   * [2025-04-22] Optimise dot kernel for NEOVERSE V1
 
 * Sharif Inamdar <[email protected]>
-  * [2025-06-05] Optimize gemv_n_sve_v1x3 kernel
+  * [2025-06-05] Optimize gemv_n_sve_v1x3 kernel
+
+* Guoyuan Li <https://github.com/guoyuanplct>
+  * [2025-04-11] Optimise gemv kernel for RISCV64_ZVL256B
+  * [2025-05-01] Optimise zgemv kernel for RISCV64_ZVL256B
+  * [2025-05-17] Optimise omatcopy/zomatcopy kernel for RISCV64_ZVL256B
+  * [2025-05-29] Optimise axpby kernel for RISCV64_ZVL256B
+  * [2025-06-05] Optimise hbmv kernel for RISCV64_ZVL256B
+
@@ -1,4 +1,138 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.30
+19-Jun-2025
+
+general:
+ - fixed an installation problem with the thread safety test in gmake builds
+ - fixed spurious overwriting of an input array in complex GEMMT/GEMMTR
+ - fixed naming of GEMMTR in error messages from XERBLA
+ - fixed compilation of SBGEMMT/SBGEMMTR in CMake builds
+ - fixed the implementation of ?NRM2 to handle INCX=0 correctly
+ - removed tests for CSROT and ZDROT that relied on unspecified behavior
+ - fixed a performance regression in multithreaded GEMM that was particularly
+   serious on POWER targets
+ - fixed linking issues when using LLVM's flang-new with gmake
+ - fixed a potential thread safety problem with C11 atomic operations
+ - further improved the workload partitioning in parallel GEMM
+ - fixed omission of LAPACKE interfaces for CGESVDQ, CTRSYL3 and ?GEQPF in
+   CMake builds
+ - fixed mishandling of setting NO_LAPACK to FALSE, and incorrect dependencies
+   for LAPACK function SPMV in CMake builds
+ - added explicit CMake options for building LAPACKE and shared libraries
+ - simplified and improved handling of OpenMP options in CMake builds
+ - reworked Windows DLL generation in CMake builds to ensure correct symbol
+   renaming (pre/postfixing) and optional generation of PDB files for debugging
+ - updated the Perl script version of the gensymbol utility for use with 
+   Windows-on-Arm
+ - fixed building with (MinGW) gmake on Windows to ensure completeness of the
+   LAPACK included in the static library (potential race condition due to the
+   Windows version of the "ln" utility creating snapshot copies rather than links)
+ - fixed unwanted deletion of the lapacke_mangling.h file by "make clean"
+ - fixed potential duplication of a _64 suffix on library names in CMake builds
+ - fixed compilation of the C fallback copies of the LAPACK code with GCC 15
+ - included fixes from the Reference-LAPACK project:
+   - fixed a truncated error message in the EIG part of the testsuite
+     (Reference-LAPACK PR 1119)
+   - fixed too strict check in LAPACKE_?gesdd_work (PR #1126)
+   - fixed memory corruption when calling ?GEEV with non-finite data (PR #1128)
+   - fixed missing initialization of a variable in C/GEQP3RK (PR #1131)
+   - fixed 2nd dimension chosen in C/ZUNMLQ transposition operation (PR #1135)
+
+x86_64:
+ - fixed an error in the SBGEMV kernel for Cooper Lake/Sapphire Rapids
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - improved the compiler identification code for flang-new
+ - fixed a potential build issue in the ZSUM kernel
+ - fixed "argument list too long" errors when building on macOS
+ - added cpu autodetection support for several new Arrow Lake models
+ - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH
+ - fixed compilation with the MinGW build of GCC 15
+
+arm64:
+ - fixed cpu type detection of A64FX and some ThunderX models (broken in 0.3.29)
+ - added support for the AmpereOne/1A cpus in DYNAMIC_ARCH builds
+ - added an optimized SBGEMM kernel for NEOVERSEV1
+ - improved 1xN SBGEMM performance by forwarding to SBGEMV 
+ - introduced a stepwise increase of the thread count used for
+   SGEMM and SGEMV on NEOVERSEV1/V2 in relation to problem size
+ - introduced a stepwise increase of the thread count used for
+   DGEMV on NEOVERSEV1 in relation to problem size
+ - introduced a stepwise increase of the thread count used for
+   SDOT and DDOT on NEOVERSEV1 in relation to problem size
+ - worked around assembler limitations in LLVM for Windows-on-Arm
+ - enabled cpu type autodetection from the registry on Windows-on-Arm
+ - improved multithreading threshold for GEMV and GESV on Windows-on-Arm
+ - fixed overoptimization issues with LLVM's flang in Windows-on-Arm
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - added a fast path SGEMM kernel for small workloads on SME capable targets
+ - improved performance of SGEMM and DGEMM kernels for small workloads
+ - improved performance of SGEMV and DGEMV on SVE-capable targets
+ - improved performance of SGEMV on NEOVERSEN1 and Apple M
+ - added optimized SSYMV and DSYMV kernels for NEOVERSEN1, Apple M and all
+   SVE capable targets
+ - added optimized SBGEMV kernels for NEOVERSEV1/V2/N2
+ - improved performance of SGEMM through faster NCOPY kernels
+ - added compiler options for the NVIDIA HPC Compiler Suite
+ - fixed compilation on macOS with Xcode 16.3 and later
+ - fixed cpu core type and cache size detection on Apple M4
+ - updated GEMM parameter settings for Neoverse cpus in cross-builds with CMake
+ - fixed default compiler options for NEOVERSEN1 and CORTEXX2 in CMake builds
+ - fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH
+ - fixed potential miscompilation of the non-SVE SDOT kernel
+
+riscv64:
+ - added optimized SROTM and DROTM kernels for x280
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - improved performance of GEMM_TCOPY on RVV1.0 targets with 
+   VLEN of 128 or 256
+ - improved performance of OMATCOPY on targets with VLEN 256
+ - greatly improved performance of SGEMV/DGEMV
+ - improved performance of CGEMV and ZGEMV on C910V and all RVV targets 
+   with VLEN 256
+ - improved performance of SAXPBY and DAXPBY on C910V and all RVV targets 
+   with VLEN 256
+ - improved performance of AXPY and DOT on C910V and ZVL256B targets by
+   falling back to non-vectorized code for very small N. (Thereby fixing
+   poor performance of CHBMV/ZHBMV for very small K)
+ - fixed CMake build failures of the TRMM kernels 
+
+loongarch64:
+ - improved performance of the LSX versions of SSYMV/DSYMV
+ - made the LASX versions of the DSYMV and SSYMV kernels 
+   compatible with hardware changes in LA664 and future targets
+ - fixed inaccuracies in several LASX kernels
+ - improved compatibility of LSX kernels with LA264 targets
+ - fixed handling of deprecated target names in CMake builds
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+
+power:
+ - fixed building for PPCG4 with CMake
+ - fixed SSCAL/DSCAL on PPC970 running FreeBSD
+ - fixed a potential alignment issue in the POWER8 SGEMV kernel
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+
+zarch:
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - fixed unwanted generation of object files with a writable stack
+
+x86:
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - worked around potential miscompilation of CDOT with very old binutils
+
+arm:
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+ - fixed unwanted generation of object files with a writable stack
+
+sparc:
+ - fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
+
+alpha:
+ - fixed build failure caused by spurious Windows-only typecasts
+
+cell:
+ - fixed probable build issue caused by spurious Windows-only typecasts
+ 
 ====================================================================
 Version 0.3.29
 12-Jan-2025
 
@@ -191,6 +191,16 @@ endif
 endif
 endif
 
+# Detect Ampere AmpereOne(ampere1,ampere1a) processors.
+ifeq ($(CORE), AMPERE1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
+CCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng
+ifneq ($(F_COMPILER), NAG)
+FCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng
+endif
+endif
+endif
+
 # Use a53 tunings because a55 is only available in GCC>=8.1
 ifeq ($(CORE), CORTEXA55)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
 
@@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
 ifeq ($(C_COMPILER), GCC)
 ifeq ($(GCCVERSIONGTEQ10), 1)
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 else ifneq ($(GCCVERSIONGT4), 1)
 $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
 else
 $(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
-CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
 endif
 else
-CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
 endif
 ifeq ($(F_COMPILER), IBM)
 FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
@@ -34,7 +34,7 @@ endif
 
 ifeq ($(CORE), POWER9)
 ifneq ($(C_COMPILER), PGI)
-CCOMMON_OPT += -Ofast -mvsx -fno-fast-math
+CCOMMON_OPT += -O3 -mvsx -fno-fast-math
 ifeq ($(C_COMPILER), GCC)
 ifneq ($(GCCVERSIONGT4), 1)
 $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
@@ -70,7 +70,7 @@ endif
 
 ifeq ($(CORE), POWER8)
 ifneq ($(C_COMPILER), PGI)
-CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx  -fno-fast-math
+CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx  -fno-fast-math
 else
 CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
 endif
 
@@ -64,11 +64,11 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_ZVL256B)
-TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_ZVL128B)
-TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 
 ifeq ($(TARGET), RISCV64_GENERIC)
 
@@ -7,12 +7,12 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_ZVL256B)
-CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
-FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
+CCOMMON_OPT += -march=rv64imafdcv_zvl256b_zvfh_zfh -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_ZVL128B)
-CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d 
-FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
+CCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d 
+FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
 
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.29.dev
+VERSION = 0.3.30.dev
 
 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
 # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@@ -308,6 +308,8 @@ COMMON_PROF = -pg
 # If you want to enable the experimental BFLOAT16 support
 # BUILD_BFLOAT16 = 1
 
+# If you want to enable the experimental HFLOAT16 support
+# BUILD_HFLOAT16 = 1
 
 # Set the thread number threshold beyond which the job array for the threaded level3 BLAS
 # will be allocated on the heap rather than the stack. (This array alone requires