Skip to content

Commit fd37406

Browse files
authored
Merge branch 'develop' into optimized_gemv_n_1x3
2 parents 8279e68 + 15d6e58 commit fd37406

File tree

126 files changed

+4878
-598
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

126 files changed

+4878
-598
lines changed

.github/workflows/arm64_graviton.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,13 +88,14 @@ jobs:
8888
run: |
8989
case "${{ matrix.build }}" in
9090
"make")
91-
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
91+
make -j$(nproc) DYNAMIC_ARCH=1 BUILD_BFLOAT16=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
9292
;;
9393
"cmake")
9494
mkdir build && cd build
9595
cmake -DDYNAMIC_ARCH=1 \
9696
-DNOFORTRAN=0 \
9797
-DBUILD_WITHOUT_LAPACK=0 \
98+
-DBUILD_BFLOAT16=1 \
9899
-DCMAKE_VERBOSE_MAKEFILE=ON \
99100
-DCMAKE_BUILD_TYPE=Release \
100101
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ lapack-3.4.1.tgz
1313
lapack-3.4.2
1414
lapack-3.4.2.tgz
1515
lapack-netlib/make.inc
16-
lapack-netlib/lapacke/include/lapacke_mangling.h
1716
lapack-netlib/SRC/la_constants.mod
17+
lapack-netlib/SRC/la_xisnan.mod
1818
lapack-netlib/TESTING/testing_results.txt
1919
lapack-netlib/INSTALL/test*
2020
lapack-netlib/TESTING/xeigtstc

CMakeLists.txt

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ project(OpenBLAS C ASM)
99

1010
set(OpenBLAS_MAJOR_VERSION 0)
1111
set(OpenBLAS_MINOR_VERSION 3)
12-
set(OpenBLAS_PATCH_VERSION 29.dev)
12+
set(OpenBLAS_PATCH_VERSION 30.dev)
1313

1414
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1515

@@ -152,6 +152,9 @@ endif ()
152152
if (NOT DEFINED BUILD_BFLOAT16)
153153
set (BUILD_BFLOAT16 false)
154154
endif ()
155+
if (NOT DEFINED BUILD_HFLOAT16)
156+
set (BUILD_HFLOAT16 false)
157+
endif ()
155158
# set which float types we want to build for
156159
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
157160
# if none are defined, build for all
@@ -305,8 +308,8 @@ if (USE_OPENMP)
305308
endif()
306309
endif()
307310

308-
# Fix "Argument list too long" for macOS with Intel CPUs and DYNAMIC_ARCH turned on
309-
if(APPLE AND DYNAMIC_ARCH AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
311+
# Fix "Argument list too long" for macOS with POWERPC or Intel CPUs
312+
if(APPLE AND (NOT CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "arm64"))
310313
# Use response files
311314
set(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
312315
# Always build static library first
@@ -541,13 +544,13 @@ message(STATUS "adding postbuild instruction to rename syms")
541544
if (NOT USE_PERL)
542545
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
543546
COMMAND sh ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
544-
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
547+
COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/${OpenBLAS_LIBNAME}.so
545548
COMMENT "renaming symbols"
546549
)
547550
else()
548551
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
549552
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
550-
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
553+
COMMAND objcopy --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
551554
COMMENT "renaming symbols"
552555
)
553556
endif()

CONTRIBUTORS.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,4 +256,12 @@ In chronological order:
256256
* [2025-04-22] Optimise dot kernel for NEOVERSE V1
257257

258258
* Sharif Inamdar <[email protected]>
259-
* [2025-06-05] Optimize gemv_n_sve_v1x3 kernel
259+
* [2025-06-05] Optimize gemv_n_sve_v1x3 kernel
260+
261+
* Guoyuan Li <https://github.com/guoyuanplct>
262+
* [2025-04-11] Optimise gemv kernel for RISCV64_ZVL256B
263+
* [2025-05-01] Optimise zgemv kernel for RISCV64_ZVL256B
264+
* [2025-05-17] Optimise omatcopy/zomatcopy kernel for RISCV64_ZVL256B
265+
* [2025-05-29] Optimise axpby kernel for RISCV64_ZVL256B
266+
* [2025-06-05] Optimise hbmv kernel for RISCV64_ZVL256B
267+

Changelog.txt

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,138 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.30
4+
19-Jun-2025
5+
6+
general:
7+
- fixed an installation problem with the thread safety test in gmake builds
8+
- fixed spurious overwriting of an input array in complex GEMMT/GEMMTR
9+
- fixed naming of GEMMTR in error messages from XERBLA
10+
- fixed compilation of SBGEMMT/SBGEMMTR in CMake builds
11+
- fixed the implementation of ?NRM2 to handle INCX=0 correctly
12+
- removed tests for CSROT and ZDROT that relied on unspecified behavior
13+
- fixed a performance regression in multithreaded GEMM that was particularly
14+
serious on POWER targets
15+
- fixed linking issues when using LLVM's flang-new with gmake
16+
- fixed a potential thread safety problem with C11 atomic operations
17+
- further improved the workload partitioning in parallel GEMM
18+
- fixed omission of LAPACKE interfaces for CGESVDQ, CTRSYL3 and ?GEQPF in
19+
CMake builds
20+
- fixed mishandling of setting NO_LAPACK to FALSE, and incorrect dependencies
21+
for LAPACK function SPMV in CMake builds
22+
- added explicit CMake options for building LAPACKE and shared libraries
23+
- simplified and improved handling of OpenMP options in CMake builds
24+
- reworked Windows DLL generation in CMake builds to ensure correct symbol
25+
renaming (pre/postfixing) and optional generation of PDB files for debugging
26+
- updated the Perl script version of the gensymbol utility for use with
27+
Windows-on-Arm
28+
- fixed building with (MinGW) gmake on Windows to ensure completeness of the
29+
LAPACK included in the static library (potential race condition due to the
30+
Windows version of the "ln" utility creating snapshot copies rather than links)
31+
- fixed unwanted deletion of the lapacke_mangling.h file by "make clean"
32+
- fixed potential duplication of a _64 suffix on library names in CMake builds
33+
- fixed compilation of the C fallback copies of the LAPACK code with GCC 15
34+
- included fixes from the Reference-LAPACK project:
35+
- fixed a truncated error message in the EIG part of the testsuite
36+
(Reference-LAPACK PR 1119)
37+
- fixed too strict check in LAPACKE_?gesdd_work (PR #1126)
38+
- fixed memory corruption when calling ?GEEV with non-finite data (PR #1128)
39+
- fixed missing initialization of a variable in C/GEQP3RK (PR #1131)
40+
- fixed 2nd dimension chosen in C/ZUNMLQ transposition operation (PR #1135)
41+
42+
x86_64:
43+
- fixed an error in the SBGEMV kernel for Cooper Lake/Sapphire Rapids
44+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
45+
- improved the compiler identification code for flang-new
46+
- fixed a potential build issue in the ZSUM kernel
47+
- fixed "argument list too long" errors when building on MacOS
48+
- added cpu autodetection support for several new Arrow Lake models
49+
- fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH
50+
- fixed compilation with the MinGW build of GCC 15
51+
52+
arm64:
53+
- fixed cpu type detection of A64FX and some ThunderX models (broken in 0.3.29)
54+
- added support for the AmpereOne/1A cpus in DYNAMIC_ARCH builds
55+
- added an optimized SBGEMM kernel for NEOVERSEV1
56+
- improved 1xN SBGEMM performance by forwarding to SBGEMV
57+
- introduced a stepwise increase of the thread count used for
58+
SGEMM and SGEMV on NEOVERSEV1/V2 in relation to problem size
59+
- introduced a stepwise increase of the thread count used for
60+
DGEMV on NEOVERSEV1 in relation to problem size
61+
- introduced a stepwise increase of the thread count used for
62+
SDOT and DDOT on NEOVERSEV1 in relation to problem size
63+
- worked around assembler limitations in LLVM for Windows-on-Arm
64+
- enabled cpu type autodetection from the registry on Windows-on-Arm
65+
- improved multithreading threshold for GEMV and GESV on Windows-on-Arm
66+
- fixed overoptimization issues with LLVM's flang in Windows-on-Arm
67+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
68+
- added a fast path SGEMM kernel for small workloads on SME capable targets
69+
- improved performance of SGEMM and DGEMM kernels for small workloads
70+
- improved performance of SGEMV and DGEMV on SVE-capable targets
71+
- improved performance of SGEMV on NEOVERSEN1 and Apple M
72+
- added optimized SSYMV and DSYMV kernels for NEOVERSEN1, Apple M and all
73+
SVE capable targets
74+
- added optimized SBGEMV kernels for NEOVERSEV1/V2/N2
75+
- improved performance of SGEMM through faster NCOPY kernels
76+
- added compiler options for the NVIDIA HPC Compiler Suite
77+
- fixed compilation on OSX with XCode 16.3 and later
78+
- fixed cpu core type and cache size detection on Apple M4
79+
- updated GEMM parameter settings for Neoverse cpus in cross-builds with CMake
80+
- fixed default compiler options for NEOVERSEN1 and CORTEXX2 in CMake builds
81+
- fixed conditional inclusion of the fast path SGEMM kernel in DYNAMIC_ARCH
82+
- fixed potential miscompilation of the non-SVE SDOT kernel
83+
84+
riscv64:
85+
- added optimized SROTM and DROTM kernels for x280
86+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
87+
- improved performance of GEMM_TCOPY on RVV1.0 targets with
88+
VLEN of 128 or 256
89+
- improved performance of OMATCOPY on targets with VLEN 256
90+
- greatly improved performance of SGEMV/DGEMV
91+
- improved performance of CGEMV and ZGEMV on C910V and all RVV targets
92+
with VLEN 256
93+
- improved performance of SAXPBY and DAXPBY on C910V and all RVV targets
94+
with VLEN 256
95+
- improved performance of AXPY and DOT on C910V and ZVL256B targets by
96+
falling back to non-vectorized code for very small N. (Thereby fixing
97+
poor performance of CHBMV/ZHBMV for very small K)
98+
- fixed CMake build failures of the TRMM kernels
99+
100+
loongarch64:
101+
- improved performance of the LSX versions of SSYMV/DSYMV
102+
- made the LASX versions of the DSYMV and SSYMV kernels
103+
compatible with hardware changes in LA664 and future targets
104+
- fixed inaccuracies in several LASX kernels
105+
- improved compatibility of LSX kernels with LA264 targets
106+
- fixed handling of deprecated target names in CMake builds
107+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
108+
109+
power:
110+
- fixed building for PPCG4 with CMake
111+
- fixed SSCAL/DSCAL on PPC970 running FreeBSD
112+
- fixed a potential alignment issue in the POWER8 SGEMV kernel
113+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
114+
115+
zarch:
116+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
117+
- fixed unwanted generation of object files with a writable stack
118+
119+
x86:
120+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
121+
- worked around potential miscompilation of CDOT with very old binutils
122+
123+
arm:
124+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
125+
- fixed unwanted generation of object files with a writable stack
126+
127+
sparc:
128+
- fixed corner cases of NAN and INF input handling in CSCAL and ZSCAL
129+
130+
alpha:
131+
- fixed build failure caused by spurious Windows-only typecasts
132+
133+
cell:
134+
- fixed probable build issue caused by spurious Windows-only typecasts
135+
2136
====================================================================
3137
Version 0.3.29
4138
12-Jan-2025

Makefile.arm64

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,16 @@ endif
191191
endif
192192
endif
193193

194+
# Detect Ampere AmpereOne(ampere1,ampere1a) processors.
195+
ifeq ($(CORE), AMPERE1)
196+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
197+
CCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng
198+
ifneq ($(F_COMPILER), NAG)
199+
FCOMMON_OPT += -march=armv8.6-a+crypto+crc+fp16+sha3+rng
200+
endif
201+
endif
202+
endif
203+
194204
# Use a53 tunings because a55 is only available in GCC>=8.1
195205
ifeq ($(CORE), CORTEXA55)
196206
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))

Makefile.power

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@ ifeq ($(CORE), POWER10)
1313
ifneq ($(C_COMPILER), PGI)
1414
ifeq ($(C_COMPILER), GCC)
1515
ifeq ($(GCCVERSIONGTEQ10), 1)
16-
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
16+
CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
1717
else ifneq ($(GCCVERSIONGT4), 1)
1818
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
19-
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
19+
CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
2020
else
2121
$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
22-
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
22+
CCOMMON_OPT += -O3 -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
2323
endif
2424
else
25-
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
25+
CCOMMON_OPT += -O3 -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
2626
endif
2727
ifeq ($(F_COMPILER), IBM)
2828
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
@@ -34,7 +34,7 @@ endif
3434

3535
ifeq ($(CORE), POWER9)
3636
ifneq ($(C_COMPILER), PGI)
37-
CCOMMON_OPT += -Ofast -mvsx -fno-fast-math
37+
CCOMMON_OPT += -O3 -mvsx -fno-fast-math
3838
ifeq ($(C_COMPILER), GCC)
3939
ifneq ($(GCCVERSIONGT4), 1)
4040
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
@@ -70,7 +70,7 @@ endif
7070

7171
ifeq ($(CORE), POWER8)
7272
ifneq ($(C_COMPILER), PGI)
73-
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
73+
CCOMMON_OPT += -O3 -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
7474
else
7575
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
7676
endif

Makefile.prebuild

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,11 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
6464
endif
6565

6666
ifeq ($(TARGET), RISCV64_ZVL256B)
67-
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
67+
TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
6868
endif
6969

7070
ifeq ($(TARGET), RISCV64_ZVL128B)
71-
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
71+
TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
7272
endif
7373

7474
ifeq ($(TARGET), RISCV64_GENERIC)

Makefile.riscv64

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d
77
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
88
endif
99
ifeq ($(CORE), RISCV64_ZVL256B)
10-
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
11-
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
10+
CCOMMON_OPT += -march=rv64imafdcv_zvl256b_zvfh_zfh -mabi=lp64d
11+
FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
1212
endif
1313
ifeq ($(CORE), RISCV64_ZVL128B)
14-
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
15-
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
14+
CCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
15+
FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
1616
endif
1717
ifeq ($(CORE), RISCV64_GENERIC)
1818
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d

Makefile.rule

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.29.dev
6+
VERSION = 0.3.30.dev
77

88
# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
99
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@@ -308,6 +308,8 @@ COMMON_PROF = -pg
308308
# If you want to enable the experimental BFLOAT16 support
309309
# BUILD_BFLOAT16 = 1
310310

311+
# If you want to enable the experimental HFLOAT16 support
312+
# BUILD_HFLOAT16 = 1
311313

312314
# Set the thread number threshold beyond which the job array for the threaded level3 BLAS
313315
# will be allocated on the heap rather than the stack. (This array alone requires

0 commit comments

Comments
 (0)