Skip to content

Commit 75472b8

Browse files
author
Chip Kerchner
committed
Merge branch 'develop' into betterPowerGEMVTail
2 parents 1a7b8c6 + cd3945b commit 75472b8

28 files changed

+1845
-1745
lines changed

CMakeLists.txt

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
88

99
set(OpenBLAS_MAJOR_VERSION 0)
1010
set(OpenBLAS_MINOR_VERSION 3)
11-
set(OpenBLAS_PATCH_VERSION 27.dev)
11+
set(OpenBLAS_PATCH_VERSION 28.dev)
1212

1313
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1414

@@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS
2222

2323
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
2424

25+
set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)")
26+
2527
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
2628

2729
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
@@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF
3032

3133
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
3234

33-
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
35+
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF)
3436

3537
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
3638

@@ -256,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
256258
endif()
257259
endif()
258260

261+
if (APPLE AND BUILD_SHARED_LIBS)
262+
set(CMAKE_MACOSX_RPATH ON)
263+
endif()
264+
259265
# It seems that this hack is not required since macOS 11 Big Sur
260266
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
261267
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)

Changelog.txt

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,127 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.28
4+
8-Aug-2024
5+
6+
general:
7+
- Reworked the unfinished implementation of HUGETLB from GotoBLAS
8+
for allocating huge memory pages as buffers on suitable systems
9+
- Changed the unfinished implementation of GEMM3M for the generic
10+
target on all architectures to at least forward to regular GEMM
11+
- Improved multithreaded GEMM performance for large non-skinny matrices
12+
- Improved BLAS3 performance on larger multicore systems through improved
13+
parallelism
14+
- Improved performance of the initial memory allocation by reducing
15+
locking overhead
16+
- Improved performance of GBMV at small problem sizes by introducing
17+
a size barrier for the switch to multithreading
18+
- Added an implementation of the CBLAS_GEMM_BATCH extension
19+
- Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in
20+
CMAKE builds (error introduced in 0.3.27)
21+
- Fixed corner cases involving the handling of NAN and INFINITY
22+
arguments in ?SCAL on all architectures
23+
- Added support for cross-compiling to WebAssembly (WASM) with CMAKE (in addition
24+
to the already present makefile support)
25+
- Fixed NAN handling and potential accuracy issues in compilations with
26+
Intel ICX by supplying a suitable fp-model option by default
27+
- The contents of the github project wiki have been converted into
28+
a new set of documentation included with the source code.
29+
- It is now possible to register a callback function that replaces
30+
the built-in support for multithreading with an external backend
31+
like TBB (openblas_set_threads_callback_function)
32+
- Fixed potential duplication of suffixes in shared library naming
33+
- Improved C compiler detection by the build system to tolerate more
34+
naming variants for gcc builds
35+
- Fixed an unnecessary dependency of the utest on CBLAS
36+
- Fixed spurious error reports from the BLAS extensions utest
37+
- Fixed unwanted invocation of the GEMM3M tests in cross-compilation
38+
- Fixed a flaw in the makefile build that could lead to the pkgconfig
39+
file containing an entry of UNKNOWN for the target cpu after installing
40+
- Integrated fixes from the Reference-LAPACK project:
41+
- Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961)
42+
- Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018)
43+
- Fixed potential infinite loop in the LAPACK testsuite (PR 1024)
44+
- Make the variable type used for hidden length arguments configurable (PR 1025)
45+
- Fixed SYTRD workspace computation and various typos (PR 1030)
46+
- Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033)
47+
48+
x86-64:
49+
- reverted thread management under Windows to its state before 0.3.26
50+
due to signs of race conditions in some circumstances now under study
51+
- fixed accidental selection of the unoptimized generic SBGEMM kernel
52+
in CMAKE builds for CooperLake and SapphireRapids targets
53+
- fixed a potential thread buffer overrun in SBSTOBF16 on small systems
54+
- fixed an accuracy issue in ZSCAL introduced in 0.3.26
55+
- fixed compilation with CMAKE and recent releases of LLVM
56+
- added support for Intel Emerald Rapids and Meteor Lake cpus
57+
- added autodetection support for the Zhaoxin KX-7000 cpu
58+
- fixed autodetection of Intel Prescott (probably broken since 0.3.19)
59+
- fixed compilation for older targets with the Yocto SDK
60+
- fixed compilation of the converter-generated C versions
61+
of the LAPACK sources with gcc-14
62+
- improved compiler options when building with CMAKE and LLVM for
63+
AVX512-capable targets
64+
- added support for supplying the L2 cache size via an environment
65+
variable (OPENBLAS_L2_SIZE) in case it is not correctly reported
66+
(as in some VM configurations)
67+
- improved the error message shown when thread creation fails on startup
68+
- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
69+
70+
arm:
71+
- fixed building for baremetal targets with make
72+
73+
arm64:
74+
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
75+
matrix to the corresponding GEMV kernel
76+
- added optimized SGEMV and DGEMV kernels for A64FX
77+
- added optimized SVE kernels for small-matrix GEMM
78+
- added A64FX to the cpu list for DYNAMIC_ARCH
79+
- fixed building with support for cpu affinity
80+
- worked around accuracy problems with C/ZNRM2 on NeoverseN1 and
81+
Apple M targets
82+
- improved GEMM performance on Neoverse V1
83+
- fixed compilation for NEOVERSEN2 with older compilers
84+
- fixed potential miscompilation of the SVE SDOT and DDOT kernels
85+
- fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels
86+
- fixed a potential overflow when using very large user-defined BUFFERSIZE
87+
- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
88+
89+
power:
90+
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
91+
matrix to the corresponding GEMV kernel
92+
- significantly improved performance of SBGEMM on POWER10
93+
- fixed compilation with OpenMP and the XLF compiler
94+
- fixed building of the BLAS extension utests under AIX
95+
- fixed building of parts of the LAPACK testsuite with XLF
96+
- fixed CSWAP/ZSWAP on big-endian POWER10 targets
97+
- fixed a performance regression in SAXPY on POWER10 with OpenXL
98+
- fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM
99+
- fixed building for POWER9 under FreeBSD
100+
- fixed a potential overflow when using very large user-defined BUFFERSIZE
101+
- fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV
102+
103+
riscv64:
104+
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
105+
matrix to the corresponding GEMV kernel
106+
- fixed building for RISCV64_GENERIC with OpenMP enabled
107+
- added DYNAMIC_ARCH support (comprising RISCV64_GENERIC and the two
108+
RVV 1.0 targets with vector length of 128 and 256)
109+
- worked around the ZVL128B kernels for AXPBY mishandling the special
110+
case of zero Y increment
111+
112+
loongarch64:
113+
- improved GEMM performance on servers of the 3C5000 generation
114+
- improved performance and stability of DGEMM
115+
- improved GEMV and TRSM kernels for LSX and LASX vector ABIs
116+
- fixed CMAKE compilation with the INTERFACE64 option set
117+
- fixed compilation with CMAKE
118+
- worked around spurious errors flagged by the BLAS3 tests
119+
- worked around a miscompilation of the POTRS utest by gcc 14.1
120+
121+
mips64:
122+
- fixed ASUM and SUM kernels to accept negative step sizes in X
123+
- fixed complex GEMV kernels for MSA
124+
2125
====================================================================
3126
Version 0.3.27
4127
4-Apr-2024

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@ else
4545
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
4646
endif
4747

48+
ifdef LAPACK_STRLEN
49+
LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN)
50+
endif
51+
4852
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
4953

5054
.PHONY : all libs netlib $(RELA) test ctest shared install

Makefile.install

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ endif
178178
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
179179
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
180180
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
181-
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
181+
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
182182
@echo 'version='$(VERSION) >> "$(PKGFILE)"
183183
@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
184184
@cat openblas.pc.in >> "$(PKGFILE)"

Makefile.rule

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
# This library's version
6-
VERSION = 0.3.27.dev
6+
VERSION = 0.3.28.dev
77

88
# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
99
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@@ -134,6 +134,12 @@ VERSION = 0.3.27.dev
134134
# Build LAPACK Deprecated functions since LAPACK 3.6.0
135135
BUILD_LAPACK_DEPRECATED = 1
136136

137+
# The variable type assumed for the length of character arguments when passing
138+
# data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC
139+
# versions used "int"). Mismatches will not cause runtime failures but may result
140+
# in build warnings or errors when building with link-time optimization (LTO)
141+
# LAPACK_STRLEN=int
142+
137143
# Build RecursiveLAPACK on top of LAPACK
138144
# BUILD_RELAPACK = 1
139145
# Have RecursiveLAPACK actually replace standard LAPACK routines instead of

Makefile.system

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,12 @@ endif
277277
ifeq ($(ARCH), arm64)
278278
GEMM_GEMV_FORWARD = 1
279279
endif
280+
ifeq ($(ARCH), riscv)
281+
GEMM_GEMV_FORWARD = 1
282+
endif
283+
ifeq ($(ARCH), power)
284+
GEMM_GEMV_FORWARD = 1
285+
endif
280286

281287
ifeq ($(SMALL_MATRIX_OPT), 1)
282288
CCOMMON_OPT += -DSMALL_MATRIX_OPT

cmake/arch.cmake

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,11 @@ if (DYNAMIC_ARCH)
5757
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
5858
set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
5959
endif ()
60-
60+
61+
if (RISCV64)
62+
set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B)
63+
endif ()
64+
6165
if (X86)
6266
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
6367
endif ()

cmake/system.cmake

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT)
403403
endif ()
404404

405405
if (DYNAMIC_ARCH)
406-
if (X86 OR X86_64 OR ARM64 OR POWER)
406+
if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64)
407407
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
408408
if (DYNAMIC_OLDER)
409409
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
@@ -621,7 +621,10 @@ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}")
621621
set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}")
622622

623623
#For LAPACK Fortran codes.
624-
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}")
624+
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" )
625+
if (LAPACK_STRLEN)
626+
set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}")
627+
endif()
625628
set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")
626629

627630
#Disable -fopenmp for LAPACK Fortran codes on Windows.

common_thread.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@ typedef struct blas_queue {
111111
struct blas_queue *next;
112112

113113
#if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
114-
// CRITICAL_SECTION lock;
115-
// HANDLE finish;
114+
CRITICAL_SECTION lock;
115+
HANDLE finish;
116116
volatile int finished;
117117
#else
118118
pthread_mutex_t lock;

driver/others/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ if (DYNAMIC_ARCH)
5252
list(APPEND COMMON_SOURCES dynamic_arm64.c)
5353
elseif (POWER)
5454
list(APPEND COMMON_SOURCES dynamic_power.c)
55+
elseif (RISCV64)
56+
list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c)
5557
else ()
5658
list(APPEND COMMON_SOURCES dynamic.c)
5759
endif ()

0 commit comments

Comments
 (0)