Skip to content

Commit ed47326

Browse files
authored
Merge pull request #1 from xianyi/develop
update
2 parents 756802d + 0b8a436 commit ed47326

File tree

402 files changed

+41599
-5765
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

402 files changed

+41599
-5765
lines changed

.drone.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,3 +190,27 @@ steps:
190190
- make -C ctest $COMMON_FLAGS
191191
- make -C utest $COMMON_FLAGS
192192
- make -C cpp_thread_test dgemm_tester
193+
---
194+
kind: pipeline
195+
name: arm64_gcc10
196+
197+
platform:
198+
os: linux
199+
arch: arm64
200+
201+
steps:
202+
- name: Build and Test
203+
image: ubuntu:20.04
204+
environment:
205+
CC: gcc-10
206+
FC: gfortran-10
207+
COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1'
208+
commands:
209+
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
210+
- apt-get update -y
211+
- apt-get install -y make $CC gfortran-10 perl python g++
212+
- $CC --version
213+
- make QUIET_MAKE=1 $COMMON_FLAGS
214+
- make -C utest $COMMON_FLAGS
215+
- make -C test $COMMON_FLAGS
216+

.github/workflows/nightly-Homebrew-build.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,11 @@ jobs:
4444
if: github.event_name != 'pull_request'
4545
run: brew update || true
4646

47+
- name: unlink installed gcc to allow updating
48+
run: |
49+
brew unlink gcc@8
50+
brew unlink gcc@9
51+
4752
- name: Install prerequisites
4853
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
4954

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,5 +89,7 @@ build.*
8989
*.swp
9090
benchmark/*.goto
9191
benchmark/smallscaling
92+
.vscode
9293
CMakeCache.txt
9394
CMakeFiles/*
95+
.vscode

.travis.yml

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -211,44 +211,57 @@ matrix:
211211

212212
- &test-macos
213213
os: osx
214-
osx_image: xcode10.1
214+
osx_image: xcode11.5
215215
before_script:
216216
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
217-
- brew update
218-
- brew install gcc@8 # for gfortran
219217
script:
220218
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
221219
env:
222-
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
220+
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9"
223221

224222
- <<: *test-macos
225223
osx_image: xcode12
226224
before_script:
227225
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
228226
- brew update
229-
- brew install gcc@10 # for gfortran
230227
script:
231228
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
232229
env:
233-
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
234-
230+
- BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
231+
235232
- <<: *test-macos
236-
osx_image: xcode10.0
233+
osx_image: xcode12
234+
before_script:
235+
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
236+
- brew update
237+
script:
238+
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
237239
env:
238-
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
240+
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
241+
242+
# - <<: *test-macos
243+
# osx_image: xcode10
244+
# env:
245+
# - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
239246

240247
- <<: *test-macos
241-
osx_image: xcode10.1
248+
osx_image: xcode11.5
249+
before_script:
250+
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
251+
- brew update
242252
env:
243-
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
244-
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
253+
# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
254+
# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
255+
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
256+
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0"
245257
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
246-
247258
- <<: *test-macos
248-
osx_image: xcode10.1
259+
osx_image: xcode11.5
249260
env:
250-
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
251-
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
261+
# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
262+
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
263+
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
264+
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
252265
- BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
253266

254267
- &test-graviton2

CMakeLists.txt

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,17 @@ cmake_minimum_required(VERSION 2.8.5)
66
project(OpenBLAS C ASM)
77
set(OpenBLAS_MAJOR_VERSION 0)
88
set(OpenBLAS_MINOR_VERSION 3)
9-
set(OpenBLAS_PATCH_VERSION 10.dev)
9+
set(OpenBLAS_PATCH_VERSION 14.dev)
1010
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
1111

1212
# Adhere to GNU filesystem layout conventions
1313
include(GNUInstallDirs)
1414

1515
include(CMakePackageConfigHelpers)
1616

17+
if(MSVC AND NOT DEFINED NOFORTRAN)
18+
set(NOFORTRAN ON)
19+
endif()
1720

1821
#######
1922
if(MSVC)
@@ -229,7 +232,7 @@ if (NOT NO_CBLAS)
229232
add_subdirectory(utest)
230233
endif()
231234

232-
if (NOT MSVC AND NOT NOFORTRAN)
235+
if (NOT NOFORTRAN)
233236
# Build test and ctest
234237
add_subdirectory(test)
235238
if(NOT NO_CBLAS)

CONTRIBUTORS.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,4 +190,7 @@ In chronological order:
190190
* [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support
191191

192192
* Danfeng Zhang <https://github.com/craft-zhang>
193-
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
193+
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53
194+
195+
* PingTouGe Semiconductor Co., Ltd.
196+
* [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910

Changelog.txt

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,200 @@
11
OpenBLAS ChangeLog
2+
====================================================================
3+
Version 0.3.14
4+
17-Mar-2021
5+
6+
common:
7+
* Fixed a race condition on thread shutdown in non-OpenMP builds
8+
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
9+
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
10+
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
11+
* Improved performance of OMATCOPY_RT across all platforms
12+
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
13+
* Fixed potential misreading of the GCC compiler version in the build scripts
14+
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
15+
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
16+
17+
RISCV:
18+
* Fixed compilation on RISCV (missing entry in getarch)
19+
20+
POWER:
21+
* Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions
22+
* Added support for compilation on FreeBSD/ppc64le
23+
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
24+
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
25+
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
26+
* Improved SCOPY and CCOPY performance on POWER10
27+
* Improved SGEMM and DGEMM performance on POWER10
28+
* Added support for compilation with the NVIDIA HPC compiler
29+
30+
x86_64:
31+
* Added an optimized bfloat16 GEMM kernel for Cooperlake
32+
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
33+
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
34+
* Added support for compilation with the NAG Fortran compiler
35+
* Fixed recognition of the AMD AOCC compiler
36+
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
37+
* Added support for running the BLAS/CBLAS tests on Windows
38+
* Fixed signatures of the tls callback functions for Windows x64
39+
* Fixed various issues with fma intrinsics support handling
40+
41+
ARM:
42+
* Added support for embedded Cortex M targets via a new option EMBEDDED
43+
44+
ARMV8:
45+
* Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf
46+
* Added support for the DYNAMIC_LIST option
47+
* Added support for compilation with the NVIDIA HPC compiler
48+
* Added support for compiling with the NAG Fortran compiler
49+
50+
====================================================================
51+
Version 0.3.13
52+
12-Dec-2020
53+
54+
common:
55+
* Added a generic bfloat16 SBGEMV kernel
56+
* Fixed a potentially severe memory leak after fork in OpenMP builds
57+
that was introduced in 0.3.12
58+
* Added detection of the Fujitsu Fortran compiler
59+
* Added detection of the (e)gfortran compiler on OpenBSD
60+
* Added support for overriding the default name of the library independently
61+
from symbol suffixing in the gmake builds (already supported in cmake)
62+
63+
RISCV:
64+
* Added a RISC-V port optimized for C910V
65+
66+
POWER:
67+
* Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N
68+
* Improved DGEMM performance on POWER10
69+
* Improved STRSM and DTRSM performance on POWER9 and POWER10
70+
* Fixed segmentation faults in DYNAMIC_ARCH builds
71+
* Fixed compilation with the PGI compiler
72+
73+
x86:
74+
* Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12
75+
76+
x86_64:
77+
* Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake
78+
* Improved the performance of SASUM and DASUM kernels through parallelization
79+
* Improved the performance of SROT and DROT kernels
80+
* Improved the performance of multithreaded xSYRK
81+
* Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran
82+
(where linking of both the LLVM libomp and GNU libgomp could lead to lockups or
83+
wrong results)
84+
* Fixed miscompilations by old gcc 4.6
85+
* Fixed misdetection of AVX2 capability in some Sandybridge cpus
86+
* Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD
87+
88+
ARM64:
89+
* Fixed segmentation faults in DYNAMIC_ARCH builds
90+
91+
MIPS:
92+
* Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA
93+
* Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV
94+
* Added handling of zero increments in the MSA kernels for SSWAP and DSWAP
95+
* Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only)
96+
97+
SPARC:
98+
* Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers
99+
100+
====================================================================
101+
Version 0.3.12
102+
24-Oct-2020
103+
104+
common:
105+
* Fixed missing BLAS/LAPACK functions (inadvertently dropped during
106+
the build system restructuring)
107+
* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
108+
109+
POWER:
110+
* Added optimized SCOPY/CCOPY kernels for POWER10
111+
* Increased and unified the default size of the GEMM BUFFER
112+
* Fixed building for POWER10 in DYNAMIC_ARCH mode
113+
* POWER10 compatibility test now checks binutils version as well
114+
* Cleaned up compiler warnings
115+
116+
x86_64:
117+
* Corrected compiler version checks for AVX2 compatibility
118+
* Added compiler option -mavx2 for building with flang
119+
* Fixed direct SGEMM pathway for small matrix sizes (broken by
120+
the code refactoring in 0.3.11)
121+
* Fixed unhandled partial register clobbers in several kernels
122+
for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer
123+
124+
ARMV8:
125+
* Improved Apple Vortex support to include cross-compiling
126+
127+
====================================================================
128+
Version 0.3.11
129+
17-Oct-2020
130+
131+
common:
132+
* API change:
133+
the newly added BFLOAT16 functions were renamed to use the
134+
letter "B" instead of "H" to avoid potential confusion with
135+
the IEEE "half precision float" type, i.e. the 0.3.10
136+
SHGEMM is now SBGEMM and the corresponding build option
137+
was changed from "BUILD_HALF" to "BUILD_BFLOAT16".
138+
* Reduced the default BLAS3_MEM_ALLOC_THRESHOLD (used as an upper
139+
limit for placing temporary arrays on the stack) to be compatible
140+
with a stack size of 1 MB (as imposed by the Java runtime library)
141+
* Added mixed-precision dot function SBDOT and utility functions
142+
shstobf16, shdtobf16, sbf16tos and dbf16tod to convert between
143+
single or double precision float arrays and bfloat16 arrays
144+
* Fixed prototypes of LAPACK_?ggsvp and LAPACK_?ggsvd functions
145+
in lapack.h
146+
* Fixed underflow and rounding errors in LAPACK SLANV2 and DLANV2
147+
(causing miscalculations in e.g. SHSEQR/DHSEQR, LAPACK issue #263)
148+
* Fixed workspace calculation in LAPACK ?GELQ (LAPACK issue #415)
149+
* Fixed several bugs in the LAPACK testsuite
150+
* Improved performance of TRMM and TRSM for certain problem sizes
151+
* Fixed infinite recursions and workspace miscalculations in ReLAPACK
152+
* CMAKE builds no longer require pkg-config for creating the .pc file
153+
* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
154+
enabling these options
155+
* Fixed detection of gfortran when invoked through an mpi wrapper
156+
* Improved thread reinitialization performance with OpenMP after a fork
157+
* Added support for building only the subset of the library required
158+
for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
159+
* Optional function name prefixes and suffixes are now correctly
160+
reflected in the generated cblas.h
161+
* Added CMAKE build support for the LAPACK and multithreading tests
162+
163+
POWER:
164+
* Added optimized support for POWER10
165+
* Added support for compiling for POWER8 in 32bit mode
166+
* Added support for compilation with LLVM/clang
167+
* Added support for compilation with NVIDIA/PGI compilers
168+
* Fixed building on big-endian POWER8
169+
* Fixed miscompilation of ZDOTC by gcc10
170+
* Fixed alignment errors in the POWER8 SAXPY kernel
171+
* Improved CPU detection on AIX
172+
* Supported building with older compilers on POWER9
173+
174+
x86_64:
175+
* Added support for Intel Cooperlake
176+
* Added autodetection of AMD Renoir/Matisse/Zen3 cpus
177+
* Added autodetection of Intel Comet Lake cpus
178+
* Reimplemented ?sum, ?dot and daxpy using universal intrinsics
179+
* Reset the fpu state before using the fpu on Windows as a workaround
180+
for a problem introduced in Windows 10 build 19041 (a.k.a. SDK 2004)
181+
* Fixed potentially undefined behaviour in the dot and gemv_t kernels
182+
* Fixed a potential segmentation fault in DYNAMIC_ARCH builds
183+
* Fixed building for ZEN with PGI/NVIDIA and AMD AOCC compilers
184+
185+
ARMV7:
186+
* Fixed cpu detection on BSD-like systems
187+
188+
ARMV8:
189+
* Added preliminary support for Apple Vortex cpus
190+
* Added support for the Cavium ThunderX3T110 cpu
191+
* Fixed cpu detection on BSD-like systems
192+
* Fixed compilation in -std=C18 mode
193+
194+
IBM Z:
195+
* Added support for compiling with the clang compiler
196+
* Improved GEMM performance on Z14
197+
2198
====================================================================
3199
Version 0.3.10
4200
14-Jun-2020

Makefile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ endif
5959
@$(CC) --version > /dev/null 2>&1;\
6060
if [ $$? -eq 0 ]; then \
6161
cverinfo=`$(CC) --version | sed -n '1p'`; \
62+
if [ -z "$${cverinfo}" ]; then \
63+
cverinfo=`$(CC) --version | sed -n '2p'`; \
64+
fi; \
6265
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
6366
else \
6467
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
@@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
6770
@$(FC) --version > /dev/null 2>&1;\
6871
if [ $$? -eq 0 ]; then \
6972
fverinfo=`$(FC) --version | sed -n '1p'`; \
73+
if [ -z "$${fverinfo}" ]; then \
74+
fverinfo=`$(FC) --version | sed -n '2p'`; \
75+
fi; \
7076
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
7177
else \
7278
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
@@ -268,7 +274,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
268274
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
269275
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
270276
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
277+
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
278+
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
279+
else
271280
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
281+
endif
272282
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
273283
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
274284
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc

0 commit comments

Comments
 (0)