Skip to content

Commit 0892943

Browse files
authored
Merge pull request #2886 from martin-frbg/issue_2767
Rename "HALF" precision functions (sh prefix) to "BFLOAT16" with "sb" prefix
2 parents 0c84ffe + cb83957 commit 0892943

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+665
-695
lines changed

CMakeLists.txt

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc
2929
else()
3030
set(NO_AFFINITY 1)
3131
endif()
32-
option(BUILD_SINGLE "Single precision" OFF)
33-
option(BUILD_DOUBLE "Double precision" OFF)
34-
option(BUILD_COMPLEX "Single precision" OFF)
35-
option(BUILD_COMPLEX16 "Single precision" OFF)
32+
option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF)
33+
option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
3634

3735
# Add a prefix or suffix to all exported symbol names in the shared library.
3836
# Avoids conflicts with other BLAS libraries, especially when using
@@ -91,13 +89,13 @@ if (NOT NO_LAPACK)
9189
list(APPEND SUBDIRS lapack)
9290
endif ()
9391

94-
if (NOT DEFINED BUILD_HALF)
95-
set (BUILD_HALF false)
92+
if (NOT DEFINED BUILD_BFLOAT16)
93+
set (BUILD_BFLOAT16 false)
9694
endif ()
9795
# set which float types we want to build for
9896
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
9997
# if none are defined, build for all
100-
# set(BUILD_HALF true)
98+
# set(BUILD_BFLOAT16 true)
10199
set(BUILD_SINGLE true)
102100
set(BUILD_DOUBLE true)
103101
set(BUILD_COMPLEX true)
@@ -110,33 +108,28 @@ endif()
110108

111109
set(FLOAT_TYPES "")
112110
if (BUILD_SINGLE)
113-
message(STATUS "Building Songle Precision")
114-
list(APPEND FLOAT_TYPES "SINGLE")
115-
# set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
111+
message(STATUS "Building Single Precision")
112+
list(APPEND FLOAT_TYPES "SINGLE") # defines nothing
116113
endif ()
117114

118115
if (BUILD_DOUBLE)
119116
message(STATUS "Building Double Precision")
120-
list(APPEND FLOAT_TYPES "DOUBLE")
121-
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1")
117+
list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE
122118
endif ()
123119

124120
if (BUILD_COMPLEX)
125121
message(STATUS "Building Complex Precision")
126-
list(APPEND FLOAT_TYPES "COMPLEX")
127-
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1")
128-
endif ()
122+
list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX
123+
endif ()
129124

130125
if (BUILD_COMPLEX16)
131126
message(STATUS "Building Double Complex Precision")
132-
list(APPEND FLOAT_TYPES "ZCOMPLEX")
133-
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1")
127+
list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
134128
endif ()
135129

136-
if (BUILD_HALF)
130+
if (BUILD_BFLOAT16)
137131
message(STATUS "Building Half Precision")
138-
list(APPEND FLOAT_TYPES "HALF")
139-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF")
132+
list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
140133
endif ()
141134

142135
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
@@ -243,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN)
243236
add_subdirectory(ctest)
244237
endif()
245238
add_subdirectory(lapack-netlib/TESTING)
239+
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
240+
add_subdirectory(cpp_thread_test)
241+
endif()
246242
endif()
247243

248244
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES

Makefile.rule

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -272,17 +272,33 @@ COMMON_PROF = -pg
272272
# work at all.
273273
#
274274
# CPP_THREAD_SAFETY_TEST = 1
275+
#
276+
# use this to run only the less memory-hungry GEMV test
277+
# CPP_THREAD_SAFETY_GEMV = 1
275278

276279

277280
# If you want to enable the experimental BFLOAT16 support
278-
# BUILD_HALF = 1
279-
#
280-
# Select if you need to build only select types
281-
# BUILD_SINGLE = 1
282-
# BUILD_DOUBLE = 1
283-
# BUILD_COMPLEX = 1
284-
# BUILD_COMPLEX16 = 1
285-
#
286-
#
281+
# BUILD_BFLOAT16 = 1
282+
283+
284+
# Set the thread number threshold beyond which the job array for the threaded level3 BLAS
285+
# will be allocated on the heap rather than the stack. (This array alone requires
286+
# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu
287+
# counts, but obviously it is not the only item that ends up on the stack.)
288+
# The default value of 32 ensures that the overall requirement is compatible
289+
# with the default 1MB stacksize imposed by having the Java VM loaded without use
290+
# of its -Xss parameter.
291+
# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible
292+
# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the Java
293+
# VM, e.g. in Octave or with the Java-based libhdfs in numpy or scipy code.
294+
# BLAS3_MEM_ALLOC_THRESHOLD = 160
295+
296+
297+
298+
# The settings below are not yet configurable here; use cmake if you need to build only selected types
299+
BUILD_SINGLE = 1
300+
BUILD_DOUBLE = 1
301+
BUILD_COMPLEX = 1
302+
BUILD_COMPLEX16 = 1
287303
# End of user configuration
288304
#

Makefile.system

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1232,8 +1232,8 @@ ifeq ($(USE_TLS), 1)
12321232
CCOMMON_OPT += -DUSE_TLS
12331233
endif
12341234

1235-
ifeq ($(BUILD_HALF), 1)
1236-
CCOMMON_OPT += -DBUILD_HALF
1235+
ifeq ($(BUILD_BFLOAT16), 1)
1236+
CCOMMON_OPT += -DBUILD_BFLOAT16
12371237
endif
12381238
ifeq ($(BUILD_SINGLE), 1)
12391239
CCOMMON_OPT += -DBUILD_SINGLE=1
@@ -1521,10 +1521,10 @@ export KERNELDIR
15211521
export FUNCTION_PROFILE
15221522
export TARGET_CORE
15231523
export NO_AVX512
1524-
export BUILD_HALF
1524+
export BUILD_BFLOAT16
15251525

1526-
export SHGEMM_UNROLL_M
1527-
export SHGEMM_UNROLL_N
1526+
export SBGEMM_UNROLL_M
1527+
export SBGEMM_UNROLL_N
15281528
export SGEMM_UNROLL_M
15291529
export SGEMM_UNROLL_N
15301530
export DGEMM_UNROLL_M

Makefile.tail

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
2424
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
2525
endif
2626

27-
$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX
27+
$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
2828
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
2929
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
3030
$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
3131
$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
3232
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
3333
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
34-
$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX
34+
$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
3535

3636
$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
3737
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)

benchmark/Makefile

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ else
4949
GOTO_LAPACK_TARGETS=
5050
endif
5151

52-
ifeq ($(BUILD_HALF),1)
53-
GOTO_HALF_TARGETS=shgemm.goto
52+
ifeq ($(BUILD_BFLOAT16),1)
53+
GOTO_HALF_TARGETS=sbgemm.goto
5454
else
5555
GOTO_HALF_TARGETS=
5656
endif
@@ -620,8 +620,8 @@ zcholesky.essl : zcholesky.$(SUFFIX)
620620
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
621621

622622
##################################### Sgemm ####################################################
623-
ifeq ($(BUILD_HALF),1)
624-
shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME)
623+
ifeq ($(BUILD_BFLOAT16),1)
624+
sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME)
625625
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
626626
endif
627627

@@ -2927,8 +2927,8 @@ ccholesky.$(SUFFIX) : cholesky.c
29272927
zcholesky.$(SUFFIX) : cholesky.c
29282928
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
29292929

2930-
ifeq ($(BUILD_HALF),1)
2931-
shgemm.$(SUFFIX) : gemm.c
2930+
ifeq ($(BUILD_BFLOAT16),1)
2931+
sbgemm.$(SUFFIX) : gemm.c
29322932
$(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^
29332933
endif
29342934

benchmark/gemm.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4040
#ifdef DOUBLE
4141
#define GEMM BLASFUNC(dgemm)
4242
#elif defined(HALF)
43-
#define GEMM BLASFUNC(shgemm)
43+
#define GEMM BLASFUNC(sbgemm)
4444
#else
4545
#define GEMM BLASFUNC(sgemm)
4646
#endif

cblas.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -392,7 +392,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
392392
/* convert BFLOAT16 array to double array */
393393
void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
394394
/* dot product of BFLOAT16 input arrays, with the result output as float */
395-
float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
395+
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
396396

397397
#ifdef __cplusplus
398398
}

cmake/kernel.cmake

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ macro(SetDefaultL1)
113113
set(ZSUMKERNEL zsum.S)
114114
set(QSUMKERNEL sum.S)
115115
set(XSUMKERNEL zsum.S)
116-
if (BUILD_HALF)
116+
if (BUILD_BFLOAT16)
117117
set(SHAMINKERNEL ../arm/amin.c)
118118
set(SHAMAXKERNEL ../arm/amax.c)
119119
set(SHMAXKERNEL ../arm/max.c)
@@ -126,7 +126,7 @@ if (BUILD_HALF)
126126
set(SHAXPYKERNEL ../arm/axpy.c)
127127
set(SHAXPBYKERNEL ../arm/axpby.c)
128128
set(SHCOPYKERNEL ../arm/copy.c)
129-
set(SHDOTKERNEL ../x86_64/shdot.c)
129+
set(SBDOTKERNEL ../x86_64/sbdot.c)
130130
set(SHROTKERNEL ../arm/rot.c)
131131
set(SHSCALKERNEL ../arm/scal.c)
132132
set(SHNRM2KERNEL ../arm/nrm2.c)
@@ -183,9 +183,9 @@ macro(SetDefaultL2)
183183
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
184184
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
185185
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
186-
if (BUILD_HALF)
187-
set(SHGEMVNKERNEL ../arm/gemv_n.c)
188-
set(SHGEMVTKERNEL ../arm/gemv_t.c)
186+
if (BUILD_BFLOAT16)
187+
set(SBGEMVNKERNEL ../arm/gemv_n.c)
188+
set(SBGEMVTKERNEL ../arm/gemv_t.c)
189189
set(SHGERKERNEL ../generic/ger.c)
190190
endif ()
191191
endmacro ()
@@ -195,18 +195,18 @@ macro(SetDefaultL3)
195195
set(DGEADD_KERNEL ../generic/geadd.c)
196196
set(CGEADD_KERNEL ../generic/zgeadd.c)
197197
set(ZGEADD_KERNEL ../generic/zgeadd.c)
198-
if (BUILD_HALF)
198+
if (BUILD_BFLOAT16)
199199
set(SHGEADD_KERNEL ../generic/geadd.c)
200-
set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c)
201-
set(SHGEMM_BETA ../generic/gemm_beta.c)
202-
set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c)
203-
set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c)
204-
set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c)
205-
set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c)
206-
set(SHGEMMINCOPYOBJ shgemm_incopy.o)
207-
set(SHGEMMITCOPYOBJ shgemm_itcopy.o)
208-
set(SHGEMMONCOPYOBJ shgemm_oncopy.o)
209-
set(SHGEMMOTCOPYOBJ shgemm_otcopy.o)
200+
set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
201+
set(SBGEMM_BETA ../generic/gemm_beta.c)
202+
set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
203+
set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
204+
set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
205+
set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
206+
set(SBGEMMINCOPYOBJ sbgemm_incopy.o)
207+
set(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
208+
set(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
209+
set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
210210
endif ()
211211

212212
endmacro ()

cmake/prebuild.cmake

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616
# HAVE_SSE2
1717
# HAVE_SSE3
1818
# MAKE
19-
# SHGEMM_UNROLL_M
20-
# SHGEMM_UNROLL_N
19+
# SBGEMM_UNROLL_M
20+
# SBGEMM_UNROLL_N
2121
# SGEMM_UNROLL_M
2222
# SGEMM_UNROLL_N
2323
# DGEMM_UNROLL_M
@@ -471,8 +471,8 @@ endif ()
471471
set(ZGEMM_UNROLL_N 2)
472472
set(SYMV_P 8)
473473
endif()
474-
set(SHGEMM_UNROLL_M 8)
475-
set(SHGEMM_UNROLL_N 4)
474+
set(SBGEMM_UNROLL_M 8)
475+
set(SBGEMM_UNROLL_N 4)
476476

477477
# Or should this actually be NUM_CORES?
478478
if (${NUM_THREADS} GREATER 0)

cmake/system.cmake

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,13 @@ else ()
326326
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048")
327327
endif ()
328328
endif ()
329-
329+
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
330+
if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD)
331+
if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32)
332+
set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}")
333+
endif()
334+
endif()
335+
endif()
330336
if (DEFINED LIBNAMESUFFIX)
331337
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
332338
else ()
@@ -404,20 +410,16 @@ if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_CO
404410
set (BUILD_COMPLEX16 ON)
405411
endif()
406412
if (BUILD_SINGLE)
407-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE=1")
408-
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
413+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE")
409414
endif()
410415
if (BUILD_DOUBLE)
411-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1")
412-
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
416+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE")
413417
endif()
414418
if (BUILD_COMPLEX)
415-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1")
416-
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX=1")
419+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX")
417420
endif()
418421
if (BUILD_COMPLEX16)
419-
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1")
420-
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX16=1")
422+
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
421423
endif()
422424
if(NOT MSVC)
423425
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
@@ -591,8 +593,8 @@ endif ()
591593
#export FUNCTION_PROFILE
592594
#export TARGET_CORE
593595
#
594-
#export SHGEMM_UNROLL_M
595-
#export SHGEMM_UNROLL_N
596+
#export SBGEMM_UNROLL_M
597+
#export SBGEMM_UNROLL_N
596598
#export SGEMM_UNROLL_M
597599
#export SGEMM_UNROLL_N
598600
#export DGEMM_UNROLL_M

0 commit comments

Comments
 (0)