Skip to content

Commit 5f8f058

Browse files
authored
Merge branch 'develop' into fc-1847
2 parents 40cce0e + 974a6a3 commit 5f8f058

28 files changed

+1046
-715
lines changed

Makefile.install

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ ifndef NO_CBLAS
4848
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
4949
endif
5050

51+
ifneq ($(OSNAME), AIX)
5152
ifndef NO_LAPACKE
5253
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
5354
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@@ -72,6 +73,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
7273
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
7374
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
7475
endif
76+
7577
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
7678
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
7779
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@@ -93,6 +95,33 @@ ifeq ($(OSNAME), CYGWIN_NT)
9395
endif
9496
endif
9597

98+
else
99+
# AIX's install uses a different option syntax, so installbsd is used instead
100+
ifndef NO_LAPACKE
101+
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
102+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
103+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
104+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
105+
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
106+
endif
107+
108+
# Install the static library
109+
ifndef NO_STATIC
110+
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
111+
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
112+
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
113+
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
114+
endif
115+
# Install the shared library
116+
ifndef NO_SHARED
117+
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
118+
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
119+
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
120+
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
121+
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
122+
endif
123+
124+
endif
96125

97126
#Generating openblas.pc
98127
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"

Makefile.system

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,13 @@ CCOMMON_OPT += $(XCCOMMON_OPT)
510510
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
511511
endif
512512

513+
ifeq ($(ARCH), arm64)
514+
DYNAMIC_CORE = ARMV8
515+
DYNAMIC_CORE += CORTEXA57
516+
DYNAMIC_CORE += THUNDERX
517+
DYNAMIC_CORE += THUNDERX2T99
518+
endif
519+
513520
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
514521
ifndef DYNAMIC_CORE
515522
override DYNAMIC_ARCH=

common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ extern "C" {
183183

184184
#define ALLOCA_ALIGN 63UL
185185

186-
#define NUM_BUFFERS MAX(64,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
186+
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
187187

188188
#ifdef NEEDBUNDERSCORE
189189
#define BLASFUNC(FUNC) FUNC##_

cpuid_arm64.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,6 @@ void get_cpuconfig(void)
237237
break;
238238

239239
case CPU_THUNDERX:
240-
printf("#define ARMV8\n");
241240
printf("#define THUNDERX\n");
242241
printf("#define L1_DATA_SIZE 32768\n");
243242
printf("#define L1_DATA_LINESIZE 128\n");

cpuid_x86.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2009,6 +2009,8 @@ int get_coretype(void){
20092009
switch (model) {
20102010
case 1:
20112011
// AMD Ryzen
2012+
case 8:
2013+
// Ryzen 2
20122014
if(support_avx())
20132015
#ifndef NO_AVX2
20142016
return CORE_ZEN;

driver/level3/level3_thread.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,10 @@
4848
#define SWITCH_RATIO 2
4949
#endif
5050

51+
#ifndef GEMM_PREFERED_SIZE
52+
#define GEMM_PREFERED_SIZE 1
53+
#endif
54+
5155
//The array of job_t may overflow the stack.
5256
//Instead, use malloc to alloc job_t.
5357
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -510,6 +514,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
510514
return 0;
511515
}
512516

517+
/*
 * Round a proposed partition width up to a multiple of a preferred size.
 *
 *   remainder  amount of work (rows or columns) not yet assigned
 *   width      width proposed for the next partition
 *   multiple   granularity the width should be rounded up to
 *
 * Returns `width` unchanged when fewer than `multiple` units remain, when
 * `width` does not exceed `multiple`, or when `multiple` is not positive.
 * Otherwise returns the smallest multiple of `multiple` that is >= `width`.
 */
static int round_up(int remainder, int width, int multiple)
{
  /* A non-positive granularity is meaningless and would divide by zero. */
  if (multiple <= 0)
    return width;

  /* Too little work left, or the slice is already within one unit. */
  if (multiple > remainder || width <= multiple)
    return width;

  return ((width + multiple - 1) / multiple) * multiple;
}
525+
526+
513527
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
514528
*range_n, FLOAT *sa, FLOAT *sb,
515529
BLASLONG nthreads_m, BLASLONG nthreads_n) {
@@ -601,9 +615,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
601615
num_parts = 0;
602616
while (m > 0){
603617
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
618+
619+
width = round_up(m, width, GEMM_PREFERED_SIZE);
620+
604621
m -= width;
622+
605623
if (m < 0) width = width + m;
606624
range_M[num_parts + 1] = range_M[num_parts] + width;
625+
607626
num_parts ++;
608627
}
609628
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@@ -645,9 +664,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
645664
if (width < SWITCH_RATIO) {
646665
width = SWITCH_RATIO;
647666
}
667+
width = round_up(n, width, GEMM_PREFERED_SIZE);
668+
648669
n -= width;
649670
if (n < 0) width = width + n;
650671
range_N[num_parts + 1] = range_N[num_parts] + width;
672+
651673
num_parts ++;
652674
}
653675
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {

driver/others/Makefile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,11 @@ endif
1515
# COMMONOBJS += info.$(SUFFIX)
1616

1717
ifeq ($(DYNAMIC_ARCH), 1)
18+
ifeq ($(ARCH),arm64)
19+
COMMONOBJS += dynamic_arm64.$(SUFFIX)
20+
else
1821
COMMONOBJS += dynamic.$(SUFFIX)
22+
endif
1923
else
2024
COMMONOBJS += parameter.$(SUFFIX)
2125
endif
@@ -71,7 +75,11 @@ BLAS_SERVER = blas_server.c
7175
endif
7276

7377
ifeq ($(DYNAMIC_ARCH), 1)
78+
ifeq ($(ARCH),arm64)
79+
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
80+
else
7481
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
82+
endif
7583
else
7684
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
7785
endif

driver/others/blas_server.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
850850

851851
long i;
852852

853+
#ifdef SMP_SERVER
854+
// Handle lazy re-init of the thread-pool after a POSIX fork
855+
if (unlikely(blas_server_avail == 0)) blas_thread_init();
856+
#endif
857+
853858
if (num_threads < 1) num_threads = blas_num_threads;
854859

855860
#ifndef NO_AFFINITY

driver/others/blas_server_win32.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
478478

479479
void goto_set_num_threads(int num_threads)
480480
{
481-
long i;
481+
long i;
482+
483+
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
484+
// Handle lazy re-init of the thread-pool after a POSIX fork
485+
if (unlikely(blas_server_avail == 0)) blas_thread_init();
486+
#endif
482487

483488
if (num_threads < 1) num_threads = blas_cpu_number;
484489

driver/others/dynamic_arm64.c

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
/*********************************************************************/
2+
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* All rights reserved. */
4+
/* */
5+
/* Redistribution and use in source and binary forms, with or */
6+
/* without modification, are permitted provided that the following */
7+
/* conditions are met: */
8+
/* */
9+
/* 1. Redistributions of source code must retain the above */
10+
/* copyright notice, this list of conditions and the following */
11+
/* disclaimer. */
12+
/* */
13+
/* 2. Redistributions in binary form must reproduce the above */
14+
/* copyright notice, this list of conditions and the following */
15+
/* disclaimer in the documentation and/or other materials */
16+
/* provided with the distribution. */
17+
/* */
18+
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19+
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20+
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21+
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22+
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23+
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24+
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25+
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26+
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27+
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28+
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29+
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30+
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31+
/* POSSIBILITY OF SUCH DAMAGE. */
32+
/* */
33+
/* The views and conclusions contained in the software and */
34+
/* documentation are those of the authors and should not be */
35+
/* interpreted as representing official policies, either expressed */
36+
/* or implied, of The University of Texas at Austin. */
37+
/*********************************************************************/
38+
39+
#include "common.h"
40+
#include <asm/hwcap.h>
41+
#include <sys/auxv.h>
42+
43+
/* Kernel tables for each core type this file can select; defined elsewhere. */
extern gotoblas_t gotoblas_ARMV8;
extern gotoblas_t gotoblas_CORTEXA57;
extern gotoblas_t gotoblas_THUNDERX;
extern gotoblas_t gotoblas_THUNDERX2T99;

extern void openblas_warning(int verbose, const char * msg);

/* Number of selectable core types; also the index of "unknown" in corename[]. */
#define NUM_CORETYPES 4

/*
 * asm/hwcap.h on the build system may be too old to provide HWCAP_CPUID,
 * so supply the definition when it is missing.
 */
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif

/* Read the system register named `id` into `var` using an MRS instruction. */
#define get_cpu_ftr(id, var) ({ \
    asm("mrs %0, "#id : "=r" (var)); \
  })

/*
 * Core names in the order used by gotoblas_corename() and force_coretype(),
 * followed by the "unknown" fallback at index NUM_CORETYPES.
 */
static char *corename[] = {
  "armv8",
  "cortexa57",
  "thunderx",
  "thunderx2t99",
  "unknown"
};
71+
72+
char *gotoblas_corename(void) {
73+
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
74+
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
75+
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
76+
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
77+
return corename[NUM_CORETYPES];
78+
}
79+
80+
static gotoblas_t *force_coretype(char *coretype) {
81+
int i ;
82+
int found = -1;
83+
char message[128];
84+
85+
for ( i=0 ; i < NUM_CORETYPES; i++)
86+
{
87+
if (!strncasecmp(coretype, corename[i], 20))
88+
{
89+
found = i;
90+
break;
91+
}
92+
}
93+
94+
switch (found)
95+
{
96+
case 0: return (&gotoblas_ARMV8);
97+
case 1: return (&gotoblas_CORTEXA57);
98+
case 2: return (&gotoblas_THUNDERX);
99+
case 3: return (&gotoblas_THUNDERX2T99);
100+
}
101+
snprintf(message, 128, "Core not found: %s\n", coretype);
102+
openblas_warning(1, message);
103+
return NULL;
104+
}
105+
106+
static gotoblas_t *get_coretype(void) {
107+
int implementer, variant, part, arch, revision, midr_el1;
108+
109+
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
110+
char coremsg[128];
111+
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
112+
openblas_warning(1, coremsg);
113+
return NULL;
114+
}
115+
116+
get_cpu_ftr(MIDR_EL1, midr_el1);
117+
/*
118+
* MIDR_EL1
119+
*
120+
* 31 24 23 20 19 16 15 4 3 0
121+
* -----------------------------------------------------------------
122+
* | Implementer | Variant | Architecture | Part Number | Revision |
123+
* -----------------------------------------------------------------
124+
*/
125+
implementer = (midr_el1 >> 24) & 0xFF;
126+
part = (midr_el1 >> 4) & 0xFFF;
127+
128+
switch(implementer)
129+
{
130+
case 0x41: // ARM
131+
switch (part)
132+
{
133+
case 0xd07: // Cortex A57
134+
case 0xd08: // Cortex A72
135+
case 0xd03: // Cortex A53
136+
return &gotoblas_CORTEXA57;
137+
}
138+
break;
139+
case 0x42: // Broadcom
140+
switch (part)
141+
{
142+
case 0x516: // Vulcan
143+
return &gotoblas_THUNDERX2T99;
144+
}
145+
break;
146+
case 0x43: // Cavium
147+
switch (part)
148+
{
149+
case 0x0a1: // ThunderX
150+
return &gotoblas_THUNDERX;
151+
case 0x0af: // ThunderX2
152+
return &gotoblas_THUNDERX2T99;
153+
}
154+
break;
155+
}
156+
return NULL;
157+
}
158+
159+
void gotoblas_dynamic_init(void) {
160+
161+
char coremsg[128];
162+
char coren[22];
163+
char *p;
164+
165+
if (gotoblas) return;
166+
167+
p = getenv("OPENBLAS_CORETYPE");
168+
if ( p )
169+
{
170+
gotoblas = force_coretype(p);
171+
}
172+
else
173+
{
174+
gotoblas = get_coretype();
175+
}
176+
177+
if (gotoblas == NULL)
178+
{
179+
snprintf(coremsg, 128, "Falling back to generic ARMV8 core\n");
180+
openblas_warning(1, coremsg);
181+
gotoblas = &gotoblas_ARMV8;
182+
}
183+
184+
if (gotoblas && gotoblas->init) {
185+
strncpy(coren, gotoblas_corename(), 20);
186+
sprintf(coremsg, "Core: %s\n", coren);
187+
openblas_warning(2, coremsg);
188+
gotoblas -> init();
189+
} else {
190+
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
191+
exit(1);
192+
}
193+
194+
}
195+
196+
/*
 * Clear the active kernel table pointer, so that a later call to
 * gotoblas_dynamic_init() performs core selection again.
 */
void gotoblas_dynamic_quit(void)
{
  gotoblas = NULL;
}

0 commit comments

Comments
 (0)