Skip to content

Commit f971ef5

Browse files
committed
Add ARMV8SVE to AArch64 Dynamic Dispatch
In order to enable support for future cores which have similar tunings (in this case I'm doing this for the Arm(R) Neoverse(TM) V2 core), this generically detects SVE support and enables it. This should better manage the size and complexity of dynamic dispatch rather than just copy-pasting the same parameters. To make `ARMV8SVE` more representative of the common 128-bit SVE case, I've split it (and the other targets that share its parameters) from A64FX, which has the wider 512-bit SVE.
1 parent 7976def commit f971ef5

File tree

4 files changed

+59
-4
lines changed

4 files changed

+59
-4
lines changed

Makefile.system

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1
668668
ifneq ($(NO_SVE), 1)
669669
DYNAMIC_CORE += NEOVERSEV1
670670
DYNAMIC_CORE += NEOVERSEN2
671+
DYNAMIC_CORE += ARMV8SVE
671672
endif
672673
DYNAMIC_CORE += CORTEXA55
673674
DYNAMIC_CORE += FALKOR

cmake/arch.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
4646
if (ARM64)
4747
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
4848
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
49-
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2)
49+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
5050
endif ()
5151
if (DYNAMIC_LIST)
5252
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

driver/others/dynamic_arm64.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*********************************************************************/
22
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* Copyright 2023 The OpenBLAS Project */
34
/* All rights reserved. */
45
/* */
56
/* Redistribution and use in source and binary forms, with or */
@@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
109110
#else
110111
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
111112
#endif
113+
#ifdef DYN_ARMV8SVE
114+
extern gotoblas_t gotoblas_ARMV8SVE;
115+
#else
116+
#define gotoblas_ARMV8SVE gotoblas_ARMV8
117+
#endif
112118
#ifdef DYN_CORTEX_A55
113119
extern gotoblas_t gotoblas_CORTEXA55;
114120
#else
@@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
128134
#ifndef NO_SVE
129135
extern gotoblas_t gotoblas_NEOVERSEV1;
130136
extern gotoblas_t gotoblas_NEOVERSEN2;
137+
extern gotoblas_t gotoblas_ARMV8SVE;
131138
#else
132139
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
133140
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
141+
#define gotoblas_ARMV8SVE gotoblas_ARMV8
134142
#endif
135143
extern gotoblas_t gotoblas_THUNDERX3T110;
136144
extern gotoblas_t gotoblas_CORTEXA55;
@@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg);
140148
#define FALLBACK_VERBOSE 1
141149
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
142150

143-
#define NUM_CORETYPES 13
151+
#define NUM_CORETYPES 16
144152

145153
/*
146154
* In case asm/hwcap.h is outdated on the build system, make sure
@@ -173,6 +181,7 @@ static char *corename[] = {
173181
"neoversen2",
174182
"thunderx3t110",
175183
"cortexa55",
184+
"armv8sve",
176185
"unknown"
177186
};
178187

@@ -192,6 +201,7 @@ char *gotoblas_corename(void) {
192201
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
193202
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
194203
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
204+
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
195205
return corename[NUM_CORETYPES];
196206
}
197207

@@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
226236
case 12: return (&gotoblas_NEOVERSEN2);
227237
case 13: return (&gotoblas_THUNDERX3T110);
228238
case 14: return (&gotoblas_CORTEXA55);
239+
case 15: return (&gotoblas_ARMV8SVE);
229240
}
230241
snprintf(message, 128, "Core not found: %s\n", coretype);
231242
openblas_warning(1, message);
@@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
345356
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
346357
openblas_warning(1, coremsg);
347358
}
359+
#ifndef NO_SVE
360+
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
361+
return &gotoblas_ARMV8SVE;
362+
}
363+
#endif
364+
348365
return NULL;
349366
#endif
350367
}

param.h

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
33713371
#define CGEMM_DEFAULT_R 4096
33723372
#define ZGEMM_DEFAULT_R 4096
33733373

3374-
#elif defined(NEOVERSEV1)
3374+
#elif defined(NEOVERSEV1) // 256-bit SVE
33753375

33763376
#if defined(XDOUBLE) || defined(DOUBLE)
33773377
#define SWITCH_RATIO 8
@@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
34493449
#define CGEMM_DEFAULT_R 4096
34503450
#define ZGEMM_DEFAULT_R 4096
34513451

3452-
#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2)
3452+
#elif defined(A64FX) // 512-bit SVE
34533453

34543454
/* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
34553455
Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */
@@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
34903490
#define CGEMM_DEFAULT_R 4096
34913491
#define ZGEMM_DEFAULT_R 4096
34923492

3493+
#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE
3494+
3495+
#if defined(XDOUBLE) || defined(DOUBLE)
3496+
#define SWITCH_RATIO 8
3497+
#else
3498+
#define SWITCH_RATIO 16
3499+
#endif
3500+
3501+
#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept seperate to keep copies seperate
3502+
#define SGEMM_DEFAULT_UNROLL_N 8
3503+
3504+
#define DGEMM_DEFAULT_UNROLL_M 4
3505+
#define DGEMM_DEFAULT_UNROLL_N 8
3506+
3507+
#define CGEMM_DEFAULT_UNROLL_M 2
3508+
#define CGEMM_DEFAULT_UNROLL_N 4
3509+
#define CGEMM_DEFAULT_UNROLL_MN 16
3510+
3511+
#define ZGEMM_DEFAULT_UNROLL_M 2
3512+
#define ZGEMM_DEFAULT_UNROLL_N 4
3513+
#define ZGEMM_DEFAULT_UNROLL_MN 16
3514+
3515+
#define SGEMM_DEFAULT_P 128
3516+
#define DGEMM_DEFAULT_P 160
3517+
#define CGEMM_DEFAULT_P 128
3518+
#define ZGEMM_DEFAULT_P 128
3519+
3520+
#define SGEMM_DEFAULT_Q 352
3521+
#define DGEMM_DEFAULT_Q 128
3522+
#define CGEMM_DEFAULT_Q 224
3523+
#define ZGEMM_DEFAULT_Q 112
3524+
3525+
#define SGEMM_DEFAULT_R 4096
3526+
#define DGEMM_DEFAULT_R 4096
3527+
#define CGEMM_DEFAULT_R 4096
3528+
#define ZGEMM_DEFAULT_R 4096
3529+
34933530
#else /* Other/undetected ARMv8 cores */
34943531

34953532
#define SGEMM_DEFAULT_UNROLL_M 16

0 commit comments

Comments
 (0)