Skip to content

Commit 1a7b8c6

Browse files
author
Chip Kerchner
committed
Merge branch 'develop' into betterPowerGEMVTail
2 parents e2334d0 + 9afd0c8 commit 1a7b8c6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+7490
-1340
lines changed

.github/workflows/riscv64_vector.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ jobs:
2828
- target: RISCV64_ZVL256B
2929
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
3030
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
31+
- target: DYNAMIC_ARCH=1
32+
opts: TARGET=RISCV64_GENERIC BINARY=64 ARCH=riscv64 DYNAMIC_ARCH=1
33+
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
3134

3235
steps:
3336
- name: Checkout repository

Jenkinsfile.pwr

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
pipeline {
22
agent {
33
docker {
4-
image 'osuosl/ubuntu-ppc64le'
4+
image 'osuosl/ubuntu-ppc64le:18.04'
55
}
66
}
77
stages {

Makefile.arm64

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,12 +276,19 @@ endif
276276
endif
277277
endif
278278

279-
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
280279
ifeq ($(CORE), A64FX)
280+
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
281+
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ3) $(GCCVERSIONGTEQ11) $(ISCLANG)))
281282
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
282283
ifneq ($(F_COMPILER), NAG)
283284
FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
284285
endif
286+
else
287+
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-n1
288+
ifneq ($(F_COMPILER), NAG)
289+
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-n1
290+
endif
291+
endif
285292
endif
286293
endif
287294

Makefile.system

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,13 +268,24 @@ SMALL_MATRIX_OPT = 1
268268
else ifeq ($(ARCH), power)
269269
SMALL_MATRIX_OPT = 1
270270
BUILD_BFLOAT16 = 1
271+
else ifeq ($(ARCH), arm64)
272+
SMALL_MATRIX_OPT = 1
271273
endif
272274
ifeq ($(ARCH), loongarch64)
273275
SMALL_MATRIX_OPT = 1
274276
endif
277+
ifeq ($(ARCH), arm64)
278+
GEMM_GEMV_FORWARD = 1
279+
endif
280+
275281
ifeq ($(SMALL_MATRIX_OPT), 1)
276282
CCOMMON_OPT += -DSMALL_MATRIX_OPT
277283
endif
284+
ifeq ($(GEMM_GEMV_FORWARD), 1)
285+
ifneq ($(ONLY_CBLAS), 1)
286+
CCOMMON_OPT += -DGEMM_GEMV_FORWARD
287+
endif
288+
endif
278289

279290
# This operation is expensive, so execution should be once.
280291
ifndef GOTOBLAS_MAKEFILE
@@ -689,6 +700,7 @@ ifneq ($(NO_SVE), 1)
689700
DYNAMIC_CORE += NEOVERSEV1
690701
DYNAMIC_CORE += NEOVERSEN2
691702
DYNAMIC_CORE += ARMV8SVE
703+
DYNAMIC_CORE += A64FX
692704
endif
693705
DYNAMIC_CORE += THUNDERX
694706
DYNAMIC_CORE += THUNDERX2T99
@@ -715,6 +727,17 @@ ifeq ($(ARCH), loongarch64)
715727
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
716728
endif
717729

730+
ifeq ($(ARCH), riscv64)
731+
DYNAMIC_CORE = RISCV64_GENERIC
732+
DYNAMIC_CORE += RISCV64_ZVL128B
733+
DYNAMIC_CORE += RISCV64_ZVL256B
734+
ifdef DYNAMIC_LIST
735+
override DYNAMIC_CORE = RISCV64_GENERIC $(DYNAMIC_LIST)
736+
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_RISCV64_GENERIC
737+
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
738+
endif
739+
endif
740+
718741
ifeq ($(ARCH), zarch)
719742
DYNAMIC_CORE = ZARCH_GENERIC
720743

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,8 @@ For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additi
234234

235235
On **ZARCH**, it comprises Z13 and Z14 as well as generic zarch support.
236236

237+
On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support. A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled.
238+
237239
The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
238240
common code in the library; usually you will want to set this to the oldest model you expect to encounter.
239241
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.

benchmark/pybench/benchmarks/bench_blas.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -234,14 +234,10 @@ def test_gesdd(benchmark, mn, variant):
234234
gesdd = ow.get_func('gesdd', variant)
235235
u, s, vt, info = benchmark(run_gesdd, a, lwork, gesdd)
236236

237-
if variant != 's':
238-
# On entry to SLASCL parameter number 4 had an illegal value
239-
# under codspeed (cannot repro locally or on CI w/o codspeed)
240-
# https://github.com/OpenMathLib/OpenBLAS/issues/4776
241-
assert info == 0
242-
243-
atol = {'s': 1e-5, 'd': 1e-13}
244-
np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=atol[variant])
237+
assert info == 0
238+
239+
atol = {'s': 1e-5, 'd': 1e-13}
240+
np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=atol[variant])
245241

246242

247243
# linalg.eigh

c_check

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,9 @@ if [ "$compiler" = "GCC" ]; then
356356
no_avx2=0
357357
oldgcc=0
358358
data=`$compiler_name -dumpversion`
359+
case "$data" in *-*)
360+
data="${data%-*}"
361+
esac
359362
case "$data" in *.*.*)
360363
data="${data%.*}"
361364
esac

cmake/arch.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
4646
if (ARM64)
4747
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
4848
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
49-
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
49+
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
5050
endif ()
5151
if (DYNAMIC_LIST)
5252
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})

cmake/prebuild.cmake

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1218,6 +1218,37 @@ endif ()
12181218
set(ZGEMM_UNROLL_M 4)
12191219
set(ZGEMM_UNROLL_N 4)
12201220
set(SYMV_P 16)
1221+
elseif ("${TCORE}" STREQUAL "A64FX")
1222+
file(APPEND ${TARGET_CONF_TEMP}
1223+
"#define L1_CODE_SIZE\t65536\n"
1224+
"#define L1_CODE_LINESIZE\t256\n"
1225+
"#define L1_CODE_ASSOCIATIVE\t8\n"
1226+
"#define L1_DATA_SIZE\t32768\n"
1227+
"#define L1_DATA_LINESIZE\t256\n"
1228+
"#define L1_DATA_ASSOCIATIVE\t8\n"
1229+
"#define L2_SIZE\t8388608\n\n"
1230+
"#define L2_LINESIZE\t256\n"
1231+
"#define L2_ASSOCIATIVE\t8\n"
1232+
"#define L3_SIZE\t0\n\n"
1233+
"#define L3_LINESIZE\t0\n\n"
1234+
"#define L3_ASSOCIATIVE\t0\n\n"
1235+
"#define DTB_DEFAULT_ENTRIES\t64\n"
1236+
"#define DTB_SIZE\t4096\n"
1237+
"#define HAVE_VFPV4\n"
1238+
"#define HAVE_VFPV3\n"
1239+
"#define HAVE_VFP\n"
1240+
"#define HAVE_NEON\n"
1241+
"#define HAVE_SVE\n"
1242+
"#define ARMV8\n")
1243+
set(SGEMM_UNROLL_M 4)
1244+
set(SGEMM_UNROLL_N 8)
1245+
set(DGEMM_UNROLL_M 2)
1246+
set(DGEMM_UNROLL_N 8)
1247+
set(CGEMM_UNROLL_M 2)
1248+
set(CGEMM_UNROLL_N 4)
1249+
set(ZGEMM_UNROLL_M 2)
1250+
set(ZGEMM_UNROLL_N 4)
1251+
set(SYMV_P 16)
12211252
elseif ("${TCORE}" STREQUAL "P5600")
12221253
file(APPEND ${TARGET_CONF_TEMP}
12231254
"#define L2_SIZE 1048576\n"

cmake/system.cmake

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,18 @@ if (${TARGET} STREQUAL NEOVERSEV1)
310310
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
311311
endif()
312312
endif()
313+
if (${TARGET} STREQUAL A64FX)
314+
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
315+
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")
316+
else ()
317+
# Ask the C compiler for its version; strip the trailing newline so the value compares and prints cleanly.
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
318+
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
319+
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve -mtune=a64fx")
320+
else ()
321+
# Abort configuration, reporting the compiler path and the version it returned.
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support A64FX.")
322+
endif()
323+
endif()
324+
endif()
313325

314326
endif()
315327

@@ -379,6 +391,13 @@ endif ()
379391
if (X86_64 OR ${CORE} STREQUAL POWER10)
380392
set(SMALL_MATRIX_OPT TRUE)
381393
endif ()
394+
if (ARM64)
395+
set(GEMM_GEMV_FORWARD TRUE)
396+
endif ()
397+
398+
if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS)
399+
set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD")
400+
endif ()
382401
if (SMALL_MATRIX_OPT)
383402
set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")
384403
endif ()

0 commit comments

Comments
 (0)