Skip to content

Commit 114316f

Browse files
committed
Optimize SBGEMM / BGEMM for NEOVERSEV1 further
This changes the kernels to pack full SVE vectors and reduces the overall complexity of the inner GEMM loop.
1 parent 75c6ab4 commit 114316f

8 files changed

+1064
-14
lines changed

kernel/arm64/KERNEL.NEOVERSEV1

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,31 +34,31 @@ SGEMVTKERNEL = gemv_t_sve_v1x3.c
3434
DGEMVTKERNEL = gemv_t_sve_v1x3.c
3535
ifeq ($(BUILD_BFLOAT16), 1)
3636
BGEMM_BETA = bgemm_beta_neon.c
37-
BGEMMKERNEL = bgemm_kernel_$(BGEMM_UNROLL_M)x$(BGEMM_UNROLL_N)_neoversev1.c
37+
BGEMMKERNEL = bgemm_kernel_2vlx4_neoversev1.c
3838
ifneq ($(BGEMM_UNROLL_M), $(BGEMM_UNROLL_N))
39-
BGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c
40-
BGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c
39+
BGEMMINCOPY = bgemm_ncopy_2vl_neoversev1.c
40+
BGEMMITCOPY = bgemm_tcopy_2vl_neoversev1.c
4141
BGEMMINCOPYOBJ = bgemm_incopy$(TSUFFIX).$(SUFFIX)
4242
BGEMMITCOPYOBJ = bgemm_itcopy$(TSUFFIX).$(SUFFIX)
4343
endif
44-
BGEMMONCOPY = sbgemm_ncopy_$(BGEMM_UNROLL_N)_neoversev1.c
45-
BGEMMOTCOPY = sbgemm_tcopy_$(BGEMM_UNROLL_N)_neoversev1.c
44+
BGEMMONCOPY = bgemm_ncopy_4_neoversev1.c
45+
BGEMMOTCOPY = bgemm_tcopy_4_neoversev1.c
4646
BGEMMONCOPYOBJ = bgemm_oncopy$(TSUFFIX).$(SUFFIX)
4747
BGEMMOTCOPYOBJ = bgemm_otcopy$(TSUFFIX).$(SUFFIX)
4848

4949
BGEMVTKERNEL = sbgemv_t_bfdot.c
5050
BGEMVNKERNEL = bgemv_n_sve_v3x4.c
5151

5252
SBGEMM_BETA = sbgemm_beta_neoversev1.c
53-
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversev1.c
53+
SBGEMMKERNEL = bgemm_kernel_2vlx4_neoversev1.c
5454
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
55-
SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_M)_neoversev1.c
56-
SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversev1.c
55+
SBGEMMINCOPY = bgemm_ncopy_2vl_neoversev1.c
56+
SBGEMMITCOPY = bgemm_tcopy_2vl_neoversev1.c
5757
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
5858
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
5959
endif
60-
SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversev1.c
61-
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_N)_neoversev1.c
60+
SBGEMMONCOPY = bgemm_ncopy_4_neoversev1.c
61+
SBGEMMOTCOPY = bgemm_tcopy_4_neoversev1.c
6262
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
6363
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
6464

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/***************************************************************************
2+
* Copyright (c) 2025, The OpenBLAS Project
3+
* All rights reserved.
4+
* Redistribution and use in source and binary forms, with or without
5+
* modification, are permitted provided that the following conditions are
6+
* met:
7+
* 1. Redistributions of source code must retain the above copyright
8+
* notice, this list of conditions and the following disclaimer.
9+
* 2. Redistributions in binary form must reproduce the above copyright
10+
* notice, this list of conditions and the following disclaimer in
11+
* the documentation and/or other materials provided with the
12+
* distribution.
13+
* 3. Neither the name of the OpenBLAS project nor the names of
14+
* its contributors may be used to endorse or promote products
15+
* derived from this software without specific prior written permission.
16+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19+
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20+
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21+
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22+
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23+
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24+
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26+
* POSSIBILITY OF SUCH DAMAGE.
27+
* *****************************************************************************/
28+
29+
#include <arm_sve.h>
30+
#include <arm_neon.h>
31+
32+
#include "common.h"
33+
34+
#define ALPHA_ONE
35+
#include "bgemm_kernel_2vlx4_neoversev1_impl.c"
36+
#undef ALPHA_ONE
37+
#undef UPDATE_C
38+
#undef UPDATE_C2
39+
#undef UPDATE_C1
40+
#include "bgemm_kernel_2vlx4_neoversev1_impl.c"
41+
42+
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B,
43+
FLOAT *C, BLASLONG ldc) {
44+
#ifdef BGEMM
45+
bfloat16_t alpha_bf16;
46+
memcpy(&alpha_bf16, &alpha, sizeof(bfloat16_t));
47+
float alpha_f32 = vcvtah_f32_bf16(alpha_bf16);
48+
#else
49+
float alpha_f32 = alpha;
50+
#endif
51+
52+
if (alpha_f32 == 1.0f)
53+
return bgemm_kernel_neoversev1_alpha_one(m, n, k, alpha_f32, A, B, C, ldc);
54+
else
55+
return bgemm_kernel_neoversev1_alpha(m, n, k, alpha_f32, A, B, C, ldc);
56+
return 0;
57+
}

0 commit comments

Comments
 (0)