Skip to content

Commit 05211ac

Browse files
DavidMansell authored and morgolock committed
feat: gemm: Add SME1 FP32 kernels.
Change-Id: I6ed4d04e0b3e83de85a6e1718098e06de6cb8a64 Signed-off-by: David Mansell <[email protected]>
1 parent 49751f9 commit 05211ac

File tree

11 files changed

+2432
-1
lines changed

11 files changed

+2432
-1
lines changed

Android.bp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,6 +1342,9 @@ cc_library_static {
13421342
"src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp",
13431343
"src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp",
13441344
"src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp",
1345+
"src/core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp",
1346+
"src/core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp",
1347+
"src/core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp",
13451348
"src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp",
13461349
"src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp",
13471350
"src/core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp",

filelist.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1807,6 +1807,9 @@
18071807
"src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp",
18081808
"src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp",
18091809
"src/core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp",
1810+
"src/core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp",
1811+
"src/core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp",
1812+
"src/core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp",
18101813
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp",
18111814
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6x4VL/generic.cpp",
18121815
"src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/a64fx.cpp",

src/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,9 @@ filegroup(
282282
"core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp",
283283
"core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp",
284284
"core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp",
285+
"core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp",
286+
"core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp",
287+
"core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp",
285288
"core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp",
286289
"core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp",
287290
"core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp",

src/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,9 @@ target_sources(
251251
core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_1VLx4VL/generic.cpp
252252
core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_2VLx2VL/generic.cpp
253253
core/NEON/kernels/arm_gemm/kernels/sme2_interleaved_nomerge_u8q_mopa_4VLx1VL/generic.cpp
254+
core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_1VLx4VL/generic.cpp
255+
core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_2VLx2VL/generic.cpp
256+
core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_4VLx1VL/generic.cpp
254257
core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_bf16fp32_mmla_6x4VL/generic.cpp
255258
core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/a64fx.cpp
256259
core/NEON/kernels/arm_gemm/kernels/sve_ffhybrid_fp16_mla_6x4VL/generic.cpp

src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2017-2024 Arm Limited.
2+
* Copyright (c) 2017-2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -66,6 +66,11 @@
6666
#include "kernels/sme2_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp"
6767
#include "kernels/sme2_interleaved_nomerge_bf16fp32_mopa_4VLx1VL.hpp"
6868
#endif // ARM_COMPUTE_ENABLE_SME2
69+
#ifdef ARM_COMPUTE_ENABLE_SME
70+
#include "kernels/sme_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp"
71+
#include "kernels/sme_interleaved_nomerge_fp32_mopa_2VLx2VL.hpp"
72+
#include "kernels/sme_interleaved_nomerge_fp32_mopa_4VLx1VL.hpp"
73+
#endif // ARM_COMPUTE_ENABLE_SME
6974

7075
#include "kernels/sve_ffhybrid_fp32_mla_6x4VL.hpp"
7176
#include "kernels/sve_ffhybrid_fp32bf16fp32_mmla_4x6VL.hpp"
@@ -188,6 +193,31 @@ GemmImplementation<float, float, float>::with_estimate(
188193
[](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme2_interleaved_nomerge_fp32_mopa_2VLx2VL, float, float>(args); }
189194
},
190195
#endif // ARM_COMPUTE_ENABLE_SME2
196+
#ifdef ARM_COMPUTE_ENABLE_SME
197+
{
198+
GemmMethod::GEMM_INTERLEAVED,
199+
"sme_interleaved_nomerge_fp32_mopa_1VLx4VL",
200+
[](const GemmArgs &args) { return args._ci->has_sme(); },
201+
[](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
202+
return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
203+
[](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme_interleaved_nomerge_fp32_mopa_1VLx4VL, float, float>(args); }
204+
},
205+
{
206+
GemmMethod::GEMM_INTERLEAVED,
207+
"sme_interleaved_nomerge_fp32_mopa_4VLx1VL",
208+
[](const GemmArgs &args) { return args._ci->has_sme(); },
209+
[](const GemmArgs &args) { const auto VL = sme::get_vector_length<float>();
210+
return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
211+
[](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme_interleaved_nomerge_fp32_mopa_4VLx1VL, float, float>(args); }
212+
},
213+
{
214+
GemmMethod::GEMM_INTERLEAVED,
215+
"sme_interleaved_nomerge_fp32_mopa_2VLx2VL",
216+
[](const GemmArgs &args) { return args._ci->has_sme(); },
217+
nullptr,
218+
[](const GemmArgs &args) { return new GemmInterleavedNoMerge<cls_sme_interleaved_nomerge_fp32_mopa_2VLx2VL, float, float>(args); }
219+
},
220+
#endif // ARM_COMPUTE_ENABLE_SME
191221
#ifdef ARM_COMPUTE_ENABLE_BF16
192222
GemmImplementation<float, float, float>::with_estimate(
193223
GemmMethod::GEMM_INTERLEAVED,
src/core/NEON/kernels/arm_gemm/kernels/sme_interleaved_nomerge_fp32_mopa_1VLx4VL.hpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Copyright (c) 2025 Arm Limited.
3+
*
4+
* SPDX-License-Identifier: MIT
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to
8+
* deal in the Software without restriction, including without limitation the
9+
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10+
* sell copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in all
14+
* copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
#pragma once
25+
26+
#ifdef ARM_COMPUTE_ENABLE_SME
27+
28+
29+
#include "../std_transforms_sme.hpp"
30+
31+
namespace arm_gemm
32+
{
33+
34+
// Implementations
35+
void sme_interleaved_nomerge_fp32_mopa_1VLx4VL(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
36+
37+
class cls_sme_interleaved_nomerge_fp32_mopa_1VLx4VL
38+
{
39+
public:
40+
typedef float lhs_operand_type;
41+
typedef float rhs_operand_type;
42+
typedef float result_type;
43+
44+
typedef void (*kern_type)(const float *const A, const float *const B, float *const C, int ldc, const int M, const int N, const int K, const float *const bias, const Activation act, bool accumulate, float *const accumulator_buffer);
45+
46+
/* Kernel blocking parameters */
47+
static unsigned int out_height()
48+
{
49+
return sme::get_vector_length<float>() * 1;
50+
}
51+
52+
static unsigned int out_width()
53+
{
54+
return sme::get_vector_length<float>() * 4;
55+
}
56+
57+
static constexpr unsigned int k_unroll()
58+
{
59+
return 1;
60+
}
61+
62+
static constexpr bool supports_bias()
63+
{
64+
return true;
65+
}
66+
67+
static constexpr bool is_sme()
68+
{
69+
return true;
70+
}
71+
72+
// Default to the generic kernel
73+
kern_type kernel = sme_interleaved_nomerge_fp32_mopa_1VLx4VL;
74+
75+
StdTransformsSME<lhs_operand_type, result_type, 1, 4, 1> transforms = {};
76+
77+
cls_sme_interleaved_nomerge_fp32_mopa_1VLx4VL(const CPUInfo *)
78+
{
79+
}
80+
};
81+
82+
} // namespace arm_gemm
83+
84+
#endif // ARM_COMPUTE_ENABLE_SME

0 commit comments

Comments
 (0)