Skip to content

Commit ff37b12

Browse files
committed
feat: neon task 5 (accu block shape)
1 parent b1d5f29 commit ff37b12

File tree

9 files changed

+635
-12
lines changed

9 files changed

+635
-12
lines changed

submissions/submission_25_05_08/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,14 +53,18 @@ FetchContent_MakeAvailable(google_benchmark)
5353
set(SRC_FILES
5454
neon_4_1.s
5555
neon_4_2.s
56+
neon_5_1.s
57+
neon_5_1-base-line.s
5658
)
5759

5860
set(TEST_FILES
5961
neon_4.test.cpp
62+
neon_5.test.cpp
6063
)
6164

6265
set(BENCH_FILES
6366
neon_4.bench.cpp
67+
neon_5.bench.cpp
6468
)
6569

6670
# add_executable(loops "${SRC_FILES}")

submissions/submission_25_05_08/neon_4.bench.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,27 +25,27 @@ class GemmMxNxKFixture : public benchmark::Fixture
2525
}
2626
};
2727

28-
BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 16, 6, 64)(benchmark::State &state)
28+
BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_14_6_64, 14, 6, 64)(benchmark::State &state)
2929
{
3030
for (auto _ : state)
3131
{
3232
matmul_14_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16); // NOTE(review): lda = ldc = 16, but the fixture is now instantiated with M = 14. The fixture is not visible in this hunk; if it sizes matrix_a as M*K and matrix_c as M*N, these strides run past both buffers - confirm, and pass 14 for lda/ldc if so.
3333
}
3434

35-
flops = (4 * 6 * 4 * 2) * 64; // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop
35+
flops = (14 *6 * 64) * 2; // M * N * K multiply-add steps per kernel call, counted as 2 FLOPs each (multiply + add)
3636
flops *= state.iterations();
3737
};
3838

39-
BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_15_6_64)->MinWarmUpTime(1.0); // WarmUp in seconds
39+
BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_14_6_64)->MinWarmUpTime(1.0); // WarmUp in seconds
4040

41-
BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 16, 6, 64)(benchmark::State &state)
41+
BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 15, 6, 64)(benchmark::State &state)
4242
{
4343
for (auto _ : state)
4444
{
45-
matmul_15_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);
45+
matmul_15_6_64(matrix_a, matrix_b, matrix_c, 15, 64, 16); // NOTE(review): lda was changed to 15 but ldc is still 16. The fixture is not visible in this hunk; if it sizes matrix_c as M*N = 15*6, the last column is addressed past the buffer - confirm whether ldc should be 15.
4646
}
4747

48-
flops = (4 * 6 * 4 * 2) * 64; // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop
48+
flops = (15 *6 * 64) * 2; // M * N * K multiply-add steps per kernel call, counted as 2 FLOPs each (multiply + add)
4949
flops *= state.iterations();
5050
};
5151

submissions/submission_25_05_08/neon_4_1.s

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
// using the neon_2_unrolled as base kernel as it is the fast based on benchmarks
2-
31
/**
42
* @param x0 = a pointer to column-major 14x64 matrix A.
53
* @param x1 = b pointer to column-major 64x6 matrix B.
64
* @param x2 = c pointer to column-major 14x6 matrix C.
75
* @param x3 = lda leading dimension of A.
86
* @param x4 = ldb leading dimension of B.
97
* @param x5 = ldc leading dimension of C.
10-
**/
8+
**/
119
.text
1210
.type matmul_14_6_64, %function
1311
.global matmul_14_6_64

submissions/submission_25_05_08/neon_4_2.s

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
// using the neon_2_unrolled as base kernel as it is the fast based on benchmarks
2-
31
/**
42
* @param x0 = a pointer to column-major 15x64 matrix A.
53
* @param x1 = b pointer to column-major 64x6 matrix B.
64
* @param x2 = c pointer to column-major 15x6 matrix C.
75
* @param x3 = lda leading dimension of A.
86
* @param x4 = ldb leading dimension of B.
97
* @param x5 = ldc leading dimension of C.
10-
**/
8+
**/
119
.text
1210
.type matmul_15_6_64, %function
1311
.global matmul_15_6_64
submissions/submission_25_05_08/neon_5.bench.cpp

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include <benchmark/benchmark.h>
2+
#include "neon_5.h"
3+
4+
template <uint32_t TMdim, uint32_t TNdim, uint32_t TKdim>
5+
class GemmMxNxKFixture : public benchmark::Fixture
6+
{
7+
public:
8+
float matrix_a[TMdim * TKdim];
9+
float matrix_b[TKdim * TNdim];
10+
float matrix_c[TMdim * TNdim];
11+
double flops;
12+
13+
void SetUp(::benchmark::State &_) override
14+
{
15+
flops = 0;
16+
17+
fill_random_matrix(matrix_a);
18+
fill_random_matrix(matrix_b);
19+
fill_random_matrix(matrix_c);
20+
}
21+
22+
void TearDown(::benchmark::State &state) override
23+
{
24+
state.counters["FLOPS"] = benchmark::Counter(flops, benchmark::Counter::kIsRate);
25+
}
26+
};
27+
28+
// Benchmarks matmul_64_64_64 on 64x64x64 operands with all leading dimensions set to 64.
BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_64_64_64, 64, 64, 64)(benchmark::State &state)
{
    for (auto _ : state)
    {
        matmul_64_64_64(matrix_a, matrix_b, matrix_c, 64, 64, 64);
    }

    // M * N * K multiply-add steps per call, counted as 2 FLOPs each,
    // scaled by the number of timed iterations.
    const double flops_per_call = 2.0 * 64 * 64 * 64;
    flops = flops_per_call * state.iterations();
}

BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_64_64_64)->MinWarmUpTime(1.0); // warm-up time in seconds
40+
41+
// Benchmarks the baseline kernel matmul_64_64_64_base_line on 64x64x64 operands.
BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_64_64_64_base_line, 64, 64, 64)(benchmark::State &state)
{
    for (auto _ : state)
    {
        matmul_64_64_64_base_line(matrix_a, matrix_b, matrix_c);
    }

    // M * N * K multiply-add steps per call, counted as 2 FLOPs each,
    // scaled by the number of timed iterations.
    const double flops_per_call = 2.0 * 64 * 64 * 64;
    flops = flops_per_call * state.iterations();
}

BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_64_64_64_base_line)->MinWarmUpTime(1.0); // warm-up time in seconds
submissions/submission_25_05_08/neon_5.h

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#ifndef NEON_5_H
2+
#define NEON_5_H
3+
4+
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iterator>
6+
7+
extern "C"
8+
{
9+
/**
10+
* @brief GEMM kernel for M = N = K = 64 on column-major float matrices; the unit tests expect C += A * B, matching the naive reference.
11+
* @param a pointer to column-major matrix A.
12+
* @param b pointer to column-major matrix B.
13+
* @param c pointer to column-major matrix C.
14+
* @param lda leading dimension of A.
15+
* @param ldb leading dimension of B.
16+
* @param ldc leading dimension of C.
17+
**/
18+
void matmul_64_64_64(float const *a, float const *b, float *c, int64_t lda, int64_t ldb, int64_t ldc);
19+
20+
/**
21+
* @brief Baseline 64x64x64 GEMM kernel without leading-dimension arguments; the unit tests expect the same C += A * B result. Presumably assumes densely packed matrices (leading dimension 64) - TODO confirm against the implementation.
22+
* @param a pointer to column-major 64x64 matrix A.
23+
* @param b pointer to column-major 64x64 matrix B.
24+
* @param c pointer to column-major 64x64 matrix C.
25+
**/
26+
void matmul_64_64_64_base_line(float const *a, float const *b, float *c);
27+
}
28+
29+
/// @brief Seed std::rand() from the wall clock exactly once per process.
///
/// Kept outside the template so that every instantiation of
/// fill_random_matrix shares the same one-time seeding.
inline void seed_random_once()
{
    static const bool seeded = []
    {
        std::srand(static_cast<unsigned int>(std::time(nullptr)));
        return true;
    }();
    static_cast<void>(seeded);
}

/// @brief Fill the given matrix with non-negative, finite pseudo-random values.
/// @tparam TSize The total size of the matrix.
/// @param matrix The matrix to write to.
template <uint32_t TSize>
void fill_random_matrix(float (&matrix)[TSize])
{
    // Re-seeding with the current second on every call restarted the same
    // sequence, so matrices filled within one second were identical.
    seed_random_once();

    for (std::size_t i = 0; i < TSize; ++i)
    {
        const float numerator = static_cast<float>(std::rand());
        // std::rand() may return 0; the +1 keeps the quotient finite
        // (no division by zero, no 0/0 = NaN).
        const float denominator = static_cast<float>(std::rand()) + 1.0f;
        matrix[i] = numerator / denominator;
    }
}
41+
42+
/// @brief Fill the given matrix with counting up values 0, 1, 2, ... in storage order.
/// @tparam TSize The total size of the matrix.
/// @param matrix The matrix to write to.
template <uint32_t TSize>
void fill_counting_matrix(float (&matrix)[TSize])
{
    for (std::size_t i = 0; i < TSize; ++i)
    {
        // Explicit integer-to-float conversion instead of an implicit
        // narrowing one; the stored values are unchanged.
        matrix[i] = static_cast<float>(i);
    }
}
53+
54+
/// @brief Copy all values of one matrix into another matrix of the same size.
/// @tparam TSize The equal size of the matrices.
/// @param input The matrix to copy from; it is only read, so const arrays are accepted too.
/// @param output The matrix to copy to.
template <uint32_t TSize>
void copy_matrix(const float (&input)[TSize], float (&output)[TSize])
{
    std::copy(std::begin(input), std::end(input), std::begin(output));
}
63+
64+
/// @brief Naive reference matmul on column-major data: C [MxN] += A [MxK] * B [KxN].
/// @tparam TMDim The size of the M dimension.
/// @tparam TNDim The size of the N dimension.
/// @tparam TKDim The size of the K dimension.
/// @param a The pointer of matrix A.
/// @param b The pointer of matrix B.
/// @param c The pointer of matrix C; its existing values are accumulated into.
/// @param lda The leading dimension of A.
/// @param ldb The leading dimension of B.
/// @param ldc The leading dimension of C.
template <uint32_t TMDim, uint32_t TNDim, uint32_t TKDim>
void naive_matmul_M_N_K(const float *__restrict__ a, const float *__restrict__ b, float *__restrict__ c,
                        int64_t lda, int64_t ldb, int64_t ldc)
{
    const std::size_t stride_a = static_cast<std::size_t>(lda);
    const std::size_t stride_b = static_cast<std::size_t>(ldb);
    const std::size_t stride_c = static_cast<std::size_t>(ldc);

    // Columns outermost, rows next: consecutive iM touch adjacent elements
    // of C and A in column-major storage. Each C element still sums its K
    // products in the same order as before.
    for (std::size_t iN = 0; iN < TNDim; ++iN)
    {
        for (std::size_t iM = 0; iM < TMDim; ++iM)
        {
            float acc = c[iM + iN * stride_c];
            for (std::size_t iK = 0; iK < TKDim; ++iK)
            {
                acc += a[iM + iK * stride_a] * b[iK + iN * stride_b];
            }
            c[iM + iN * stride_c] = acc;
        }
    }
}
89+
#endif // NEON_5_H
submissions/submission_25_05_08/neon_5.test.cpp

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#include <catch2/catch_test_macros.hpp>
2+
#include <catch2/matchers/catch_matchers_floating_point.hpp>
3+
#include <cstdint>
4+
#include "neon_5.h"
5+
6+
template <uint TSize>
7+
void verify_matmul(const float (&expected)[TSize], const float (&result)[TSize])
8+
{
9+
for (size_t i = 0; i < TSize; i++)
10+
{
11+
CAPTURE(i, result[i], expected[i]);
12+
REQUIRE_THAT(result[i], Catch::Matchers::WithinRel(expected[i]));
13+
}
14+
}
15+
16+
// Checks matmul_64_64_64 against the naive reference on random operands.
TEST_CASE("Test 64x64x64 gemm correctness random data", "[neon_5][correctness][gemm]")
{
    constexpr uint32_t kDim = 64; // M = N = K and every leading dimension

    float lhs[kDim * kDim];
    float rhs[kDim * kDim];
    float actual[kDim * kDim];
    float reference[kDim * kDim];

    fill_random_matrix(lhs);
    fill_random_matrix(rhs);
    fill_random_matrix(actual);
    // Both implementations accumulate into C, so they must start from the same C.
    copy_matrix(actual, reference);

    matmul_64_64_64(lhs, rhs, actual, kDim, kDim, kDim);
    naive_matmul_M_N_K<kDim, kDim, kDim>(lhs, rhs, reference, kDim, kDim, kDim);

    verify_matmul(reference, actual);
}
34+
35+
// Checks matmul_64_64_64 against the naive reference on deterministic counting operands.
TEST_CASE("Test 64x64x64 gemm correctness counting data", "[neon_5][correctness][gemm]")
{
    constexpr uint32_t kDim = 64; // M = N = K and every leading dimension

    float lhs[kDim * kDim];
    float rhs[kDim * kDim];
    float actual[kDim * kDim];
    float reference[kDim * kDim];

    fill_counting_matrix(lhs);
    fill_counting_matrix(rhs);
    fill_counting_matrix(actual);
    // Both implementations accumulate into C, so they must start from the same C.
    copy_matrix(actual, reference);

    matmul_64_64_64(lhs, rhs, actual, kDim, kDim, kDim);
    naive_matmul_M_N_K<kDim, kDim, kDim>(lhs, rhs, reference, kDim, kDim, kDim);

    verify_matmul(reference, actual);
}
53+
54+
55+
// Checks matmul_64_64_64_base_line against the naive reference on random operands.
TEST_CASE("Test matmul_64_64_64_base_line gemm correctness random data", "[neon_5][correctness][gemm]")
{
    constexpr uint32_t kDim = 64; // M = N = K; the reference uses it as leading dimension too

    float lhs[kDim * kDim];
    float rhs[kDim * kDim];
    float actual[kDim * kDim];
    float reference[kDim * kDim];

    fill_random_matrix(lhs);
    fill_random_matrix(rhs);
    fill_random_matrix(actual);
    // Both implementations accumulate into C, so they must start from the same C.
    copy_matrix(actual, reference);

    matmul_64_64_64_base_line(lhs, rhs, actual);
    naive_matmul_M_N_K<kDim, kDim, kDim>(lhs, rhs, reference, kDim, kDim, kDim);

    verify_matmul(reference, actual);
}
73+
74+
// Checks matmul_64_64_64_base_line against the naive reference on deterministic counting operands.
TEST_CASE("Test matmul_64_64_64_base_line gemm correctness counting data", "[neon_5][correctness][gemm]")
{
    constexpr uint32_t kDim = 64; // M = N = K; the reference uses it as leading dimension too

    float lhs[kDim * kDim];
    float rhs[kDim * kDim];
    float actual[kDim * kDim];
    float reference[kDim * kDim];

    fill_counting_matrix(lhs);
    fill_counting_matrix(rhs);
    fill_counting_matrix(actual);
    // Both implementations accumulate into C, so they must start from the same C.
    copy_matrix(actual, reference);

    matmul_64_64_64_base_line(lhs, rhs, actual);
    naive_matmul_M_N_K<kDim, kDim, kDim>(lhs, rhs, reference, kDim, kDim, kDim);

    verify_matmul(reference, actual);
}

0 commit comments

Comments
 (0)