feat: neon task 5 (accu block shape)

Integer-Ctrl · Integer-Ctrl · commit ff37b12d16ae · 2025-05-07T13:05:29.000Z
diff --git a/submissions/submission_25_05_08/CMakeLists.txt b/submissions/submission_25_05_08/CMakeLists.txt
@@ -53,14 +53,18 @@ FetchContent_MakeAvailable(google_benchmark)
 set(SRC_FILES
     neon_4_1.s
     neon_4_2.s
+    neon_5_1.s
+    neon_5_1-base-line.s
 )
 
 set(TEST_FILES
     neon_4.test.cpp
+    neon_5.test.cpp
 )
 
 set(BENCH_FILES
     neon_4.bench.cpp
+    neon_5.bench.cpp
 )
 
 # add_executable(loops "${SRC_FILES}")
diff --git a/submissions/submission_25_05_08/neon_4.bench.cpp b/submissions/submission_25_05_08/neon_4.bench.cpp
@@ -25,27 +25,27 @@ class GemmMxNxKFixture : public benchmark::Fixture
     }
 };
 
-BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 16, 6, 64)(benchmark::State &state)
+BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_14_6_64, 14, 6, 64)(benchmark::State &state)
 {
     for (auto _ : state)
     {
         matmul_14_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);
     }
 
-    flops = (4 * 6 * 4 * 2) * 64; // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop
+    flops = (14 *6 * 64) * 2; // M * N * K * 2 instructions (add & mul)
     flops *= state.iterations();
 };
 
-BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_15_6_64)->MinWarmUpTime(1.0); // WarmUp in seconds
+BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_14_6_64)->MinWarmUpTime(1.0); // WarmUp in seconds
 
-BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 16, 6, 64)(benchmark::State &state)
+BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 15, 6, 64)(benchmark::State &state)
 {
     for (auto _ : state)
     {
-        matmul_15_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);
+        matmul_15_6_64(matrix_a, matrix_b, matrix_c, 15, 64, 16);
     }
 
-    flops = (4 * 6 * 4 * 2) * 64; // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop
+    flops = (15 *6 * 64) * 2; // M * N * K * 2 instructions (add & mul)
     flops *= state.iterations();
 };
 
diff --git a/submissions/submission_25_05_08/neon_4_1.s b/submissions/submission_25_05_08/neon_4_1.s
@@ -1,13 +1,11 @@
-// using the neon_2_unrolled as base kernel as it is the fast based on benchmarks
-
 /**
     * @param x0 = a pointer to column-major 14x64 matrix A.
     * @param x1 = b pointer to column-major 64x6 matrix B.
     * @param x2 = c pointer to column-major 14x6 matrix C.
     * @param x3 = lda leading dimension of A.
     * @param x4 = ldb leading dimension of B.
     * @param x5 = ldc leading dimension of C.
-    **/
+**/
 .text
 .type matmul_14_6_64, %function
 .global matmul_14_6_64
diff --git a/submissions/submission_25_05_08/neon_4_2.s b/submissions/submission_25_05_08/neon_4_2.s
@@ -1,13 +1,11 @@
-// using the neon_2_unrolled as base kernel as it is the fast based on benchmarks
-
 /**
     * @param x0 = a pointer to column-major 15x64 matrix A.
     * @param x1 = b pointer to column-major 64x6 matrix B.
     * @param x2 = c pointer to column-major 15x6 matrix C.
     * @param x3 = lda leading dimension of A.
     * @param x4 = ldb leading dimension of B.
     * @param x5 = ldc leading dimension of C.
-    **/
+**/
 .text
 .type matmul_15_6_64, %function
 .global matmul_15_6_64
diff --git a/submissions/submission_25_05_08/neon_5.bench.cpp b/submissions/submission_25_05_08/neon_5.bench.cpp
@@ -0,0 +1,52 @@
+#include <benchmark/benchmark.h>
+#include "neon_5.h"
+
+template <uint32_t TMdim, uint32_t TNdim, uint32_t TKdim> // Fixture holding the A, B and C buffers for one M x N x K GEMM benchmark. NOTE(review): neon_4.bench.cpp declares a fixture with the same name; confirm the definitions agree if both files link into one binary.
+class GemmMxNxKFixture : public benchmark::Fixture
+{
+public:
+    float matrix_a[TMdim * TKdim]; // A operand: TMdim * TKdim floats
+    float matrix_b[TKdim * TNdim]; // B operand: TKdim * TNdim floats
+    float matrix_c[TMdim * TNdim]; // C buffer handed to the kernel: TMdim * TNdim floats
+    double flops; // operation count published in TearDown; each benchmark body assigns it
+
+    void SetUp(::benchmark::State &_) override // Clears the counter and fills all three buffers with random values.
+    {
+        flops = 0;
+
+        fill_random_matrix(matrix_a);
+        fill_random_matrix(matrix_b);
+        fill_random_matrix(matrix_c);
+    }
+
+    void TearDown(::benchmark::State &state) override // Publishes flops under the "FLOPS" counter, flagged as a rate.
+    {
+        state.counters["FLOPS"] = benchmark::Counter(flops, benchmark::Counter::kIsRate);
+    }
+};
+
+BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_64_64_64, 64, 64, 64)(benchmark::State &state) // Times matmul_64_64_64 on the 64x64x64 fixture buffers.
+{
+    for (auto _ : state)
+    {
+        matmul_64_64_64(matrix_a, matrix_b, matrix_c, 64, 64, 64); // lda = ldb = ldc = 64
+    }
+
+    flops = (64 *64 * 64) * 2; // M * N * K * 2 floating-point operations (one multiply and one add per update)
+    flops *= state.iterations(); // scale the per-call count by the number of timed iterations
+};
+
+BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_64_64_64)->MinWarmUpTime(1.0); // minimum warm-up time, in seconds
+
+BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_64_64_64_base_line, 64, 64, 64)(benchmark::State &state) // Times matmul_64_64_64_base_line on the 64x64x64 fixture buffers.
+{
+    for (auto _ : state)
+    {
+        matmul_64_64_64_base_line(matrix_a, matrix_b, matrix_c); // this variant takes no leading-dimension arguments
+    }
+
+    flops = (64 *64 * 64) * 2; // M * N * K * 2 floating-point operations (one multiply and one add per update)
+    flops *= state.iterations(); // scale the per-call count by the number of timed iterations
+};
+
+BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_64_64_64_base_line)->MinWarmUpTime(1.0); // minimum warm-up time, in seconds
diff --git a/submissions/submission_25_05_08/neon_5.h b/submissions/submission_25_05_08/neon_5.h
@@ -0,0 +1,89 @@
+#ifndef NEON_5_H
+#define NEON_5_H
+
+#include <cstdint>
+#include <ctime>
+
+extern "C"
+{
+    /**
+     * @brief Fixed-size matmul (M=64, N=64, K=64) that, per the original description, loops over the N, M and K dimensions (K loop of length 64). NOTE(review): only declared here; presumably defined in one of the neon_5_1*.s sources - confirm.
+     * @param a pointer to column-major matrix A.
+     * @param b pointer to column-major matrix B.
+     * @param c pointer to column-major matrix C.
+     * @param lda leading dimension of A.
+     * @param ldb leading dimension of B.
+     * @param ldc leading dimension of C.
+     **/
+    void matmul_64_64_64(float const *a, float const *b, float *c, int64_t lda, int64_t ldb, int64_t ldc);
+
+    /**
+     * @brief Baseline variant of the fixed-size matmul (M=64, N=64, K=64) that takes no leading-dimension arguments. NOTE(review): the original brief was copied from matmul_64_64_64; confirm how this kernel's loop structure differs.
+     * @param a pointer to column-major 64x64 matrix A.
+     * @param b pointer to column-major 64x64 matrix B.
+     * @param c pointer to column-major 64x64 matrix C.
+     **/
+    void matmul_64_64_64_base_line(float const *a, float const *b, float *c);
+}
+
+/// @brief Fill the given matrix with pseudo-random values, each the quotient of two std::rand() results.
+/// @tparam TSize The number of elements in the matrix.
+/// @param matrix The matrix to write to.
+template <uint32_t TSize>
+void fill_random_matrix(float (&matrix)[TSize])
+{
+    std::srand(std::time(0)); // Reseeds on every call. NOTE(review): calls made with the same time value restart the same sequence, so matrices filled back-to-back can come out identical; std::srand/std::rand live in <cstdlib>, which this header does not include directly.
+    for (size_t i = 0; i < TSize; i++)
+    {
+        matrix[i] = (static_cast<float>(std::rand())) / (static_cast<float>(std::rand())); // NOTE(review): std::rand() may return 0 for the divisor, producing inf or NaN
+    }
+}
+
+/// @brief Fill the given matrix with ascending values: element i receives the value i, starting from 0.
+/// @tparam TSize The number of elements in the matrix.
+/// @param matrix The matrix to write to.
+template <uint32_t TSize>
+void fill_counting_matrix(float (&matrix)[TSize])
+{
+    for (size_t i = 0; i < TSize; i++)
+    {
+        matrix[i] = i; // implicit size_t -> float conversion of the index
+    }
+}
+
+/// @brief Copy every element of one matrix into another matrix of the same size.
+/// @tparam TSize The number of elements in each matrix.
+/// @param input The matrix to copy from.
+/// @param output The matrix to copy to.
+template <uint32_t TSize>
+void copy_matrix(float (&input)[TSize], float (&output)[TSize])
+{
+    std::copy(std::begin(input), std::end(input), std::begin(output)); // NOTE(review): std::copy / std::begin come from <algorithm> / <iterator>, which this header does not include directly
+}
+
+/// @brief Naive reference matmul on column-major data: C [MxN] += A [MxK] * B [KxN]; it accumulates into the existing contents of C.
+/// @tparam TMDim The size of the M dimension.
+/// @tparam TNDim The size of the N dimension.
+/// @tparam TKDim The size of the K dimension.
+/// @param a The pointer of matrix A.
+/// @param b The pointer of matrix B.
+/// @param c The pointer of matrix C; read and updated in place.
+/// @param lda The leading dimension of A.
+/// @param ldb The leading dimension of B.
+/// @param ldc The leading dimension of C.
+template <uint32_t TMDim, uint32_t TNDim, uint32_t TKDim>
+void naive_matmul_M_N_K(const float *__restrict__ a, const float *__restrict__ b, float *__restrict__ c,
+                        int64_t lda, int64_t ldb, int64_t ldc)
+{
+    for (size_t iM = 0; iM < TMDim; iM++) // rows of C
+    {
+        for (size_t iN = 0; iN < TNDim; iN++) // columns of C
+        {
+            for (size_t iK = 0; iK < TKDim; ++iK) // reduction dimension
+            {
+                c[iM + iN * ldc] += a[iM + iK * lda] * b[iK + iN * ldb]; // column-major indexing: row + column * leading dimension
+            }
+        }
+    }
+}
+#endif // NEON_5_H
diff --git a/submissions/submission_25_05_08/neon_5.test.cpp b/submissions/submission_25_05_08/neon_5.test.cpp
@@ -0,0 +1,91 @@
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+#include <cstdint>
+#include "neon_5.h"
+
+template <uint TSize> // NOTE(review): uint is not a standard C++ type name; it compiles only where the platform headers provide it.
+void verify_matmul(const float (&expected)[TSize], const float (&result)[TSize]) // Compares result against expected element by element.
+{
+    for (size_t i = 0; i < TSize; i++)
+    {
+        CAPTURE(i, result[i], expected[i]); // attach the index and both values to any failure report
+        REQUIRE_THAT(result[i], Catch::Matchers::WithinRel(expected[i])); // relative comparison using the matcher's default tolerance
+    }
+}
+
+TEST_CASE("Test 64x64x64 gemm correctness random data", "[neon_5][correctness][gemm]") // Checks matmul_64_64_64 against the naive reference on random inputs.
+{
+    float matrix_a[64 * 64];
+    float matrix_b[64 * 64];
+    float matrix_c[64 * 64];
+    float matrix_c_verify[64 * 64]; // reference copy of C, updated by the naive matmul
+
+    fill_random_matrix(matrix_a);
+    fill_random_matrix(matrix_b);
+    fill_random_matrix(matrix_c);
+    copy_matrix(matrix_c, matrix_c_verify); // the naive reference accumulates into C, so both runs start from the same values
+
+    // Run the kernel under test and the naive reference on identical inputs
+    matmul_64_64_64(matrix_a, matrix_b, matrix_c, 64, 64, 64);
+    naive_matmul_M_N_K<64, 64, 64>(matrix_a, matrix_b, matrix_c_verify, 64, 64, 64);
+
+    verify_matmul(matrix_c_verify, matrix_c);
+}
+
+TEST_CASE("Test 64x64x64 gemm correctness counting data", "[neon_5][correctness][gemm]") // Checks matmul_64_64_64 against the naive reference on counting inputs (0, 1, 2, ...).
+{
+    float matrix_a[64 * 64];
+    float matrix_b[64 * 64];
+    float matrix_c[64 * 64];
+    float matrix_c_verify[64 * 64]; // reference copy of C, updated by the naive matmul
+
+    fill_counting_matrix(matrix_a);
+    fill_counting_matrix(matrix_b);
+    fill_counting_matrix(matrix_c);
+    copy_matrix(matrix_c, matrix_c_verify); // the naive reference accumulates into C, so both runs start from the same values
+
+    // Run the kernel under test and the naive reference on identical inputs
+    matmul_64_64_64(matrix_a, matrix_b, matrix_c, 64, 64, 64);
+    naive_matmul_M_N_K<64, 64, 64>(matrix_a, matrix_b, matrix_c_verify, 64, 64, 64);
+
+    verify_matmul(matrix_c_verify, matrix_c);
+}
+
+
+TEST_CASE("Test matmul_64_64_64_base_line gemm correctness random data", "[neon_5][correctness][gemm]") // Checks matmul_64_64_64_base_line against the naive reference on random inputs.
+{
+    float matrix_a[64 * 64];
+    float matrix_b[64 * 64];
+    float matrix_c[64 * 64];
+    float matrix_c_verify[64 * 64]; // reference copy of C, updated by the naive matmul
+
+    fill_random_matrix(matrix_a);
+    fill_random_matrix(matrix_b);
+    fill_random_matrix(matrix_c);
+    copy_matrix(matrix_c, matrix_c_verify); // the naive reference accumulates into C, so both runs start from the same values
+
+    // Run the baseline kernel and the naive reference on identical inputs
+    matmul_64_64_64_base_line(matrix_a, matrix_b, matrix_c);
+    naive_matmul_M_N_K<64, 64, 64>(matrix_a, matrix_b, matrix_c_verify, 64, 64, 64);
+
+    verify_matmul(matrix_c_verify, matrix_c);
+}
+
+TEST_CASE("Test matmul_64_64_64_base_line gemm correctness counting data", "[neon_5][correctness][gemm]") // Checks matmul_64_64_64_base_line against the naive reference on counting inputs (0, 1, 2, ...).
+{
+    float matrix_a[64 * 64];
+    float matrix_b[64 * 64];
+    float matrix_c[64 * 64];
+    float matrix_c_verify[64 * 64]; // reference copy of C, updated by the naive matmul
+
+    fill_counting_matrix(matrix_a);
+    fill_counting_matrix(matrix_b);
+    fill_counting_matrix(matrix_c);
+    copy_matrix(matrix_c, matrix_c_verify); // the naive reference accumulates into C, so both runs start from the same values
+
+    // Run the baseline kernel and the naive reference on identical inputs
+    matmul_64_64_64_base_line(matrix_a, matrix_b, matrix_c);
+    naive_matmul_M_N_K<64, 64, 64>(matrix_a, matrix_b, matrix_c_verify, 64, 64, 64);
+
+    verify_matmul(matrix_c_verify, matrix_c);
+}
diff --git a/submissions/submission_25_05_08/neon_5_1-base-line.s b/submissions/submission_25_05_08/neon_5_1-base-line.s
diff --git a/submissions/submission_25_05_08/neon_5_1.s b/submissions/submission_25_05_08/neon_5_1.s

Original file line number	Diff line number	Diff line change
`@@ -53,14 +53,18 @@ FetchContent_MakeAvailable(google_benchmark)`
`53`	`53`	`set(SRC_FILES`
`54`	`54`	`neon_4_1.s`
`55`	`55`	`neon_4_2.s`
	`56`	`+ neon_5_1.s`
	`57`	`+ neon_5_1-base-line.s`
`56`	`58`	`)`
`57`	`59`
`58`	`60`	`set(TEST_FILES`
`59`	`61`	`neon_4.test.cpp`
	`62`	`+ neon_5.test.cpp`
`60`	`63`	`)`
`61`	`64`
`62`	`65`	`set(BENCH_FILES`
`63`	`66`	`neon_4.bench.cpp`
	`67`	`+ neon_5.bench.cpp`
`64`	`68`	`)`
`65`	`69`
`66`	`70`	`# add_executable(loops "${SRC_FILES}")`
Original file line number	Diff line number	Diff line change
`@@ -25,27 +25,27 @@ class GemmMxNxKFixture : public benchmark::Fixture`
`25`	`25`	`}`
`26`	`26`	`};`
`27`	`27`
`28`		`-BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 16, 6, 64)(benchmark::State &state)`
	`28`	`+BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_14_6_64, 14, 6, 64)(benchmark::State &state)`
`29`	`29`	`{`
`30`	`30`	`for (auto _ : state)`
`31`	`31`	`{`
`32`	`32`	`matmul_14_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);`
`33`	`33`	`}`
`34`	`34`
`35`		`- flops = (4 * 6 * 4 * 2) * 64; // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop`
	`35`	`+ flops = (14 *6 * 64) * 2; // M * N * K * 2 instructions (add & mul)`
`36`	`36`	`flops *= state.iterations();`
`37`	`37`	`};`
`38`	`38`
`39`		`-BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_15_6_64)->MinWarmUpTime(1.0); // WarmUp in seconds`
	`39`	`+BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_14_6_64)->MinWarmUpTime(1.0); // WarmUp in seconds`
`40`	`40`
`41`		`-BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 16, 6, 64)(benchmark::State &state)`
	`41`	`+BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 15, 6, 64)(benchmark::State &state)`
`42`	`42`	`{`
`43`	`43`	`for (auto _ : state)`
`44`	`44`	`{`
`45`		`- matmul_15_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);`
	`45`	`+ matmul_15_6_64(matrix_a, matrix_b, matrix_c, 15, 64, 16);`
`46`	`46`	`}`
`47`	`47`
`48`		`- flops = (4 * 6 * 4 * 2) * 64; // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop`
	`48`	`+ flops = (15 *6 * 64) * 2; // M * N * K * 2 instructions (add & mul)`
`49`	`49`	`flops *= state.iterations();`
`50`	`50`	`};`
`51`	`51`