feat: neon task 4 (simd)

Integer-Ctrl · Integer-Ctrl · commit b1d5f29110c1 · 2025-05-05T19:00:39.000Z
diff --git a/docs_sphinx/getting_started/building_project.rst b/docs_sphinx/getting_started/building_project.rst
@@ -97,9 +97,9 @@ Building
 6. Now we can build the project. The most desired command might be
 
     .. code-block:: bash
 
-        cmake --build . --target simulation
+        cmake --build . --target tests
 
     Options for ``--target`` are **benchmark** and **tests**
 
 
diff --git a/submissions/submission_25_05_08/CMakeLists.txt b/submissions/submission_25_05_08/CMakeLists.txt
@@ -0,0 +1,78 @@
+# FetchContent_MakeAvailable() needs CMake 3.14 and message(NOTICE) needs 3.15.
+cmake_minimum_required(VERSION 3.15)
+project(MachineLearningCompiler VERSION 0.1.0 LANGUAGES C CXX ASM)
+
+# Single-config generators fix the build type at configure time, so default it
+# to Release when the user did not pass -DCMAKE_BUILD_TYPE.
+get_property(IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
+if(IS_MULTI_CONFIG)
+    message(NOTICE "Using multi-config generator. Compile with: cmake --build . --config [Debug|Release] --target <target>")
+else()
+    message(NOTICE "Using single-config generator. Generate with: cmake .. -DCMAKE_BUILD_TYPE=[Debug|Release]")
+    if(NOT CMAKE_BUILD_TYPE)
+        set(CMAKE_BUILD_TYPE "Release")
+        message(WARNING "No Build type is set. Using Release!")
+    endif()
+    message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}")
+endif()
+
+# Warning flags live on an INTERFACE target so they apply to our own targets
+# only and are not forced onto the fetched third-party projects.
+add_library(neon_build_flags INTERFACE)
+if(CMAKE_CXX_COMPILER_ID MATCHES "^(AppleClang|Clang|GNU)$")
+    target_compile_options(neon_build_flags INTERFACE -Wall -Wextra -Wpedantic)
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g -Og")
+endif()
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_VERBOSE_MAKEFILE OFF)
+
+# Fetch Catch2 (unit tests)
+include(FetchContent)
+
+FetchContent_Declare(
+  Catch2
+  GIT_REPOSITORY https://github.com/catchorg/Catch2.git
+  GIT_TAG        v3.8.1
+)
+FetchContent_MakeAvailable(Catch2)
+
+# Fetch Google Benchmark without building its own test suite
+set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
+set(BENCHMARK_ENABLE_TESTING OFF)
+
+FetchContent_Declare(
+    google_benchmark
+    GIT_REPOSITORY https://github.com/google/benchmark
+    GIT_TAG        v1.9.2
+)
+FetchContent_MakeAvailable(google_benchmark)
+
+# Assembly kernels shared by the test and benchmark executables
+set(SRC_FILES
+    neon_4_1.s
+    neon_4_2.s
+)
+
+set(TEST_FILES
+    neon_4.test.cpp
+)
+
+set(BENCH_FILES
+    neon_4.bench.cpp
+)
+
+# Catch2 unit tests
+add_executable(tests ${SRC_FILES} ${TEST_FILES})
+target_link_libraries(tests PRIVATE neon_build_flags Catch2::Catch2WithMain)
+
+# Google Benchmark micro-benchmarks
+add_executable(benchmarks ${SRC_FILES} ${BENCH_FILES})
+target_link_libraries(benchmarks PRIVATE neon_build_flags benchmark::benchmark_main)
+
+# Register every Catch2 TEST_CASE with CTest
+list(APPEND CMAKE_MODULE_PATH "${catch2_SOURCE_DIR}/extras")
+include(CTest)
+include(Catch)
+catch_discover_tests(tests)
diff --git a/submissions/submission_25_05_08/neon_4.bench.cpp b/submissions/submission_25_05_08/neon_4.bench.cpp
@@ -0,0 +1,52 @@
+#include <benchmark/benchmark.h>
+#include "neon_4.h"
+
+/// @brief Benchmark fixture that owns the operand buffers for one GEMM problem size.
+/// @tparam TMdim Row count used to size matrix_a and matrix_c.
+/// @tparam TNdim Column count used to size matrix_b and matrix_c.
+/// @tparam TKdim Inner dimension used to size matrix_a and matrix_b.
+template <uint32_t TMdim, uint32_t TNdim, uint32_t TKdim>
+class GemmMxNxKFixture : public benchmark::Fixture
+{
+public:
+    float matrix_a[TMdim * TKdim];
+    float matrix_b[TKdim * TNdim];
+    float matrix_c[TMdim * TNdim];
+    // Total floating point operations of the run; each benchmark body sets it.
+    double flops;
+
+    /// @brief Refill all three matrices with random values and reset the FLOP total.
+    void SetUp(::benchmark::State &) override
+    {
+        fill_random_matrix(matrix_a);
+        fill_random_matrix(matrix_b);
+        fill_random_matrix(matrix_c);
+
+        flops = 0;
+    }
+
+    /// @brief Publish the FLOP total as a rate counter named "FLOPS".
+    void TearDown(::benchmark::State &run_state) override
+    {
+        const benchmark::Counter flop_rate(flops, benchmark::Counter::kIsRate);
+        run_state.counters["FLOPS"] = flop_rate;
+    }
+};
+
+// Benchmark of the matmul_14_6_64 kernel. The fixture is instantiated with 16 rows and the
+// kernel is called with lda = ldc = 16, i.e. the buffers are larger than the 14 rows in the
+// kernel's name.
+// The benchmark is named after the kernel it actually runs; the previous name
+// (BM_matmul_15_6_64) clashed with the 15x6x64 benchmark defined further down.
+BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_14_6_64, 16, 6, 64)(benchmark::State &state)
+{
+    for (auto _ : state)
+    {
+        matmul_14_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);
+    }
+
+    // NOTE(review): this counts 16 lanes per column; the arithmetic on a 14-row tile would be
+    // 14 * 6 * 2 * 64 — confirm which number is meant to be reported.
+    flops = (4 * 6 * 4 * 2) * 64; // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop
+    flops *= state.iterations();
+}
+
+BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_14_6_64)->MinWarmUpTime(1.0); // WarmUp in seconds
+
+// Benchmark of the matmul_15_6_64 kernel on the fixture's 16-row buffers (lda = ldc = 16).
+BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 16, 6, 64)(benchmark::State &state)
+{
+    for (auto _ : state)
+    {
+        matmul_15_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);
+    }
+
+    // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop
+    const double flops_per_call = (4 * 6 * 4 * 2) * 64;
+    flops = flops_per_call * state.iterations();
+}
+
+// WarmUp in seconds
+BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_15_6_64)->MinWarmUpTime(1.0);
diff --git a/submissions/submission_25_05_08/neon_4.h b/submissions/submission_25_05_08/neon_4.h
@@ -0,0 +1,92 @@
+#ifndef NEON_4_H
+#define NEON_4_H
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <iterator>
+
+// Kernel entry points with C linkage. Only the declarations are visible here; the definitions
+// presumably live in the neon_4_1.s / neon_4_2.s sources of this submission (TODO confirm).
+extern "C"
+{
+    /**
+     * @brief Matmul kernel for M=14, N=6 that loops the (M=14, N=6, K=1) matmul over K=64.
+     *
+     * The unit tests of this submission check the result against a naive C += A * B.
+     * @param a pointer to column-major matrix A.
+     * @param b pointer to column-major matrix B.
+     * @param c pointer to column-major matrix C.
+     * @param lda leading dimension of A.
+     * @param ldb leading dimension of B.
+     * @param ldc leading dimension of C.
+     **/
+    void matmul_14_6_64(float const *a, float const *b, float *c, int64_t lda, int64_t ldb, int64_t ldc);
+
+    /**
+     * @brief Matmul kernel for M=15, N=6 that loops the (M=15, N=6, K=1) matmul over K=64.
+     *
+     * The unit tests of this submission check the result against a naive C += A * B.
+     * @param a pointer to column-major matrix A.
+     * @param b pointer to column-major matrix B.
+     * @param c pointer to column-major matrix C.
+     * @param lda leading dimension of A.
+     * @param ldb leading dimension of B.
+     * @param ldc leading dimension of C.
+     **/
+    void matmul_15_6_64(float const *a, float const *b, float *c, int64_t lda, int64_t ldb, int64_t ldc);
+}
+
+/// @brief Seed std::rand() from the wall clock, but only on the first call.
+///
+/// Re-seeding with std::time() before every fill restarts the generator at the same point for
+/// all fills that happen within the same second, which hands every matrix the same sequence.
+/// This helper is not a template so that all fill_random_matrix instantiations share one seed.
+inline void seed_random_once()
+{
+    static const bool seeded = []
+    {
+        std::srand(static_cast<unsigned int>(std::time(nullptr)));
+        return true;
+    }();
+    (void)seeded;
+}
+
+/// @brief Fill the given matrix with random values.
+/// @tparam TSize The total size of the matrix.
+/// @param matrix The matrix to write to.
+template <uint32_t TSize>
+void fill_random_matrix(float (&matrix)[TSize])
+{
+    seed_random_once();
+    for (size_t i = 0; i < TSize; i++)
+    {
+        const float numerator = static_cast<float>(std::rand());
+        // std::rand() may return 0; the +1 keeps the quotient finite (no inf / NaN entries).
+        const float denominator = static_cast<float>(std::rand()) + 1.0f;
+        matrix[i] = numerator / denominator;
+    }
+}
+
+/// @brief Write the sequence 0, 1, 2, ... into consecutive entries of the matrix.
+/// @tparam TSize Number of entries in the matrix.
+/// @param matrix The matrix that receives the values.
+template <uint32_t TSize>
+void fill_counting_matrix(float (&matrix)[TSize])
+{
+    size_t next_value = 0;
+    for (float &entry : matrix)
+    {
+        entry = static_cast<float>(next_value);
+        ++next_value;
+    }
+}
+
+/// @brief Copy every entry of one matrix into another matrix of the same size.
+/// @tparam TSize Number of entries in both matrices.
+/// @param input The matrix that is read.
+/// @param output The matrix that is overwritten.
+template <uint32_t TSize>
+void copy_matrix(float (&input)[TSize], float (&output)[TSize])
+{
+    for (size_t index = 0; index < TSize; ++index)
+    {
+        output[index] = input[index];
+    }
+}
+
+/// @brief Reference matmul on column-major data: C [MxN] += A [MxK] * B [KxN].
+///
+/// Every entry of C is accumulated in place, so the previous contents of C are part of the result.
+/// @tparam TMDim The size of the M dimension.
+/// @tparam TNDim The size of the N dimension.
+/// @tparam TKDim The size of the K dimension.
+/// @param a The pointer of matrix A.
+/// @param b The pointer of matrix B.
+/// @param c The pointer of matrix C.
+/// @param lda The leading dimension of A.
+/// @param ldb The leading dimension of B.
+/// @param ldc The leading dimension of C.
+template <uint32_t TMDim, uint32_t TNDim, uint32_t TKDim>
+void naive_matmul_M_N_K(const float *__restrict__ a, const float *__restrict__ b, float *__restrict__ c,
+                        int64_t lda, int64_t ldb, int64_t ldc)
+{
+    for (size_t row = 0; row < TMDim; ++row)
+    {
+        for (size_t col = 0; col < TNDim; ++col)
+        {
+            // The C entry does not depend on the K index, so look it up once per (row, col).
+            float &c_entry = c[row + col * ldc];
+            for (size_t depth = 0; depth < TKDim; ++depth)
+            {
+                c_entry += a[row + depth * lda] * b[depth + col * ldb];
+            }
+        }
+    }
+}
+#endif // NEON_4_H
diff --git a/submissions/submission_25_05_08/neon_4.test.cpp b/submissions/submission_25_05_08/neon_4.test.cpp
@@ -0,0 +1,90 @@
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+#include <cstdint>
+#include "neon_4.h"
+
+/// @brief Require that every entry of result matches expected within Catch2's default relative tolerance.
+/// @tparam TSize Number of entries in both matrices.
+/// @param expected The reference values.
+/// @param result The values produced by the kernel under test.
+template <uint32_t TSize> // uint32_t instead of the non-standard `uint` typedef
+void verify_matmul(const float (&expected)[TSize], const float (&result)[TSize])
+{
+    for (size_t i = 0; i < TSize; i++)
+    {
+        // Report the failing index and both values when the check below fails.
+        CAPTURE(i, result[i], expected[i]);
+        REQUIRE_THAT(result[i], Catch::Matchers::WithinRel(expected[i]));
+    }
+}
+
+TEST_CASE("Test 14x6x64 gemm correctness random data", "[neon_4][correctness][gemm]")
+{
+    constexpr uint32_t M = 14;
+    constexpr uint32_t N = 6;
+    constexpr uint32_t K = 64;
+
+    float lhs[M * K];
+    float rhs[K * N];
+    float actual[M * N];
+    float reference[M * N];
+
+    fill_random_matrix(lhs);
+    fill_random_matrix(rhs);
+    fill_random_matrix(actual);
+    // The reference accumulates into C, so both runs start from the same C contents.
+    copy_matrix(actual, reference);
+
+    // Kernel under test, then the naive reference
+    matmul_14_6_64(lhs, rhs, actual, M, K, M);
+    naive_matmul_M_N_K<M, N, K>(lhs, rhs, reference, M, K, M);
+
+    verify_matmul(reference, actual);
+}
+
+TEST_CASE("Test 14x6x64 gemm correctness counting data", "[neon_4][correctness][gemm]")
+{
+    constexpr uint32_t M = 14;
+    constexpr uint32_t N = 6;
+    constexpr uint32_t K = 64;
+
+    float lhs[M * K];
+    float rhs[K * N];
+    float actual[M * N];
+    float reference[M * N];
+
+    // Deterministic input: every matrix holds 0, 1, 2, ... in storage order.
+    fill_counting_matrix(lhs);
+    fill_counting_matrix(rhs);
+    fill_counting_matrix(actual);
+    copy_matrix(actual, reference);
+
+    // Kernel under test, then the naive reference
+    matmul_14_6_64(lhs, rhs, actual, M, K, M);
+    naive_matmul_M_N_K<M, N, K>(lhs, rhs, reference, M, K, M);
+
+    verify_matmul(reference, actual);
+}
+
+TEST_CASE("Test 15x6x64 gemm correctness random data", "[neon_4][correctness][gemm]")
+{
+    constexpr uint32_t M = 15;
+    constexpr uint32_t N = 6;
+    constexpr uint32_t K = 64;
+
+    float lhs[M * K];
+    float rhs[K * N];
+    float actual[M * N];
+    float reference[M * N];
+
+    fill_random_matrix(lhs);
+    fill_random_matrix(rhs);
+    fill_random_matrix(actual);
+    // The reference accumulates into C, so both runs start from the same C contents.
+    copy_matrix(actual, reference);
+
+    // Kernel under test, then the naive reference
+    matmul_15_6_64(lhs, rhs, actual, M, K, M);
+    naive_matmul_M_N_K<M, N, K>(lhs, rhs, reference, M, K, M);
+
+    verify_matmul(reference, actual);
+}
+
+TEST_CASE("Test 15x6x64 gemm correctness counting data", "[neon_4][correctness][gemm]")
+{
+    constexpr uint32_t M = 15;
+    constexpr uint32_t N = 6;
+    constexpr uint32_t K = 64;
+
+    float lhs[M * K];
+    float rhs[K * N];
+    float actual[M * N];
+    float reference[M * N];
+
+    // Deterministic input: every matrix holds 0, 1, 2, ... in storage order.
+    fill_counting_matrix(lhs);
+    fill_counting_matrix(rhs);
+    fill_counting_matrix(actual);
+    copy_matrix(actual, reference);
+
+    // Kernel under test, then the naive reference
+    matmul_15_6_64(lhs, rhs, actual, M, K, M);
+    naive_matmul_M_N_K<M, N, K>(lhs, rhs, reference, M, K, M);
+
+    verify_matmul(reference, actual);
+}
diff --git a/submissions/submission_25_05_08/neon_4_1.s b/submissions/submission_25_05_08/neon_4_1.s
diff --git a/submissions/submission_25_05_08/neon_4_2.s b/submissions/submission_25_05_08/neon_4_2.s