Skip to content

Commit b1d5f29

Browse files
committed
feat: neon task 4 (simd)
1 parent 153a047 commit b1d5f29

File tree

7 files changed

+667
-3
lines changed

7 files changed

+667
-3
lines changed

docs_sphinx/getting_started/building_project.rst

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,7 @@ Building
9797
6. Now we can build the project. The most commonly used command is:
9898

9999
.. code-block:: bash
100-
101-
cmake --build . --target simulation
102-
100+
103101
Options for ``--target`` are **benchmark** and **tests**
104102

105103

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# This file uses FetchContent_MakeAvailable() (CMake >= 3.14) and message(NOTICE ...)
# (CMake >= 3.15), so 3.15 is the lowest version that can actually configure it.
cmake_minimum_required(VERSION 3.15)
project(MachineLearningCompiler VERSION 0.1.0 LANGUAGES C CXX ASM)
3+
4+
# Select the build configuration.
# Multi-config generators choose the configuration at build time, so CMAKE_BUILD_TYPE is
# only meaningful (and only reported) for single-config generators.
get_property(IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(IS_MULTI_CONFIG)
  message(NOTICE "Using multi-config generator. Compile with: cmake --build . --config [Debug|Release] --target <target>")
else()
  message(NOTICE "Using single-config generator. Generate with: cmake .. -DCMAKE_BUILD_TYPE=[Debug|Release]")
  if(NOT CMAKE_BUILD_TYPE)
    # Store the default in the cache so that it is visible to cache editors and survives
    # re-configuration; a user-provided value is never overridden because of the guard above.
    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type (Debug or Release)" FORCE)
    message(WARNING "No Build type is set. Using Release!")
  endif()
  message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}")
endif()
16+
17+
# Setup compile flags.
# GNU, Clang and AppleClang all accept the same GCC-style options, so they share one branch
# (AppleClang reports its own compiler ID and would otherwise get no flags at all).
if(CMAKE_CXX_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
  add_compile_options(-Wall -Wextra -Wpedantic)
  # Appended after CMake's per-configuration defaults so these optimization levels are the
  # last ones on the command line.
  string(APPEND CMAKE_CXX_FLAGS_RELEASE " -O2")
  string(APPEND CMAKE_CXX_FLAGS_DEBUG " -g -Og")
endif()

# Project-wide C++ language level.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_VERBOSE_MAKEFILE OFF)
30+
31+
# Third-party dependencies are downloaded at configure time through FetchContent.
include(FetchContent)

# Catch2 (unit-test framework), pinned to a release tag.
FetchContent_Declare(Catch2
  GIT_REPOSITORY https://github.com/catchorg/Catch2.git
  GIT_TAG v3.8.1)
FetchContent_MakeAvailable(Catch2)

# Google Benchmark, pinned to a release tag.
# The BENCHMARK_* switches below are consumed by Google Benchmark's own CMake project and
# must therefore be set before it is made available.
# set(BENCHMARK_DOWNLOAD_DEPENDENCIES on)
set(BENCHMARK_ENABLE_GTEST_TESTS off)
set(BENCHMARK_ENABLE_TESTING off)

FetchContent_Declare(google_benchmark
  GIT_REPOSITORY https://github.com/google/benchmark
  GIT_TAG v1.9.2)
FetchContent_MakeAvailable(google_benchmark)
51+
52+
# Source lists consumed by the executables declared further down.
# SRC_FILES:   assembly kernels shared by every executable.
# TEST_FILES:  Catch2 unit tests.
# BENCH_FILES: Google Benchmark drivers.
set(SRC_FILES neon_4_1.s neon_4_2.s)
set(TEST_FILES neon_4.test.cpp)
set(BENCH_FILES neon_4.bench.cpp)
65+
66+
# Unit-test executable: assembly kernels plus the Catch2 test sources.
add_executable(tests ${SRC_FILES} ${TEST_FILES})
target_link_libraries(tests PRIVATE Catch2::Catch2WithMain)

# Benchmark executable: assembly kernels plus the Google Benchmark drivers.
add_executable(benchmarks ${SRC_FILES} ${BENCH_FILES})
target_link_libraries(benchmarks PRIVATE benchmark::benchmark_main)

# Catch2 ships its CTest integration (Catch.cmake) in <source>/extras; put that directory on
# the module path so that include(Catch) can find it.
list(APPEND CMAKE_MODULE_PATH "${catch2_SOURCE_DIR}/extras")
include(CTest)
include(Catch)
catch_discover_tests(tests)
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include <benchmark/benchmark.h>
2+
#include "neon_4.h"
3+
4+
template <uint32_t TMdim, uint32_t TNdim, uint32_t TKdim>
5+
class GemmMxNxKFixture : public benchmark::Fixture
6+
{
7+
public:
8+
float matrix_a[TMdim * TKdim];
9+
float matrix_b[TKdim * TNdim];
10+
float matrix_c[TMdim * TNdim];
11+
double flops;
12+
13+
void SetUp(::benchmark::State &_) override
14+
{
15+
flops = 0;
16+
17+
fill_random_matrix(matrix_a);
18+
fill_random_matrix(matrix_b);
19+
fill_random_matrix(matrix_c);
20+
}
21+
22+
void TearDown(::benchmark::State &state) override
23+
{
24+
state.counters["FLOPS"] = benchmark::Counter(flops, benchmark::Counter::kIsRate);
25+
}
26+
};
27+
28+
// Benchmark of the matmul_14_6_64 kernel.
// The benchmark is named after the kernel it measures; reusing the name BM_matmul_15_6_64
// here would define the same benchmark class twice in this file.
// The fixture buffers are allocated with 16 rows, hence lda = ldc = 16.
BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_14_6_64, 16, 6, 64)(benchmark::State &state)
{
    for (auto _ : state)
    {
        matmul_14_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);
    }

    // NOTE(review): this counts 16 rows per column (4 vector FMLAs of 4 lanes), not the
    // 14 rows named by the kernel; confirm which figure should be reported.
    flops = (4 * 6 * 4 * 2) * 64; // (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop
    flops *= state.iterations();
}

BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_14_6_64)->MinWarmUpTime(1.0); // WarmUp in seconds
40+
41+
// Benchmark of the matmul_15_6_64 kernel on fixture buffers with 16 rows (lda = ldc = 16).
BENCHMARK_TEMPLATE_DEFINE_F(GemmMxNxKFixture, BM_matmul_15_6_64, 16, 6, 64)(benchmark::State &state)
{
    for (auto _ : state)
    {
        matmul_15_6_64(matrix_a, matrix_b, matrix_c, 16, 64, 16);
    }

    // Per kernel call: (4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns) * 64 K-Loop,
    // scaled by the number of timed iterations.
    const double flops_per_call = (4 * 6 * 4 * 2) * 64;
    flops = flops_per_call * state.iterations();
}

// Warm-up time is given in seconds.
BENCHMARK_REGISTER_F(GemmMxNxKFixture, BM_matmul_15_6_64)->MinWarmUpTime(1.0);
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#ifndef NEON_4_H
2+
#define NEON_4_H
3+
4+
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iterator>
6+
7+
// Kernels with C linkage; their definitions are not part of this header.
extern "C"
{
    /**
     * @brief GEMM kernel for M=14, N=6 that loops over K=64.
     *
     * All matrices are stored column-major.
     *
     * @param a   Pointer to matrix A.
     * @param b   Pointer to matrix B.
     * @param c   Pointer to matrix C.
     * @param lda Leading dimension of A.
     * @param ldb Leading dimension of B.
     * @param ldc Leading dimension of C.
     */
    void matmul_14_6_64(const float *a, const float *b, float *c, int64_t lda, int64_t ldb, int64_t ldc);

    /**
     * @brief GEMM kernel for M=15, N=6 that loops over K=64.
     *
     * All matrices are stored column-major.
     *
     * @param a   Pointer to matrix A.
     * @param b   Pointer to matrix B.
     * @param c   Pointer to matrix C.
     * @param lda Leading dimension of A.
     * @param ldb Leading dimension of B.
     * @param ldc Leading dimension of C.
     */
    void matmul_15_6_64(const float *a, const float *b, float *c, int64_t lda, int64_t ldb, int64_t ldc);
}
31+
32+
/// @brief Seed std::rand exactly once per process.
///
/// Kept outside the template so that every instantiation of fill_random_matrix shares the
/// same one-time seeding instead of reseeding per matrix size.
inline void neon_4_seed_random_once()
{
    static const bool seeded = []
    {
        std::srand(static_cast<unsigned int>(std::time(nullptr)));
        return true;
    }();
    (void)seeded;
}

/// @brief Fill the given matrix with random values.
///
/// Each element is the quotient of two std::rand() draws. The generator is seeded only on
/// the first call: reseeding from the wall clock on every call would restart the same
/// sequence for all matrices filled within one second. The divisor is offset by one so that
/// a zero draw cannot produce inf or NaN.
///
/// @tparam TSize The total size of the matrix.
/// @param matrix The matrix to write to.
template <uint32_t TSize>
void fill_random_matrix(float (&matrix)[TSize])
{
    neon_4_seed_random_once();
    for (size_t i = 0; i < TSize; i++)
    {
        const float numerator = static_cast<float>(std::rand());
        const float denominator = static_cast<float>(std::rand()) + 1.0f;
        matrix[i] = numerator / denominator;
    }
}
44+
45+
/// @brief Write the sequence 0, 1, 2, ... into the matrix, one value per element.
/// @tparam TSize Number of elements in the matrix.
/// @param matrix Destination array; element i receives the value i.
template <uint32_t TSize>
void fill_counting_matrix(float (&matrix)[TSize])
{
    size_t index = 0;
    for (float &element : matrix)
    {
        element = static_cast<float>(index);
        ++index;
    }
}
56+
57+
/// @brief Copy every element of one matrix into another matrix of the same size.
/// @tparam TSize Number of elements in both matrices.
/// @param input Source matrix.
/// @param output Destination matrix; overwritten element by element.
template <uint32_t TSize>
void copy_matrix(float (&input)[TSize], float (&output)[TSize])
{
    for (size_t index = 0; index < TSize; ++index)
    {
        output[index] = input[index];
    }
}
66+
67+
/// @brief Reference GEMM on column-major data: C [MxN] += A [MxK] * B [KxN].
///
/// Every element of C is read, accumulated over K in ascending order, and written back.
///
/// @tparam TMDim Extent of the M dimension.
/// @tparam TNDim Extent of the N dimension.
/// @tparam TKDim Extent of the K dimension.
/// @param a Pointer to matrix A.
/// @param b Pointer to matrix B.
/// @param c Pointer to matrix C (accumulated in place).
/// @param lda Leading dimension of A.
/// @param ldb Leading dimension of B.
/// @param ldc Leading dimension of C.
template <uint32_t TMDim, uint32_t TNDim, uint32_t TKDim>
void naive_matmul_M_N_K(const float *__restrict__ a, const float *__restrict__ b, float *__restrict__ c,
                        int64_t lda, int64_t ldb, int64_t ldc)
{
    for (size_t col = 0; col < TNDim; ++col)
    {
        for (size_t row = 0; row < TMDim; ++row)
        {
            float *target = &c[row + col * ldc];
            float sum = *target;
            for (size_t k = 0; k < TKDim; ++k)
            {
                sum += a[row + k * lda] * b[k + col * ldb];
            }
            *target = sum;
        }
    }
}
92+
#endif // NEON_4_H
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#include <catch2/catch_test_macros.hpp>
2+
#include <catch2/matchers/catch_matchers_floating_point.hpp>
3+
#include <cstdint>
4+
#include "neon_4.h"
5+
6+
template <uint TSize>
7+
void verify_matmul(const float (&expected)[TSize], const float (&result)[TSize])
8+
{
9+
for (size_t i = 0; i < TSize; i++)
10+
{
11+
CAPTURE(i, result[i], expected[i]);
12+
REQUIRE_THAT(result[i], Catch::Matchers::WithinRel(expected[i]));
13+
}
14+
}
15+
16+
// Compares matmul_14_6_64 against the naive reference on random input.
TEST_CASE("Test 14x6x64 gemm correctness random data", "[neon_4][correctness][gemm]")
{
    constexpr uint32_t kM = 14;
    constexpr uint32_t kN = 6;
    constexpr uint32_t kK = 64;

    float lhs[kM * kK];
    float rhs[kK * kN];
    float actual[kM * kN];
    float reference[kM * kN];

    fill_random_matrix(lhs);
    fill_random_matrix(rhs);
    fill_random_matrix(actual);
    // Both implementations start from the same initial C.
    copy_matrix(actual, reference);

    matmul_14_6_64(lhs, rhs, actual, kM, kK, kM);
    naive_matmul_M_N_K<kM, kN, kK>(lhs, rhs, reference, kM, kK, kM);

    verify_matmul(reference, actual);
}
34+
35+
// Compares matmul_14_6_64 against the naive reference on counting (0, 1, 2, ...) input.
TEST_CASE("Test 14x6x64 gemm correctness counting data", "[neon_4][correctness][gemm]")
{
    constexpr uint32_t kM = 14;
    constexpr uint32_t kN = 6;
    constexpr uint32_t kK = 64;

    float lhs[kM * kK];
    float rhs[kK * kN];
    float actual[kM * kN];
    float reference[kM * kN];

    fill_counting_matrix(lhs);
    fill_counting_matrix(rhs);
    fill_counting_matrix(actual);
    // Both implementations start from the same initial C.
    copy_matrix(actual, reference);

    matmul_14_6_64(lhs, rhs, actual, kM, kK, kM);
    naive_matmul_M_N_K<kM, kN, kK>(lhs, rhs, reference, kM, kK, kM);

    verify_matmul(reference, actual);
}
53+
54+
// Compares matmul_15_6_64 against the naive reference on random input.
TEST_CASE("Test 15x6x64 gemm correctness random data", "[neon_4][correctness][gemm]")
{
    constexpr uint32_t kM = 15;
    constexpr uint32_t kN = 6;
    constexpr uint32_t kK = 64;

    float lhs[kM * kK];
    float rhs[kK * kN];
    float actual[kM * kN];
    float reference[kM * kN];

    fill_random_matrix(lhs);
    fill_random_matrix(rhs);
    fill_random_matrix(actual);
    // Both implementations start from the same initial C.
    copy_matrix(actual, reference);

    matmul_15_6_64(lhs, rhs, actual, kM, kK, kM);
    naive_matmul_M_N_K<kM, kN, kK>(lhs, rhs, reference, kM, kK, kM);

    verify_matmul(reference, actual);
}
72+
73+
// Compares matmul_15_6_64 against the naive reference on counting (0, 1, 2, ...) input.
TEST_CASE("Test 15x6x64 gemm correctness counting data", "[neon_4][correctness][gemm]")
{
    constexpr uint32_t kM = 15;
    constexpr uint32_t kN = 6;
    constexpr uint32_t kK = 64;

    float lhs[kM * kK];
    float rhs[kK * kN];
    float actual[kM * kN];
    float reference[kM * kN];

    fill_counting_matrix(lhs);
    fill_counting_matrix(rhs);
    fill_counting_matrix(actual);
    // Both implementations start from the same initial C.
    copy_matrix(actual, reference);

    matmul_15_6_64(lhs, rhs, actual, kM, kK, kM);
    naive_matmul_M_N_K<kM, kN, kK>(lhs, rhs, reference, kM, kK, kM);

    verify_matmul(reference, actual);
}

0 commit comments

Comments
 (0)