
Commit 68ace7d

feat: finished task neon 3

1 parent a7c0500 commit 68ace7d
15 files changed: +908 −94 lines

submissions/submission_25_05_01/CMakeLists.txt

Lines changed: 8 additions & 0 deletions

@@ -53,16 +53,24 @@ FetchContent_MakeAvailable(google_benchmark)
 set(SRC_FILES
     neon_1_1.s
     neon_1_2.s
+
     neon_2_1_simple.s
     neon_2_1_unrolled.s
+    neon_2_1_optimized.s
+
+    neon_3_1.s
+    neon_3_2.s
+    neon_3_3.s
 )

 set(TEST_FILES
     neon_2_1.test.cpp
+    neon_3.test.cpp
 )

 set(BENCH_FILES
     neon_2_1.bench.cpp
+    neon_3.bench.cpp
 )

 add_executable(neon_1_1 "${SRC_FILES}"

submissions/submission_25_05_01/neon_1_1_driver.cpp

Lines changed: 6 additions & 6 deletions

@@ -41,19 +41,19 @@ int main()
(The six changed lines are re-indented only; their text is unchanged. The hunk after the change:)

    const auto end_throughput_fmla_2s = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> diff_throughput_fmla_2s = end_throughput_fmla_2s - start_throughput_fmla_2s;
    std::cout << "Executed " << run_instructions_throughput_fmla_2s << " \"FMLA 2s\" Instructions in " << diff_throughput_fmla_2s.count() << " milliseconds." << std::endl
              << "Resulting in a Throughput of " << run_instructions_throughput_fmla_2s / diff_throughput_fmla_2s.count() * 1000 << " Instructions per Second!"
              << std::endl
              << std::endl;

    std::cout << "Running the Throughput \"FMADD\" Benchmark:" << std::endl;
    const auto start_throughput_fmadd = std::chrono::high_resolution_clock::now();
    const uint64_t run_instructions_throughput_fmadd = throughput_fmadd(repetitions) * repetitions;
    const auto end_throughput_fmadd = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> diff_throughput_fmadd = end_throughput_fmadd - start_throughput_fmadd;
    std::cout << "Executed " << run_instructions_throughput_fmadd << " \"FMADD\" Instructions in " << diff_throughput_fmadd.count() << " milliseconds." << std::endl
              << "Resulting in a Throughput of " << run_instructions_throughput_fmadd / diff_throughput_fmadd.count() * 1000 << " Instructions per Second!"
              << std::endl
              << std::endl;

    return 0;
}
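For reference, the throughput value printed by this driver is instructions per millisecond scaled up to a per-second figure. With illustrative numbers (not taken from this commit): executing 100,000,000 instructions in 25 ms gives 100,000,000 / 25 * 1000 = 4e9 instructions per second.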
Lines changed: 35 additions & 35 deletions

(The file name for this section is missing from the captured page. All 35 changed lines are re-indented only; their text is unchanged. The file after the change:)

@@ -1,44 +1,44 @@
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <chrono>

extern "C"
{
    /// @brief Execute the fmla 4s instruction for latency benchmarks with dependency on one of the source registers
    /// @param iterations The number of iterations the instructions are run.
    /// @return The number of processed instructions in a single loop.
    uint64_t latency_fmla_4s_source(uint64_t iterations);

    /// @brief Execute the fmla 4s instruction for latency benchmarks with dependency on the destination register
    /// @param iterations The number of iterations the instructions are run.
    /// @return The number of processed instructions in a single loop.
    uint64_t latency_fmla_4s_destination(uint64_t iterations);
}

int main()
{
    const uint64_t repetitions = 1'000'000;

    std::cout << "Running the Latency \"FMLA 4s\" Benchmark with dependency on one of the source registers:" << std::endl;
    const auto start_latency_fmla_4s_source = std::chrono::high_resolution_clock::now();
    const uint64_t run_instructions_latency_fmla_4s_source = latency_fmla_4s_source(repetitions) * repetitions;
    const auto end_latency_fmla_4s_source = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> diff_latency_fmla_4s_source = end_latency_fmla_4s_source - start_latency_fmla_4s_source;
    std::cout << "Executed " << run_instructions_latency_fmla_4s_source << " \"FMLA 4s\" Instructions in " << diff_latency_fmla_4s_source.count() << " milliseconds on a single Unit." << std::endl
              << "Resulting in a Throughput of " << run_instructions_latency_fmla_4s_source / diff_latency_fmla_4s_source.count() * 1000 << " Instructions per Second!"
              << std::endl
              << std::endl;

    std::cout << "Running the Latency \"FMLA 4s\" Benchmark with dependency on one of the destination registers:" << std::endl;
    const auto start_latency_fmla_4s_destination = std::chrono::high_resolution_clock::now();
    const uint64_t run_instructions_latency_fmla_4s_destination = latency_fmla_4s_destination(repetitions) * repetitions;
    const auto end_latency_fmla_4s_destination = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> diff_latency_fmla_4s_destination = end_latency_fmla_4s_destination - start_latency_fmla_4s_destination;
    std::cout << "Executed " << run_instructions_latency_fmla_4s_destination << " \"FMLA 4s\" Instructions in " << diff_latency_fmla_4s_destination.count() << " milliseconds on a single Unit." << std::endl
              << "Resulting in a Throughput of " << run_instructions_latency_fmla_4s_destination / diff_latency_fmla_4s_destination.count() * 1000 << " Instructions per Second!"
              << std::endl
              << std::endl;

    return 0;
}
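The two latency kernels are only declared here; their hand-written assembly is not part of this view. As a rough illustration of what "dependency on the destination register" versus "dependency on one of the source registers" means for FMLA, here is a sketch in C++ using NEON intrinsics instead of the repository's assembly; the function and variable names are invented for the example, and in the real benchmark the chain is written in assembly so the compiler cannot reorder or remove it.

#include <arm_neon.h>
#include <cstdint>

// Sketch only: every FMLA reads the previous result through its accumulator
// (the destination register), so the chain runs at the instruction's latency,
// not its throughput.
float32x4_t latency_chain_destination(uint64_t iterations)
{
    float32x4_t acc = vdupq_n_f32(1.0f);
    const float32x4_t a = vdupq_n_f32(1.000001f);
    const float32x4_t b = vdupq_n_f32(0.999999f);
    for (uint64_t i = 0; i < iterations; ++i)
    {
        // fmla acc.4s, a.4s, b.4s  -- next iteration reads acc again
        acc = vfmaq_f32(acc, a, b);
    }
    return acc;
}

// Sketch only: here the previous result feeds one of the multiplied source
// operands of the next FMLA, while the accumulator is re-initialized, so the
// dependency runs through a source register instead.
float32x4_t latency_chain_source(uint64_t iterations)
{
    float32x4_t chain = vdupq_n_f32(1.000001f);
    const float32x4_t b = vdupq_n_f32(0.999999f);
    for (uint64_t i = 0; i < iterations; ++i)
    {
        float32x4_t dst = vdupq_n_f32(0.0f);   // fresh accumulator each iteration
        dst = vfmaq_f32(dst, chain, b);        // fmla dst.4s, chain.4s, b.4s
        chain = dst;                           // result becomes the next source operand
    }
    return chain;
}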

submissions/submission_25_05_01/neon_2_1.bench.cpp

Lines changed: 32 additions & 20 deletions

@@ -4,54 +4,66 @@
 class Gemm16x6x1Fixture : public benchmark::Fixture
 {
 public:
-
-    float matrix_a[16*1];
-    float matrix_b[1*6];
-    float matrix_c[16*6];
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
     double flops;

-    void SetUp(::benchmark::State& _)
+    void SetUp(::benchmark::State &_) override
     {
         flops = 0;

         // Fill with random values
         std::srand(std::time(0));
-        for (size_t i = 0; i < 16*1; i++)
+        for (size_t i = 0; i < 16 * 1; i++)
         {
-            matrix_c[i] = (static_cast<float>(std::rand()))/(static_cast<float>(std::rand()));
+            matrix_c[i] = (static_cast<float>(std::rand())) / (static_cast<float>(std::rand()));
         }
-        for (size_t i = 0; i < 1*6; i++)
+        for (size_t i = 0; i < 1 * 6; i++)
         {
-            matrix_b[i] = (static_cast<float>(std::rand()))/(static_cast<float>(std::rand()));
+            matrix_b[i] = (static_cast<float>(std::rand())) / (static_cast<float>(std::rand()));
         }
-        for (size_t i = 0; i < 16*6; i++)
+        for (size_t i = 0; i < 16 * 6; i++)
         {
-            matrix_c[i] = (static_cast<float>(std::rand()))/(static_cast<float>(std::rand()));
+            matrix_c[i] = (static_cast<float>(std::rand())) / (static_cast<float>(std::rand()));
         }
-
     }

-    void TearDown(::benchmark::State& state)
+    void TearDown(::benchmark::State &state) override
     {
         state.counters["FLOPS"] = benchmark::Counter(flops, benchmark::Counter::kIsRate);
     }
 };

-BENCHMARK_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_simple)(benchmark::State& state)
+BENCHMARK_DEFINE_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_simple)(benchmark::State &state)
 {
-    ReportAggregatesOnly(true);
     for (auto _ : state)
     {
         matmul_16_6_1_simple(matrix_a, matrix_b, matrix_c, 16, 1, 16);
-        flops += 4*6*4*2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
+        flops += 4 * 6 * 4 * 2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
     }
-}
+}
+
+BENCHMARK_REGISTER_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_simple)->MinWarmUpTime(1.0); // WarmUp in seconds

-BENCHMARK_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_unrolled)(benchmark::State& state)
+BENCHMARK_DEFINE_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_unrolled)(benchmark::State &state)
 {
     for (auto _ : state)
     {
         matmul_16_6_1_unrolled(matrix_a, matrix_b, matrix_c, 16, 1, 16);
-        flops += 4*6*4*2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
+        flops += 4 * 6 * 4 * 2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
+    }
+}
+
+BENCHMARK_REGISTER_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_unrolled)->MinWarmUpTime(1.0); // WarmUp in seconds
+
+BENCHMARK_DEFINE_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_optimized)(benchmark::State &state)
+{
+    for (auto _ : state)
+    {
+        matmul_16_6_1_optimized(matrix_a, matrix_b, matrix_c, 16, 1, 16);
+        flops += 4 * 6 * 4 * 2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
     }
-}
+}
+
+BENCHMARK_REGISTER_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_optimized)->MinWarmUpTime(1.0); // WarmUp in seconds
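The switch from BENCHMARK_F to the BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F pair is what makes the chained ->MinWarmUpTime(1.0) call possible: BENCHMARK_F defines and registers the fixture benchmark in a single macro, whereas BENCHMARK_REGISTER_F returns the benchmark object so options can be attached. A minimal, self-contained illustration of the pattern (toy fixture and benchmark names, not the ones in this repository; MinWarmUpTime needs a reasonably recent Google Benchmark release):

#include <benchmark/benchmark.h>
#include <vector>

class ToyFixture : public benchmark::Fixture
{
public:
    std::vector<float> data;

    void SetUp(::benchmark::State &) override { data.assign(1024, 1.0f); }
    void TearDown(::benchmark::State &) override {}
};

// Defines the benchmark body only; nothing is registered yet.
BENCHMARK_DEFINE_F(ToyFixture, BM_sum)(benchmark::State &state)
{
    for (auto _ : state)
    {
        float sum = 0.0f;
        for (float v : data)
            sum += v;
        benchmark::DoNotOptimize(sum); // keep the loop from being optimized away
    }
}

// Registration returns the benchmark object, so run options such as the
// minimum warm-up time can be chained before the benchmark executes.
BENCHMARK_REGISTER_F(ToyFixture, BM_sum)->MinWarmUpTime(1.0); // seconds

BENCHMARK_MAIN();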

submissions/submission_25_05_01/neon_2_1.h

Lines changed: 11 additions & 0 deletions

@@ -28,6 +28,17 @@ extern "C"
     **/
     void matmul_16_6_1_unrolled(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c,
                                 int64_t lda, int64_t ldb, int64_t ldc);
+
+    /**
+     * @param a pointer to column-major matrix A.
+     * @param b pointer to column-major matrix B.
+     * @param c pointer to column-major matrix C.
+     * @param lda leading dimension of A.
+     * @param ldb leading dimension of B.
+     * @param ldc leading dimension of C.
+     **/
+    void matmul_16_6_1_optimized(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c,
+                                 int64_t lda, int64_t ldb, int64_t ldc);
 }

 void naive_matmul_16_6_1(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c,
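The new declaration follows the same column-major convention and leading-dimension parameters as the existing kernels, and the drivers call it with lda = 16, ldb = 1, ldc = 16. As a reminder of what that indexing means for the 16x6x1 case, here is a plain scalar sketch in the spirit of naive_matmul_16_6_1 (an illustration, not the repository's implementation): C (16x6) += A (16x1) * B (1x6).

#include <cstdint>

// Scalar sketch of a column-major 16x6x1 GEMM-style update: c[m, n] += a[m, 0] * b[0, n].
// Column-major: element (row, col) of a matrix with leading dimension ld sits at [row + col * ld].
void matmul_16_6_1_reference(const float *a, const float *b, float *c,
                             int64_t lda, int64_t ldb, int64_t ldc)
{
    for (int64_t n = 0; n < 6; ++n)       // columns of C and B
    {
        for (int64_t m = 0; m < 16; ++m)  // rows of C and A
        {
            // k = 0 is the only inner-product term, since K = 1.
            c[m + n * ldc] += a[m + 0 * lda] * b[0 + n * ldb];
        }
    }
}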

submissions/submission_25_05_01/neon_2_1.test.cpp

Lines changed: 32 additions & 15 deletions

@@ -3,24 +3,24 @@
 #include <cstdint>
 #include "neon_2_1.h"

-
-void verify_matmul_16_6_1(const float * __restrict__ expected, const float * __restrict__ result)
+void verify_matmul_16_6_1(const float *__restrict__ expected, const float *__restrict__ result)
 {
     for (size_t i = 0; i < 6; i++)
     {
         for (size_t j = 0; j < 16; j++)
         {
             REQUIRE_THAT(result[j + i * 16], Catch::Matchers::WithinRel(expected[j + i * 16]));
-        }
+        }
     }
 }

-TEST_CASE("Test 16x6x1 simple gemm correctness random data", "[neon_2_1][correctness][gemm]") {
-    float matrix_a[16*1];
-    float matrix_b[1*6];
-    float matrix_c[16*6];
-    float matrix_c_verify[16*6];
-
+TEST_CASE("Test 16x6x1 simple gemm correctness random data", "[neon_2_1][correctness][gemm]")
+{
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
+    float matrix_c_verify[16 * 6];
+
     fill_matmul_16_6_1(matrix_a, matrix_b, matrix_c, matrix_c_verify);

     // Run matmuls

@@ -30,12 +30,13 @@ TEST_CASE("Test 16x6x1 simple gemm correctness random data", "[neon_2_1][correct
     verify_matmul_16_6_1(matrix_c_verify, matrix_c);
 }

-TEST_CASE("Test 16x6x1 unrolled gemm correctness random data", "[neon_2_1][correctness][gemm]") {
-    float matrix_a[16*1];
-    float matrix_b[1*6];
-    float matrix_c[16*6];
-    float matrix_c_verify[16*6];
-
+TEST_CASE("Test 16x6x1 unrolled gemm correctness random data", "[neon_2_1][correctness][gemm]")
+{
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
+    float matrix_c_verify[16 * 6];
+
     fill_matmul_16_6_1(matrix_a, matrix_b, matrix_c, matrix_c_verify);

     // Run matmuls

@@ -44,3 +45,19 @@ TEST_CASE("Test 16x6x1 unrolled gemm correctness random data", "[neon_2_1][corre

     verify_matmul_16_6_1(matrix_c_verify, matrix_c);
 }
+
+TEST_CASE("Test 16x6x1 optimized gemm correctness random data", "[neon_2_1][correctness][gemm]")
+{
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
+    float matrix_c_verify[16 * 6];
+
+    fill_matmul_16_6_1(matrix_a, matrix_b, matrix_c, matrix_c_verify);
+
+    // Run matmuls
+    matmul_16_6_1_optimized(matrix_a, matrix_b, matrix_c, 16, 1, 16);
+    naive_matmul_16_6_1(matrix_a, matrix_b, matrix_c_verify, 16, 1, 16);
+
+    verify_matmul_16_6_1(matrix_c_verify, matrix_c);
+}
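fill_matmul_16_6_1 is declared elsewhere in the submission and is not part of this diff. Judging from how it is used above, it presumably fills A and B with random values and initializes C and the verification copy identically, so that the NEON kernel and the naive reference start from the same accumulator. A purely hypothetical sketch of such a helper (the real one may differ):

#include <cstddef>
#include <cstdlib>
#include <ctime>

// Hypothetical sketch of a fill helper for the 16x6x1 test setup; the actual
// fill_matmul_16_6_1 in the submission is not shown in this commit view.
void fill_matmul_16_6_1_sketch(float *a, float *b, float *c, float *c_verify)
{
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    for (std::size_t i = 0; i < 16 * 1; i++)
        a[i] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
    for (std::size_t i = 0; i < 1 * 6; i++)
        b[i] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
    for (std::size_t i = 0; i < 16 * 6; i++)
    {
        c[i] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
        c_verify[i] = c[i]; // both matmuls must accumulate onto the same C
    }
}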

submissions/submission_25_05_01/neon_2_1_driver.cpp

Lines changed: 15 additions & 15 deletions

@@ -2,23 +2,23 @@
 #include <cstdint>
 #include "neon_2_1.h"

-void matmul_16_6_1(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c,
-                   int64_t lda, int64_t ldb, int64_t ldc){
-    matmul_16_6_1_simple(a, b, c, lda, ldb, ldc);
-    // matmul_16_6_1_unrolled(a, b, c, lda, ldb, ldc);
-}
-
+void matmul_16_6_1(const float *__restrict__ a, const float *__restrict__ b, float *__restrict__ c,
+                   int64_t lda, int64_t ldb, int64_t ldc)
+{
+    matmul_16_6_1_simple(a, b, c, lda, ldb, ldc);
+    // matmul_16_6_1_unrolled(a, b, c, lda, ldb, ldc);
+}

-int main()
+int main()
 {
-    float matrix_a[16*1];
-    float matrix_b[1*6];
-    float matrix_c[16*6];
-    float matrix_c_verify[16*6];
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
+    float matrix_c_verify[16 * 6];

     // Fill with random values
     fill_matmul_16_6_1(matrix_a, matrix_b, matrix_c, matrix_c_verify);
-
+
     // Run matmuls
     matmul_16_6_1(matrix_a, matrix_b, matrix_c, 16, 1, 16);
     naive_matmul_16_6_1(matrix_a, matrix_b, matrix_c_verify, 16, 1, 16);

@@ -30,10 +30,10 @@ int main()
         for (size_t j = 0; j < 16; j++)
         {
             success_count += (std::abs(matrix_c[j + i * 16] - matrix_c_verify[j + i * 16]) < 0.01f);
-        }
+        }
     }
-
-    std::cout << success_count/static_cast<float>(16*6) * 100 << "% Successful" << std::endl;
+
+    std::cout << success_count / static_cast<float>(16 * 6) * 100 << "% Successful" << std::endl;

     return 0;
 }
