
Commit 68ace7d

feat: finished task neon 3

1 parent a7c0500 commit 68ace7d
15 files changed: +908 −94 lines

submissions/submission_25_05_01/CMakeLists.txt

Lines changed: 8 additions & 0 deletions

@@ -53,16 +53,24 @@ FetchContent_MakeAvailable(google_benchmark)
 set(SRC_FILES
     neon_1_1.s
     neon_1_2.s
+
     neon_2_1_simple.s
     neon_2_1_unrolled.s
+    neon_2_1_optimized.s
+
+    neon_3_1.s
+    neon_3_2.s
+    neon_3_3.s
 )

 set(TEST_FILES
     neon_2_1.test.cpp
+    neon_3.test.cpp
 )

 set(BENCH_FILES
     neon_2_1.bench.cpp
+    neon_3.bench.cpp
 )

 add_executable(neon_1_1 "${SRC_FILES}"

submissions/submission_25_05_01/neon_1_1_driver.cpp

Lines changed: 6 additions & 6 deletions

@@ -41,19 +41,19 @@ int main()
(The six changed lines are re-indented only; their text is unchanged. The hunk after the change:)

    const auto end_throughput_fmla_2s = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> diff_throughput_fmla_2s = end_throughput_fmla_2s - start_throughput_fmla_2s;
    std::cout << "Executed " << run_instructions_throughput_fmla_2s << " \"FMLA 2s\" Instructions in " << diff_throughput_fmla_2s.count() << " milliseconds." << std::endl
              << "Resulting in a Throughput of " << run_instructions_throughput_fmla_2s / diff_throughput_fmla_2s.count() * 1000 << " Instructions per Second!"
              << std::endl
              << std::endl;

    std::cout << "Running the Throughput \"FMADD\" Benchmark:" << std::endl;
    const auto start_throughput_fmadd = std::chrono::high_resolution_clock::now();
    const uint64_t run_instructions_throughput_fmadd = throughput_fmadd(repetitions) * repetitions;
    const auto end_throughput_fmadd = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> diff_throughput_fmadd = end_throughput_fmadd - start_throughput_fmadd;
    std::cout << "Executed " << run_instructions_throughput_fmadd << " \"FMADD\" Instructions in " << diff_throughput_fmadd.count() << " milliseconds." << std::endl
              << "Resulting in a Throughput of " << run_instructions_throughput_fmadd / diff_throughput_fmadd.count() * 1000 << " Instructions per Second!"
              << std::endl
              << std::endl;

    return 0;
}
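For reference, the throughput value printed by this driver is instructions per millisecond scaled up to a per-second figure. With illustrative numbers (not taken from this commit): executing 100,000,000 instructions in 25 ms gives 100,000,000 / 25 * 1000 = 4e9 instructions per second.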
Lines changed: 35 additions & 35 deletions

(The file name for this section is missing from the captured page. All 35 changed lines are re-indented only; their text is unchanged. The file after the change:)

@@ -1,44 +1,44 @@
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <chrono>

extern "C"
{
    /// @brief Execute the fmla 4s instruction for latency benchmarks with dependency on one of the source registers
    /// @param iterations The number of iterations the instructions are run.
    /// @return The number of processed instructions in a single loop.
    uint64_t latency_fmla_4s_source(uint64_t iterations);

    /// @brief Execute the fmla 4s instruction for latency benchmarks with dependency on the destination register
    /// @param iterations The number of iterations the instructions are run.
    /// @return The number of processed instructions in a single loop.
    uint64_t latency_fmla_4s_destination(uint64_t iterations);
}

int main()
{
    const uint64_t repetitions = 1'000'000;

    std::cout << "Running the Latency \"FMLA 4s\" Benchmark with dependency on one of the source registers:" << std::endl;
    const auto start_latency_fmla_4s_source = std::chrono::high_resolution_clock::now();
    const uint64_t run_instructions_latency_fmla_4s_source = latency_fmla_4s_source(repetitions) * repetitions;
    const auto end_latency_fmla_4s_source = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> diff_latency_fmla_4s_source = end_latency_fmla_4s_source - start_latency_fmla_4s_source;
    std::cout << "Executed " << run_instructions_latency_fmla_4s_source << " \"FMLA 4s\" Instructions in " << diff_latency_fmla_4s_source.count() << " milliseconds on a single Unit." << std::endl
              << "Resulting in a Throughput of " << run_instructions_latency_fmla_4s_source / diff_latency_fmla_4s_source.count() * 1000 << " Instructions per Second!"
              << std::endl
              << std::endl;

    std::cout << "Running the Latency \"FMLA 4s\" Benchmark with dependency on one of the destination registers:" << std::endl;
    const auto start_latency_fmla_4s_destination = std::chrono::high_resolution_clock::now();
    const uint64_t run_instructions_latency_fmla_4s_destination = latency_fmla_4s_destination(repetitions) * repetitions;
    const auto end_latency_fmla_4s_destination = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double, std::milli> diff_latency_fmla_4s_destination = end_latency_fmla_4s_destination - start_latency_fmla_4s_destination;
    std::cout << "Executed " << run_instructions_latency_fmla_4s_destination << " \"FMLA 4s\" Instructions in " << diff_latency_fmla_4s_destination.count() << " milliseconds on a single Unit." << std::endl
              << "Resulting in a Throughput of " << run_instructions_latency_fmla_4s_destination / diff_latency_fmla_4s_destination.count() * 1000 << " Instructions per Second!"
              << std::endl
              << std::endl;

    return 0;
}
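The two latency kernels are only declared here; their hand-written assembly is not part of this view. As a rough illustration of what "dependency on the destination register" versus "dependency on one of the source registers" means for FMLA, here is a sketch in C++ using NEON intrinsics instead of the repository's assembly; the function and variable names are invented for the example, and in the real benchmark the chain is written in assembly so the compiler cannot reorder or remove it.

#include <arm_neon.h>
#include <cstdint>

// Sketch only: every FMLA reads the previous result through its accumulator
// (the destination register), so the chain runs at the instruction's latency,
// not its throughput.
float32x4_t latency_chain_destination(uint64_t iterations)
{
    float32x4_t acc = vdupq_n_f32(1.0f);
    const float32x4_t a = vdupq_n_f32(1.000001f);
    const float32x4_t b = vdupq_n_f32(0.999999f);
    for (uint64_t i = 0; i < iterations; ++i)
    {
        // fmla acc.4s, a.4s, b.4s  -- next iteration reads acc again
        acc = vfmaq_f32(acc, a, b);
    }
    return acc;
}

// Sketch only: here the previous result feeds one of the multiplied source
// operands of the next FMLA, while the accumulator is re-initialized, so the
// dependency runs through a source register instead.
float32x4_t latency_chain_source(uint64_t iterations)
{
    float32x4_t chain = vdupq_n_f32(1.000001f);
    const float32x4_t b = vdupq_n_f32(0.999999f);
    for (uint64_t i = 0; i < iterations; ++i)
    {
        float32x4_t dst = vdupq_n_f32(0.0f);   // fresh accumulator each iteration
        dst = vfmaq_f32(dst, chain, b);        // fmla dst.4s, chain.4s, b.4s
        chain = dst;                           // result becomes the next source operand
    }
    return chain;
}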

submissions/submission_25_05_01/neon_2_1.bench.cpp

Lines changed: 32 additions & 20 deletions

@@ -4,54 +4,66 @@
 class Gemm16x6x1Fixture : public benchmark::Fixture
 {
 public:
-
-    float matrix_a[16*1];
-    float matrix_b[1*6];
-    float matrix_c[16*6];
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
     double flops;

-    void SetUp(::benchmark::State& _)
+    void SetUp(::benchmark::State &_) override
     {
         flops = 0;

         // Fill with random values
         std::srand(std::time(0));
-        for (size_t i = 0; i < 16*1; i++)
+        for (size_t i = 0; i < 16 * 1; i++)
         {
-            matrix_c[i] = (static_cast<float>(std::rand()))/(static_cast<float>(std::rand()));
+            matrix_c[i] = (static_cast<float>(std::rand())) / (static_cast<float>(std::rand()));
         }
-        for (size_t i = 0; i < 1*6; i++)
+        for (size_t i = 0; i < 1 * 6; i++)
         {
-            matrix_b[i] = (static_cast<float>(std::rand()))/(static_cast<float>(std::rand()));
+            matrix_b[i] = (static_cast<float>(std::rand())) / (static_cast<float>(std::rand()));
         }
-        for (size_t i = 0; i < 16*6; i++)
+        for (size_t i = 0; i < 16 * 6; i++)
         {
-            matrix_c[i] = (static_cast<float>(std::rand()))/(static_cast<float>(std::rand()));
+            matrix_c[i] = (static_cast<float>(std::rand())) / (static_cast<float>(std::rand()));
         }
-
     }

-    void TearDown(::benchmark::State& state)
+    void TearDown(::benchmark::State &state) override
     {
         state.counters["FLOPS"] = benchmark::Counter(flops, benchmark::Counter::kIsRate);
     }
 };

-BENCHMARK_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_simple)(benchmark::State& state)
+BENCHMARK_DEFINE_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_simple)(benchmark::State &state)
 {
-    ReportAggregatesOnly(true);
     for (auto _ : state)
     {
         matmul_16_6_1_simple(matrix_a, matrix_b, matrix_c, 16, 1, 16);
-        flops += 4*6*4*2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
+        flops += 4 * 6 * 4 * 2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
     }
-}
+}
+
+BENCHMARK_REGISTER_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_simple)->MinWarmUpTime(1.0); // WarmUp in seconds

-BENCHMARK_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_unrolled)(benchmark::State& state)
+BENCHMARK_DEFINE_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_unrolled)(benchmark::State &state)
 {
     for (auto _ : state)
     {
         matmul_16_6_1_unrolled(matrix_a, matrix_b, matrix_c, 16, 1, 16);
-        flops += 4*6*4*2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
+        flops += 4 * 6 * 4 * 2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
+    }
+}
+
+BENCHMARK_REGISTER_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_unrolled)->MinWarmUpTime(1.0); // WarmUp in seconds
+
+BENCHMARK_DEFINE_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_optimized)(benchmark::State &state)
+{
+    for (auto _ : state)
+    {
+        matmul_16_6_1_optimized(matrix_a, matrix_b, matrix_c, 16, 1, 16);
+        flops += 4 * 6 * 4 * 2; // 4 fmla * 4 floats each * 2 instructions (add & mul) * 6 columns
     }
-}
+}
+
+BENCHMARK_REGISTER_F(Gemm16x6x1Fixture, BM_matmul_16_6_1_optimized)->MinWarmUpTime(1.0); // WarmUp in seconds
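The switch from BENCHMARK_F to the BENCHMARK_DEFINE_F / BENCHMARK_REGISTER_F pair is what makes the chained ->MinWarmUpTime(1.0) call possible: BENCHMARK_F defines and registers the fixture benchmark in a single macro, whereas BENCHMARK_REGISTER_F returns the benchmark object so options can be attached. A minimal, self-contained illustration of the pattern (toy fixture and benchmark names, not the ones in this repository; MinWarmUpTime needs a reasonably recent Google Benchmark release):

#include <benchmark/benchmark.h>
#include <vector>

class ToyFixture : public benchmark::Fixture
{
public:
    std::vector<float> data;

    void SetUp(::benchmark::State &) override { data.assign(1024, 1.0f); }
    void TearDown(::benchmark::State &) override {}
};

// Defines the benchmark body only; nothing is registered yet.
BENCHMARK_DEFINE_F(ToyFixture, BM_sum)(benchmark::State &state)
{
    for (auto _ : state)
    {
        float sum = 0.0f;
        for (float v : data)
            sum += v;
        benchmark::DoNotOptimize(sum); // keep the loop from being optimized away
    }
}

// Registration returns the benchmark object, so run options such as the
// minimum warm-up time can be chained before the benchmark executes.
BENCHMARK_REGISTER_F(ToyFixture, BM_sum)->MinWarmUpTime(1.0); // seconds

BENCHMARK_MAIN();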

submissions/submission_25_05_01/neon_2_1.h

Lines changed: 11 additions & 0 deletions

@@ -28,6 +28,17 @@ extern "C"
     **/
     void matmul_16_6_1_unrolled(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c,
                                 int64_t lda, int64_t ldb, int64_t ldc);
+
+    /**
+     * @param a pointer to column-major matrix A.
+     * @param b pointer to column-major matrix B.
+     * @param c pointer to column-major matrix C.
+     * @param lda leading dimension of A.
+     * @param ldb leading dimension of B.
+     * @param ldc leading dimension of C.
+     **/
+    void matmul_16_6_1_optimized(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c,
+                                 int64_t lda, int64_t ldb, int64_t ldc);
 }

 void naive_matmul_16_6_1(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c,
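The new declaration follows the same column-major convention and leading-dimension parameters as the existing kernels, and the drivers call it with lda = 16, ldb = 1, ldc = 16. As a reminder of what that indexing means for the 16x6x1 case, here is a plain scalar sketch in the spirit of naive_matmul_16_6_1 (an illustration, not the repository's implementation): C (16x6) += A (16x1) * B (1x6).

#include <cstdint>

// Scalar sketch of a column-major 16x6x1 GEMM-style update: c[m, n] += a[m, 0] * b[0, n].
// Column-major: element (row, col) of a matrix with leading dimension ld sits at [row + col * ld].
void matmul_16_6_1_reference(const float *a, const float *b, float *c,
                             int64_t lda, int64_t ldb, int64_t ldc)
{
    for (int64_t n = 0; n < 6; ++n)       // columns of C and B
    {
        for (int64_t m = 0; m < 16; ++m)  // rows of C and A
        {
            // k = 0 is the only inner-product term, since K = 1.
            c[m + n * ldc] += a[m + 0 * lda] * b[0 + n * ldb];
        }
    }
}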

submissions/submission_25_05_01/neon_2_1.test.cpp

Lines changed: 32 additions & 15 deletions

@@ -3,24 +3,24 @@
 #include <cstdint>
 #include "neon_2_1.h"

-
-void verify_matmul_16_6_1(const float * __restrict__ expected, const float * __restrict__ result)
+void verify_matmul_16_6_1(const float *__restrict__ expected, const float *__restrict__ result)
 {
     for (size_t i = 0; i < 6; i++)
     {
         for (size_t j = 0; j < 16; j++)
         {
             REQUIRE_THAT(result[j + i * 16], Catch::Matchers::WithinRel(expected[j + i * 16]));
-        }
+        }
     }
 }

-TEST_CASE("Test 16x6x1 simple gemm correctness random data", "[neon_2_1][correctness][gemm]") {
-    float matrix_a[16*1];
-    float matrix_b[1*6];
-    float matrix_c[16*6];
-    float matrix_c_verify[16*6];
-
+TEST_CASE("Test 16x6x1 simple gemm correctness random data", "[neon_2_1][correctness][gemm]")
+{
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
+    float matrix_c_verify[16 * 6];
+
     fill_matmul_16_6_1(matrix_a, matrix_b, matrix_c, matrix_c_verify);

     // Run matmuls

@@ -30,12 +30,13 @@ TEST_CASE("Test 16x6x1 simple gemm correctness random data", "[neon_2_1][correct
     verify_matmul_16_6_1(matrix_c_verify, matrix_c);
 }

-TEST_CASE("Test 16x6x1 unrolled gemm correctness random data", "[neon_2_1][correctness][gemm]") {
-    float matrix_a[16*1];
-    float matrix_b[1*6];
-    float matrix_c[16*6];
-    float matrix_c_verify[16*6];
-
+TEST_CASE("Test 16x6x1 unrolled gemm correctness random data", "[neon_2_1][correctness][gemm]")
+{
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
+    float matrix_c_verify[16 * 6];
+
     fill_matmul_16_6_1(matrix_a, matrix_b, matrix_c, matrix_c_verify);

     // Run matmuls

@@ -44,3 +45,19 @@ TEST_CASE("Test 16x6x1 unrolled gemm correctness random data", "[neon_2_1][corre

     verify_matmul_16_6_1(matrix_c_verify, matrix_c);
 }
+
+TEST_CASE("Test 16x6x1 optimized gemm correctness random data", "[neon_2_1][correctness][gemm]")
+{
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
+    float matrix_c_verify[16 * 6];
+
+    fill_matmul_16_6_1(matrix_a, matrix_b, matrix_c, matrix_c_verify);
+
+    // Run matmuls
+    matmul_16_6_1_optimized(matrix_a, matrix_b, matrix_c, 16, 1, 16);
+    naive_matmul_16_6_1(matrix_a, matrix_b, matrix_c_verify, 16, 1, 16);
+
+    verify_matmul_16_6_1(matrix_c_verify, matrix_c);
+}
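fill_matmul_16_6_1 is declared elsewhere in the submission and is not part of this diff. Judging from how it is used above, it presumably fills A and B with random values and initializes C and the verification copy identically, so that the NEON kernel and the naive reference start from the same accumulator. A purely hypothetical sketch of such a helper (the real one may differ):

#include <cstddef>
#include <cstdlib>
#include <ctime>

// Hypothetical sketch of a fill helper for the 16x6x1 test setup; the actual
// fill_matmul_16_6_1 in the submission is not shown in this commit view.
void fill_matmul_16_6_1_sketch(float *a, float *b, float *c, float *c_verify)
{
    std::srand(static_cast<unsigned>(std::time(nullptr)));
    for (std::size_t i = 0; i < 16 * 1; i++)
        a[i] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
    for (std::size_t i = 0; i < 1 * 6; i++)
        b[i] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
    for (std::size_t i = 0; i < 16 * 6; i++)
    {
        c[i] = static_cast<float>(std::rand()) / static_cast<float>(RAND_MAX);
        c_verify[i] = c[i]; // both matmuls must accumulate onto the same C
    }
}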

submissions/submission_25_05_01/neon_2_1_driver.cpp

Lines changed: 15 additions & 15 deletions

@@ -2,23 +2,23 @@
 #include <cstdint>
 #include "neon_2_1.h"

-void matmul_16_6_1(const float * __restrict__ a, const float * __restrict__ b, float * __restrict__ c,
-                   int64_t lda, int64_t ldb, int64_t ldc){
-    matmul_16_6_1_simple(a, b, c, lda, ldb, ldc);
-    // matmul_16_6_1_unrolled(a, b, c, lda, ldb, ldc);
-}
-
+void matmul_16_6_1(const float *__restrict__ a, const float *__restrict__ b, float *__restrict__ c,
+                   int64_t lda, int64_t ldb, int64_t ldc)
+{
+    matmul_16_6_1_simple(a, b, c, lda, ldb, ldc);
+    // matmul_16_6_1_unrolled(a, b, c, lda, ldb, ldc);
+}

-int main()
+int main()
 {
-    float matrix_a[16*1];
-    float matrix_b[1*6];
-    float matrix_c[16*6];
-    float matrix_c_verify[16*6];
+    float matrix_a[16 * 1];
+    float matrix_b[1 * 6];
+    float matrix_c[16 * 6];
+    float matrix_c_verify[16 * 6];

     // Fill with random values
     fill_matmul_16_6_1(matrix_a, matrix_b, matrix_c, matrix_c_verify);
-
+
     // Run matmuls
     matmul_16_6_1(matrix_a, matrix_b, matrix_c, 16, 1, 16);
     naive_matmul_16_6_1(matrix_a, matrix_b, matrix_c_verify, 16, 1, 16);

@@ -30,10 +30,10 @@ int main()
         for (size_t j = 0; j < 16; j++)
         {
             success_count += (std::abs(matrix_c[j + i * 16] - matrix_c_verify[j + i * 16]) < 0.01f);
-        }
+        }
     }
-
-    std::cout << success_count/static_cast<float>(16*6) * 100 << "% Successful" << std::endl;
+
+    std::cout << success_count / static_cast<float>(16 * 6) * 100 << "% Successful" << std::endl;

     return 0;
 }
