
Commit 86a1527

perf: fixed the neon_3 loads stores
1 parent c6eca17 commit 86a1527
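
In the old neon_3_1 kernel, the full matmul_16_6_1 logic ran inside every iteration of the K loop, presumably including its loads and stores of the 16x6 block of C. The new version in the diff below loads that block into v5-v28 once before matmul_loop_over_K, accumulates all 64 rank-1 updates in registers, and writes the block back once after the loop. As a minimal scalar C sketch of that idea (not part of the commit: the function name, the argument order x0 = A, x1 = B, x2 = C, x3/x4/x5 = lda/ldb/ldc, and the column-major layout are assumptions read off the register usage in the diff):

    #include <stddef.h>

    /* Scalar sketch of the optimized neon_3_1 kernel: C += A * B for
     * M=16, N=6, K=64, column-major, leading dimensions lda/ldb/ldc.
     * acc[][] plays the role of v5..v28: C is loaded once, updated for
     * all k, and written back once, instead of being re-read and
     * re-written in every K iteration. Names are hypothetical. */
    void matmul_16_6_64_ref(const float *a, const float *b, float *c,
                            size_t lda, size_t ldb, size_t ldc)
    {
        float acc[6][16];

        /* hoisted loads: the six ld1 {...}, [x2], x5 before matmul_loop_over_K */
        for (size_t n = 0; n < 6; ++n)
            for (size_t m = 0; m < 16; ++m)
                acc[n][m] = c[n * ldc + m];

        for (size_t k = 0; k < 64; ++k) {          /* matmul_loop_over_K, x9 = 64..1 */
            for (size_t n = 0; n < 6; ++n) {
                float b_kn = b[n * ldb + k];       /* ldr s4, [x1]; add x1, x1, x4 */
                for (size_t m = 0; m < 16; ++m)    /* the four fmla per column of c */
                    acc[n][m] += a[k * lda + m] * b_kn;
            }
        }

        /* hoisted stores: the six st1 {...}, [x2], x5 after the loop */
        for (size_t n = 0; n < 6; ++n)
            for (size_t m = 0; m < 16; ++m)
                c[n * ldc + m] = acc[n][m];
    }
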

4 files changed: +481 −195 lines


docs_sphinx/submissions/report_25_05_01.rst

Lines changed: 183 additions & 140 deletions
@@ -221,171 +221,214 @@ Loops
 :linenos:

 ...
-matmul_16_6_64:
-...
-mov x6, x1 // Store the initial value of x1, to be restored in the next loop iteration
-mov x7, x2 // Store the initial value of x2, to be restored in the next loop iteration
+// Offset the used leading dimension by the size of floats
+lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
+lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
+lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)

-mov x9, #64 // x9 iterator for K loop
-matmul_loop_over_K:
-sub x9, x9, #1
+mov x6, x1 // Store the initial value of x1, to be restored in the next loop iteration
+mov x7, x2 // Store the initial value of x2, to be restored after the loop

-... logic of matmul_16_6_1 - neon_2_unrolled.s ...
+// Load first column from the 16x6 matrix c
+ld1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x2], x5
+// Load second column from the 16x6 matrix c
+ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x2], x5
+// Load third column from the 16x6 matrix c
+ld1 {v21.4s, v22.4s, v23.4s, v24.4s}, [x2], x5
+// Load fourth column from the 16x6 matrix c
+ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [x2], x5
+// Load fifth column from the 16x6 matrix c
+ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x2], x5
+// Load sixth column from the 16x6 matrix c
+ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [x2], x5
+
+mov x9, #64 // x9 iterator for K loop
+matmul_loop_over_K:
+sub x9, x9, #1
+
+// Load first column data from the 16x1 matrix a
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x3
+
+// run the known matmul_16_6_1_unrolled kernel
+// Load first element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-// offset x6 to the next element in the column
-add x6, x6, #4 // #4 = sizeof(float)
+// Calculate first column of c
+fmla v25.4s, v0.4s, v4.s[0]
+fmla v26.4s, v1.4s, v4.s[0]
+fmla v27.4s, v2.4s, v4.s[0]
+fmla v28.4s, v3.4s, v4.s[0]

-// Restore x1 and x2 to be incremented again
-mov x1, x6
-mov x2, x7

-// Loop back
-cbnz x9, matmul_loop_over_K
+// Load second element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-ret
-...
+// Calculate second column of c
+fmla v17.4s, v0.4s, v4.s[0]
+fmla v18.4s, v1.4s, v4.s[0]
+fmla v19.4s, v2.4s, v4.s[0]
+fmla v20.4s, v3.4s, v4.s[0]

+
+// Load third element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-2. Loop over M: Implement a kernel that computes C+=AB for M=64, N=6 and K=64.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-- File ``submissions/submission_25_05_01/neon_3_2.s``
+// Calculated third column of c
+fmla v21.4s, v0.4s, v4.s[0]
+fmla v22.4s, v1.4s, v4.s[0]
+fmla v23.4s, v2.4s, v4.s[0]
+fmla v24.4s, v3.4s, v4.s[0]

-.. code-block:: asm
-:linenos:

-...
-matmul_64_6_64:
-...
+// Load fourth element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
-mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
+// Calculate fourth column of c
+fmla v5.4s, v0.4s, v4.s[0]
+fmla v6.4s, v1.4s, v4.s[0]
+fmla v7.4s, v2.4s, v4.s[0]
+fmla v8.4s, v3.4s, v4.s[0]

-mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
-mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration

-mov x16, #4 // x16 iterator for M loop
-matmul_loop_over_M:
-sub x16, x16, #1
+// Load fifth element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-mov x15, #64 // x15 iterator for K loop
-matmul_loop_over_K:
-sub x15, x15, #1
+// Calculate fifth column of c
+fmla v9.4s, v0.4s, v4.s[0]
+fmla v10.4s, v1.4s, v4.s[0]
+fmla v11.4s, v2.4s, v4.s[0]
+fmla v12.4s, v3.4s, v4.s[0]

-... logic of matmul_16_6_1 - neon_2_unrolled.s ...
+
+// Load sixth element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-// offset x6 to the next element in the column
-add x6, x6, #4 // #4 = sizeof(float)
+// Calculated sixth column of c
+fmla v13.4s, v0.4s, v4.s[0]
+fmla v14.4s, v1.4s, v4.s[0]
+fmla v15.4s, v2.4s, v4.s[0]
+fmla v16.4s, v3.4s, v4.s[0]

-// Restore x1 and x2 to be incremented again
-mov x1, x6
-mov x2, x7

-// Loop back to K
-cbnz x15, matmul_loop_over_K
+// offset x6 to the next element in the column
+add x6, x6, #4 // #4 = sizeof(float)

-// next M iteration on the matrix c and matrix a, both need offset about 16 values
-// also matrix b needs to start at the initial location again
-// Updates for the matrix c
-add x7, x7, #16*4 // column height * sizeof(float)
-mov x2, x7 // also apply offset to x2
+// Restore x1 to be incremented again
+mov x1, x6

-// Updates for the matrix a
-add x8, x8, #16*4 // column height * sizeof(float)
-mov x0, x8 // also apply offset to x0
+// Loop back
+cbnz x9, matmul_loop_over_K

-// Updates for the matrix b
-mov x6, x9 // Update the restore register for x1 for the K loop
-mov x1, x9 // Update the x1 register itself
+// Restore initial value of x2 that was changed by the loads
+mov x2, x7

-// Loop back to M
-cbnz x16, matmul_loop_over_M
+// Store first column back to memory
+st1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x2], x5
+// Store second column back to memory
+st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x2], x5
+// Store third column back to memory
+st1 {v21.4s, v22.4s, v23.4s, v24.4s}, [x2], x5
+// Store fourth column back to memory
+st1 {v5.4s, v6.4s, v7.4s, v8.4s}, [x2], x5
+// Store fifth column back to memory
+st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x2], x5
+// Store sixth column back to memory
+st1 {v13.4s, v14.4s, v15.4s, v16.4s}, [x2], x5

-ret
-...

-3. Loop over N: Implement a kernel that computes C+=AB for M=64, N=48 and K=64.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-- File ``submissions/submission_25_05_01/neon_3_3.s``
+2. Loop over M: Implement a kernel that computes C+=AB for M=64, N=6 and K=64.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- File ``submissions/submission_25_05_01/neon_3_2.s``

 .. code-block:: asm
 :linenos:

-...
-matmul_64_48_64:
-...
+// Offset the used leading dimension by the size of floats
+lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
+lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
+lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)
+
+mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
+mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
+
+mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
+mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
+
+mov x16, #4 // x16 iterator for M loop
+matmul_loop_over_M:
+sub x16, x16, #1
+
+... <logic of loop over K - neon_3_1>

-mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
-mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
+// next M iteration on the matrix c and matrix a, both need offset about 16 values
+// also matrix b needs to start at the initial location again
+// Updates for the matrix c
+add x7, x7, #16*4 // column height * sizeof(float)
+mov x2, x7 // also apply offset to x2

-mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
-mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
+// Updates for the matrix a
+add x8, x8, #16*4 // column height * sizeof(float)
+mov x0, x8 // also apply offset to x0

-mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
-mov x11, x2 // Store the initial value of x2, to bes restored in the N loop iteration
-mov x12, #6 // hold the size of N that are processed in one loop, needed for offset calculation
+// Updates for the matrix b
+mov x6, x9 // Update the restore register for x1 for the K loop
+mov x1, x9 // Update the x1 register itself

-mov x17, #8 // x17 iterator for N loop
-matmul_loop_over_N:
-sub x17, x17, #1
+// Loop back to M
+cbnz x16, matmul_loop_over_M

-mov x16, #4 // x16 iterator for M loop
-matmul_loop_over_M:
-sub x16, x16, #1
+3. Loop over N: Implement a kernel that computes C+=AB for M=64, N=48 and K=64.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- File ``submissions/submission_25_05_01/neon_3_3.s``

-mov x15, #64 // x15 iterator for K loop
-matmul_loop_over_K:
-sub x15, x15, #1
+.. code-block:: asm
+:linenos:
+
+// Offset the used leading dimension by the size of floats
+lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
+lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
+lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)

-... logic of matmul_16_6_1 - neon_2_unrolled.s ...
+mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
+mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration

-// offset x6 to the next element in the column
-add x6, x6, #4 // #4 = sizeof(float)
+mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
+mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration

-// Restore x1 and x2 to be incremented again
-mov x1, x6
-mov x2, x7
+mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
+mov x11, x2 // Store the initial value of x2, to bes restored in the N loop iteration
+mov x12, #6 // hold the size of N that are processed in one loop, needed for offset calculation

-// Loop back to K
-cbnz x15, matmul_loop_over_K
+mov x17, #8 // x17 iterator for N loop
+matmul_loop_over_N:
+sub x17, x17, #1

-// next M iteration on the matrix c and matrix a, both need offset about 16 values
-// also matrix b needs to start at the initial location again
-// Updates for the matrix a
-add x8, x8, #16*4 // column height * sizeof(float)
-mov x0, x8 // also apply offset to x0
+... <logic of loop over M - neon_3_2>

-// Updates for the matrix c
-add x7, x7, #16*4 // column height * sizeof(float)
-mov x2, x7 // also apply offset to x2
+// next M iteration on the matrix b and matrix c, both need offset about 6*ldb/ldc values
+// also matrix a needs to start at the initial location again
+// Update for the matrix a
+mov x8, x10 // Update the restore register for x0 for the M loop
+mov x0, x10 // Update the x0 register itself

-// Updates for the matrix b
-mov x6, x9 // Update the restore register for x1 for the K loop
-mov x1, x9 // Update the x1 register itself
+// Updates for the matrix b
+madd x9, x4, x12, x9 // ldb * 6 + initial position
+mov x6, x9 // Update the restore register of x1 for the K loop
+mov x1, x9 // Update the x1 register itself

-// Loop back to M
-cbnz x16, matmul_loop_over_M
-
-// next M iteration on the matrix b and matrix c, both need offset about 6*ldb/ldc values
-// also matrix a needs to start at the initial location again
-// Update for the matrix a
-mov x8, x10 // Update the restore register for x0 for the M loop
-mov x0, x10 // Update the x0 register itself
-
-// Updates for the matrix b
-madd x9, x4, x12, x9 // ldb * 6 + initial position
-mov x6, x9 // Update the restore register of x1 for the K loop
-mov x1, x9 // Update the x1 register itself
-
-// Updates for the matrix c
-madd x11, x5, x12, x11 // ldc * 6 + initial position
-mov x7, x11 // Update the restore register of x2 for the K loop
-mov x2, x11 // Update the x2 register itself
-
-// Loop back to N
-cbnz x17, matmul_loop_over_N
-
-ret
-...
+// Updates for the matrix c
+madd x11, x5, x12, x11 // ldc * 6 + initial position
+mov x7, x11 // Update the restore register of x2 for the K loop
+mov x2, x11 // Update the x2 register itself
+
+// Loop back to N
+cbnz x17, matmul_loop_over_N

 4. Test and optimize the kernels. Report your performance in GFLOPS.
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
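
For orientation between the two hunks: neon_3_2 wraps the loop-over-K kernel above in an M loop (x16 counts 4 blocks of 16 rows; A and C are advanced by 16*4 bytes per step), and neon_3_3 wraps that in an N loop (x17 counts 8 panels of 6 columns; B and C are advanced by ldb*6 / ldc*6 per step). A rough C sketch of the resulting blocking, reusing the hypothetical matmul_16_6_64_ref from the note above (an illustration only, not the submitted assembly):

    #include <stddef.h>

    /* micro-kernel from the sketch after the commit header (hypothetical name) */
    void matmul_16_6_64_ref(const float *a, const float *b, float *c,
                            size_t lda, size_t ldb, size_t ldc);

    /* Sketch of neon_3_3 (matmul_64_48_64): 8 panels of 6 columns (N loop, x17)
     * times 4 blocks of 16 rows (M loop, x16), each block handled by the
     * 16x6x64 micro-kernel. neon_3_2 (matmul_64_6_64) is the inner M loop alone. */
    void matmul_64_48_64_ref(const float *a, const float *b, float *c,
                             size_t lda, size_t ldb, size_t ldc)
    {
        for (size_t n = 0; n < 8; ++n)        /* N loop: advance B and C by 6 columns */
            for (size_t m = 0; m < 4; ++m)    /* M loop: advance A and C by 16 rows   */
                matmul_16_6_64_ref(a + m * 16,
                                   b + n * 6 * ldb,
                                   c + m * 16 + n * 6 * ldc,
                                   lda, ldb, ldc);
    }
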
@@ -412,20 +455,20 @@ We run the benchmark with the following command:
 ----------------------------------------------------------------------------------------------------------------------------------
 Benchmark Time CPU Iterations FLOPS
 ----------------------------------------------------------------------------------------------------------------------------------
-GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_mean 396 ns 396 ns 10 31.0266G/s
-GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_median 396 ns 396 ns 10 31.0281G/s
-GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_stddev 0.069 ns 0.057 ns 10 4.50274M/s
-GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_cv 0.02 % 0.01 % 10 0.01%
-GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_mean 1728 ns 1728 ns 10 28.4438G/s
-GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_median 1728 ns 1728 ns 10 28.4445G/s
-GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_stddev 0.115 ns 0.106 ns 10 1.7484M/s
-GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_cv 0.01 % 0.01 % 10 0.01%
-GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_mean 13078 ns 13077 ns 10 22.5524G/s
-GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_median 13078 ns 13077 ns 10 22.552G/s
-GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_stddev 1.83 ns 1.60 ns 10 2.76464M/s
-GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_cv 0.01 % 0.01 % 10 0.01%
-
-
-- Mean FLOPS for loop over K: **31.0 GFLOPS**.
-- Mean FLOPS for loop over M: **28.4 GFLOPS**.
-- Mean FLOPS for loop over N: **22.6 GFLOPS**.
+GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_mean 97.8 ns 97.4 ns 10 126.12G/s
+GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_median 97.7 ns 97.3 ns 10 126.245G/s
+GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_stddev 0.581 ns 0.563 ns 10 720.109M/s
+GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_cv 0.59 % 0.58 % 10 0.57%
+GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_mean 386 ns 385 ns 10 127.812G/s
+GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_median 385 ns 384 ns 10 127.95G/s
+GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_stddev 2.16 ns 2.11 ns 10 693.069M/s
+GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_cv 0.56 % 0.55 % 10 0.54%
+GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_mean 3103 ns 3092 ns 10 95.3736G/s
+GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_median 3097 ns 3087 ns 10 95.5363G/s
+GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_stddev 16.0 ns 15.6 ns 10 475.851M/s
+GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_cv 0.52 % 0.50 % 10 0.50%
+
+
+- Mean FLOPS for loop over K: **126.1 GFLOPS**.
+- Mean FLOPS for loop over M: **127.8 GFLOPS**.
+- Mean FLOPS for loop over N: **95.4 GFLOPS**.
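
As a quick sanity check on the updated numbers: for the 16x6x64 and 64x6x64 fixtures the reported rates match a 2·M·N·K operation count divided by the measured CPU time, e.g. (in LaTeX notation)

    2 \cdot 16 \cdot 6 \cdot 64 = 12\,288\ \text{FLOPs}, \qquad \frac{12\,288\ \text{FLOPs}}{97.4\ \text{ns}} \approx 126.2\ \text{GFLOPS}
    2 \cdot 64 \cdot 6 \cdot 64 = 49\,152\ \text{FLOPs}, \qquad \frac{49\,152\ \text{FLOPs}}{385\ \text{ns}} \approx 127.7\ \text{GFLOPS}

which is in line with the 126.1 G/s and 127.8 G/s means in the table, and roughly a 4x improvement over the 31.0 / 28.4 / 22.6 GFLOPS reported before this commit.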
