Integer-Ctrl
diff --git a/‎CMakeLists.txt‎
Lines changed: 7 additions & 3 deletions b/‎CMakeLists.txt‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎src/main/Brgemm.cpp‎
Lines changed: 18 additions & 3 deletions b/‎src/main/Brgemm.cpp‎
Lines changed: 18 additions & 3 deletions
diff --git a/‎…main/kernels/matmul_16mRest_4nRest_k.cpp‎ ‎…in/kernels/matmul_16mRest_lt4nRest_k.cpp‎src/main/kernels/matmul_16mRest_4nRest_k.cpp renamed to src/main/kernels/matmul_16mRest_lt4nRest_k.cpp
Lines changed: 10 additions & 6 deletions b/‎…main/kernels/matmul_16mRest_4nRest_k.cpp‎ ‎…in/kernels/matmul_16mRest_lt4nRest_k.cpp‎src/main/kernels/matmul_16mRest_4nRest_k.cpp renamed to src/main/kernels/matmul_16mRest_lt4nRest_k.cpp
Lines changed: 10 additions & 6 deletions
diff --git a/‎…c/main/kernels/matmul_16mRest_4nRest_k.h‎ ‎…main/kernels/matmul_16mRest_lt4nRest_k.h‎src/main/kernels/matmul_16mRest_4nRest_k.h renamed to src/main/kernels/matmul_16mRest_lt4nRest_k.h
Lines changed: 5 additions & 5 deletions b/‎…c/main/kernels/matmul_16mRest_4nRest_k.h‎ ‎…main/kernels/matmul_16mRest_lt4nRest_k.h‎src/main/kernels/matmul_16mRest_4nRest_k.h renamed to src/main/kernels/matmul_16mRest_lt4nRest_k.h
Lines changed: 5 additions & 5 deletions
diff --git a/‎src/main/kernels/matmul_lt16_4n_k.cpp‎
Lines changed: 76 additions & 65 deletions b/‎src/main/kernels/matmul_lt16_4n_k.cpp‎
Lines changed: 76 additions & 65 deletions
diff --git a/‎src/main/kernels/matmul_lt16_4n_k.h‎
Lines changed: 4 additions & 1 deletion b/‎src/main/kernels/matmul_lt16_4n_k.h‎
Lines changed: 4 additions & 1 deletion
@@ -82,10 +82,12 @@ set(KERNEL_FILES
     matmul_16m_lt4nRest_k.cpp
     matmul_16mRest_4n_k.h
     matmul_16mRest_4n_k.cpp
-    matmul_16mRest_4nRest_k.h
-    matmul_16mRest_4nRest_k.cpp
+    matmul_16mRest_lt4nRest_k.h
+    matmul_16mRest_lt4nRest_k.cpp
     matmul_lt16_4n_k.h
     matmul_lt16_4n_k.cpp
+    matmul_lt16_lt4nRest_k.h
+    matmul_lt16_lt4nRest_k.cpp
 )
 
 set(ARM_INSTRUCTION_FILES
@@ -127,9 +129,11 @@ set(TEST_KERNELS
     matmul_16_6_1.test.cpp
     matmul_16_6_k.test.cpp
     matmul_16m_4n_k.test.cpp
+    matmul_16mRest_4n_k.test.cpp
     matmul_16m_lt4nRest_k.test.cpp
-    matmul_16mRest_4nRest_k.test.cpp
+    matmul_16mRest_lt4nRest_k.test.cpp
     matmul_lt16_4n_k.test.cpp
+    matmul_lt16_lt4nRest_k.test.cpp
 )
 
 set(TEST_ARM_INSTRUCTION_FILES
 
@@ -11,7 +11,7 @@ mini_jit::Brgemm::error_t mini_jit::Brgemm::generate(uint32_t m, uint32_t n, uin
   {
     return error_t::err_wrong_dtype;
   }
-  if (m < 16 || n < 4 || m == 0 || n == 0 || k == 0)
+  if (m == 0 || n == 0 || k == 0)
   {
     return error_t::err_wrong_dimension;
   }
@@ -83,10 +83,25 @@ void mini_jit::Brgemm::fill_with_matmuls_no_batch_dim_column_major_fp32(uint32_t
     return;
   }
 
-  if (m >= 16 && n >= 4)
+  if (m < 16 && n >= 4 && n % 4 == 0)
+  {
+    kernels::matmul_lt16_4n_k(native_kernel, n / 4, k, m % 16);
+    return;
+  }
+
+  if (m >= 16)
+  {
+    // At this point m % 16 != 0 and n % 4 != 0
+    kernels::matmul_16mRest_lt4nRest_k(native_kernel, m / 16, n / 4, k, m % 16, n % 4);
+    return;
+  }
+
+  if (m < 16)
   {
     // At this point m % 16 != 0 and n % 4 != 0
-    kernels::matmul_16mRest_4nRest_k(native_kernel, m / 16, n / 4, k, m % 16, n % 4);
+    kernels::matmul_lt16_lt4nRest_k(native_kernel, n / 4, k, m % 16, n % 4);
     return;
   }
+
+  throw std::logic_error(std::format("Unhandled combination found for MxNxK matmul: m='{}', n='{}', k='{}'", m, n, k));
 }
@@ -1,24 +1,25 @@
-#include "matmul_16mRest_4nRest_k.h"
+#include "matmul_16mRest_lt4nRest_k.h"
 #include "../Kernel.h"
 #include "../arm_instructions/arm_all.h"
 #include "../release_assert.h"
 #include "matmul_16mRest_4n_k.h"
 #include "matmul_16m_lt4nRest_k.h"
 
-void mini_jit::kernels::matmul_16mRest_4nRest_k(mini_jit::Kernel &kernel, const uint32_t m_loop_16, const uint32_t n_loop_4,
-                                                const uint32_t k_loop, const uint32_t m_loop_rest, const uint32_t n_loop_rest)
+void mini_jit::kernels::matmul_16mRest_lt4nRest_k(mini_jit::Kernel &kernel, const uint32_t m_loop_16, const uint32_t n_loop_4,
+                                                  const uint32_t k_loop, const uint32_t m_loop_rest, const uint32_t n_loop_rest)
 {
   using namespace mini_jit::arm_instructions;
 
   release_assert(m_loop_16 != 0, "Cannot proccess matrix with m loop of 0.");
-  release_assert(n_loop_4 != 0, "Cannot proccess matrix with n loop of 0.");
   release_assert(k_loop != 0, "Cannot proccess matrix with k loop of 0.");
   release_assert(m_loop_rest != 0, "Cannot create a matrix with a rest of m equal to 0!");
   release_assert(m_loop_rest <= 15, "Cannot create a matrix with a rest of m larger than 15!");
   release_assert(n_loop_rest != 0, "Cannot create a matrix with a rest of n equal to 0!");
   release_assert(n_loop_rest <= 3, "Cannot create a matrix with a rest of n larger than 3!");
 
   // Idea: Division of the matrix into sub-matrices and calculated in the following order.
+  // 1. matmul_16mRest_4n_k is omitted if n is less than 4 (i.e. n_loop_4 == 0);
+  //
   //                       N dimension
   // ←---------------------------------------------------→
   // ===================================================== ↑
@@ -79,7 +80,10 @@ void mini_jit::kernels::matmul_16mRest_4nRest_k(mini_jit::Kernel &kernel, const
   // ========================================================================================
   // Calculate m + rest but n is multiple of 4
   // ========================================================================================
-  matmul_16mRest_4n_k(kernel, m_loop_16, n_loop_4, k_loop, m_loop_rest, false);
+  if (n_loop_4 != 0)
+  {
+    matmul_16mRest_4n_k(kernel, m_loop_16, n_loop_4, k_loop, m_loop_rest, false);
+  }
 
   // Offset to the next matrix block
   // Here we want to start with the initial m value but n should be offset by the already calculated amount.
@@ -91,7 +95,7 @@ void mini_jit::kernels::matmul_16mRest_4nRest_k(mini_jit::Kernel &kernel, const
   matmul_16m_lt4nRest_k(kernel, m_loop_16, 0, k_loop, n_loop_rest, false);
 
   // Now we want to make sure to not restore the position of the m as it is in the right position.
-  // Therefore we should restore the register above the m_loop
+  // Therefore we should restore the register below the m_loop
 
   // ========================================================================================
   // Rest Calculation of m and n loop
 
@@ -1,5 +1,5 @@
-#ifndef MINI_JIT_KERNELS_MATMUL_16MRest_4NRest_K_H
-#define MINI_JIT_KERNELS_MATMUL_16MRest_4NRest_K_H
+#ifndef MINI_JIT_KERNELS_MATMUL_16MRest_LT4NRest_K_H
+#define MINI_JIT_KERNELS_MATMUL_16MRest_LT4NRest_K_H
 
 #include "../Kernel.h"
 #include <cstdint>
@@ -19,9 +19,9 @@ namespace mini_jit
      * @param m_loop_rest The rest/remainder of the m loop that is not dividable by 16.
      * @param n_loop_rest The rest/remainder of the n loop that is not dividable by 4.
      */
-    void matmul_16mRest_4nRest_k(mini_jit::Kernel &kernel, const uint32_t m_loop_16, const uint32_t n_loop_4, const uint32_t k_loop,
-                                 const uint32_t m_loop_rest, const uint32_t n_loop_rest);
+    void matmul_16mRest_lt4nRest_k(mini_jit::Kernel &kernel, const uint32_t m_loop_16, const uint32_t n_loop_4, const uint32_t k_loop,
+                                   const uint32_t m_loop_rest, const uint32_t n_loop_rest);
 
   }  // namespace kernels
 }  // namespace mini_jit
-#endif  // MINI_JIT_KERNELS_MATMUL_16MRest_4NRest_K_H
+#endif  // MINI_JIT_KERNELS_MATMUL_16MRest_LT4NRest_K_H
@@ -4,68 +4,72 @@
 #include "../release_assert.h"
 
 void mini_jit::kernels::matmul_lt16_4n_k(mini_jit::Kernel &kernel, const uint32_t n_loop_4, const uint32_t k_loop,
-                                         const uint32_t m_loop_rest)
+                                         const uint32_t m_loop_rest, const bool use_init_and_end)
 {
   using namespace mini_jit::arm_instructions;
 
   release_assert(n_loop_4 != 0, "Cannot proccess matrix with k loop of 0.");
   release_assert(k_loop != 0, "Cannot proccess matrix with k loop of 0.");
   release_assert(m_loop_rest != 0, "Cannot create a matrix with a rest of m equal to 0!");
   release_assert(m_loop_rest <= 15, "Cannot create a matrix with a rest of m larger than 15!");
-  // Hold the number of instruction to jump for each loop
-  int32_t jump_N_loop = 23;  // start value = amount of instructions outside of control flow
 
-  kernel.add({
-    // /**
-    //     * @param x0 = a pointer to column-major 64x64 matrix A.
-    //     * @param x1 = b pointer to column-major 64x64 matrix B.
-    //     * @param x2 = c pointer to column-major 64x64 matrix C.
-    //     * @param x3 = lda leading dimension of A.
-    //     * @param x4 = ldb leading dimension of B.
-    //     * @param x5 = ldc leading dimension of C.
-    // **/
-    // .type matmul_64_48_64, %function
-    // .global matmul_64_48_64
-    // matmul_64_48_64:
-
-    //     // Procedural Call Standard
-    //     // save frame pointer and link register
-    //     // stp fp, lr, [sp, #-16]!
-    //     // update frame pointer to current stack pointer
-    //     // mov fp, sp
-
-    //     // save callee-saved registers
-    //     // stp x19, x20, [sp, #-16]!
-    //     // stp x21, x22, [sp, #-16]!
-    //     // stp x23, x24, [sp, #-16]!
-    //     // stp x25, x26, [sp, #-16]!
-    //     // stp x27, x28, [sp, #-16]!
-
-    stpPre(d8, d9, sp, -16),  //     stp  d8,  d9, [sp, #-16]!
-    //     // stp d10, d11, [sp, #-16]!
-    //     // stp d12, d13, [sp, #-16]!
-    //     // stp d14, d15, [sp, #-16]!
-
-    //     // Offset the used leading dimension by the size of floats
-    lsl(x3, x3, 2),  //     lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
-    lsl(x4, x4, 2),  //     lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
-    lsl(x5, x5, 2),  //     lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)
-
-    mov(x6, x1),  //     mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
-    mov(x7, x2),  //     mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
-
-    mov(x8, x0),  //     mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
-    mov(x9, x1),  //     mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
-
-    mov(x10, x0),  //     mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
-    mov(x11, x2),  //     mov x11, x2 // Store the initial value of x2, to bes restored in the N loop iteration
-    mov(x12, 4),   //     mov x12, #4 // hold the size of N that are processed in one loop, needed for offset calculation
-  });
+  if (use_init_and_end)
+  {
+    kernel.add({
+      // /**
+      //     * @param x0 = a pointer to column-major 64x64 matrix A.
+      //     * @param x1 = b pointer to column-major 64x64 matrix B.
+      //     * @param x2 = c pointer to column-major 64x64 matrix C.
+      //     * @param x3 = lda leading dimension of A.
+      //     * @param x4 = ldb leading dimension of B.
+      //     * @param x5 = ldc leading dimension of C.
+      // **/
+      // .type matmul_64_48_64, %function
+      // .global matmul_64_48_64
+      // matmul_64_48_64:
+
+      //     // Procedural Call Standard
+      //     // save frame pointer and link register
+      //     // stp fp, lr, [sp, #-16]!
+      //     // update frame pointer to current stack pointer
+      //     // mov fp, sp
+
+      //     // save callee-saved registers
+      //     // stp x19, x20, [sp, #-16]!
+      //     // stp x21, x22, [sp, #-16]!
+      //     // stp x23, x24, [sp, #-16]!
+      //     // stp x25, x26, [sp, #-16]!
+      //     // stp x27, x28, [sp, #-16]!
+
+      stpPre(d8, d9, sp, -16),  //     stp  d8,  d9, [sp, #-16]!
+      //     // stp d10, d11, [sp, #-16]!
+      //     // stp d12, d13, [sp, #-16]!
+      //     // stp d14, d15, [sp, #-16]!
+
+      //     // Offset the used leading dimension by the size of floats
+      lsl(x3, x3, 2),  //     lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
+      lsl(x4, x4, 2),  //     lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
+      lsl(x5, x5, 2),  //     lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)
+
+      mov(x6, x1),  //     mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
+      mov(x7, x2),  //     mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
+
+      mov(x8, x0),  //     mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
+      mov(x9, x1),  //     mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
+
+      mov(x10, x0),  //     mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
+      mov(x11, x2),  //     mov x11, x2 // Store the initial value of x2, to be restored in the N loop iteration
+      mov(x12, 4),   //     mov x12, #4 // hold the size of N that are processed in one loop, needed for offset calculation
+    });
+  }
 
   // ========================================================================================
   // Rest Calculation of m loop
   // ========================================================================================
 
+  // Hold the number of instruction to jump for each loop
+  int32_t jump_N_loop = 23;  // start value = amount of instructions outside of control flow
+
   kernel.add({
     mov(x17, n_loop_4),  //     mov x17, #12 // x17 iterator for N loop
     // matmul_loop_over_N:
@@ -733,26 +737,33 @@ void mini_jit::kernels::matmul_lt16_4n_k(mini_jit::Kernel &kernel, const uint32_
 
     //     // Loop back to N
     cbnz(x17, -jump_N_loop * 4),  //     cbnz x17, matmul_loop_over_N
+  });
 
-    //     // Procedural Call Standard
-    //     // restore callee-saved registers
-    //     // ldp d14, d15, [sp], #16
-    //     // ldp d12, d13, [sp], #16
-    //     // ldp d10, d11, [sp], #16
-    ldpPost(d8, d9, sp, 16),  //     ldp  d8,  d9, [sp], #16
+  if (use_init_and_end)
+  {
 
-    //     // ldp x27, x28, [sp], #16
-    //     // ldp x25, x26, [sp], #16
-    //     // ldp x23, x24, [sp], #16
-    //     // ldp x21, x22, [sp], #16
-    //     // ldp x19, x20, [sp], #16
+    kernel.add({
 
-    //     // restore frame pointer and link register
-    //     // ldp fp, lr, [sp], #16
+      //     // Procedural Call Standard
+      //     // restore callee-saved registers
+      //     // ldp d14, d15, [sp], #16
+      //     // ldp d12, d13, [sp], #16
+      //     // ldp d10, d11, [sp], #16
+      ldpPost(d8, d9, sp, 16),  //     ldp  d8,  d9, [sp], #16
 
-    ret()  //     ret
-    //     .size matmul_64_48_64, (. - matmul_64_48_64)
-  });
+      //     // ldp x27, x28, [sp], #16
+      //     // ldp x25, x26, [sp], #16
+      //     // ldp x23, x24, [sp], #16
+      //     // ldp x21, x22, [sp], #16
+      //     // ldp x19, x20, [sp], #16
+
+      //     // restore frame pointer and link register
+      //     // ldp fp, lr, [sp], #16
+
+      ret()  //     ret
+      //     .size matmul_64_48_64, (. - matmul_64_48_64)
+    });
+  }
 
 #ifdef SAVE_JITS_TO_FILE
   kernel.write("matmul_lt16_4n_k.bin");
 
@@ -16,8 +16,11 @@ namespace mini_jit
      * @param n_loop_4 The repetitions of the n block of size 4.
      * @param k_loop The loops in the k dimensions.
      * @param m_loop_rest The rest/remainder of the m loop that is not dividable by 16.
+     * @param use_init_and_end Indicates if the procedural call standard, initializing setup and the ret instruction are used. Defaults to
+     * true.
      */
-    void matmul_lt16_4n_k(mini_jit::Kernel &kernel, const uint32_t n_loop_4, const uint32_t k_loop, const uint32_t m_loop_rest);
+    void matmul_lt16_4n_k(mini_jit::Kernel &kernel, const uint32_t n_loop_4, const uint32_t k_loop, const uint32_t m_loop_rest,
+                          const bool use_init_and_end = true);
 
   }  // namespace kernels
 }  // namespace mini_jit
Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@ mini_jit::Brgemm::error_t mini_jit::Brgemm::generate(uint32_t m, uint32_t n, uin`
`11`	`11`	`{`
`12`	`12`	`return error_t::err_wrong_dtype;`
`13`	`13`	`}`
`14`		`- if (m < 16 \|\| n < 4 \|\| m == 0 \|\| n == 0 \|\| k == 0)`
	`14`	`+ if (m == 0 \|\| n == 0 \|\| k == 0)`
`15`	`15`	`{`
`16`	`16`	`return error_t::err_wrong_dimension;`
`17`	`17`	`}`
`@@ -83,10 +83,25 @@ void mini_jit::Brgemm::fill_with_matmuls_no_batch_dim_column_major_fp32(uint32_t`
`83`	`83`	`return;`
`84`	`84`	`}`
`85`	`85`
`86`		`- if (m >= 16 && n >= 4)`
	`86`	`+ if (m < 16 && n >= 4 && n % 4 == 0)`
	`87`	`+ {`
	`88`	`+ kernels::matmul_lt16_4n_k(native_kernel, n / 4, k, m % 16);`
	`89`	`+ return;`
	`90`	`+ }`
	`91`	`+`
	`92`	`+ if (m >= 16)`
	`93`	`+ {`
	`94`	`+ // At this point m % 16 != 0 and n % 4 != 0`
	`95`	`+ kernels::matmul_16mRest_lt4nRest_k(native_kernel, m / 16, n / 4, k, m % 16, n % 4);`
	`96`	`+ return;`
	`97`	`+ }`
	`98`	`+`
	`99`	`+ if (m < 16)`
`87`	`100`	`{`
`88`	`101`	`// At this point m % 16 != 0 and n % 4 != 0`
`89`		`- kernels::matmul_16mRest_4nRest_k(native_kernel, m / 16, n / 4, k, m % 16, n % 4);`
	`102`	`+ kernels::matmul_lt16_lt4nRest_k(native_kernel, n / 4, k, m % 16, n % 4);`
`90`	`103`	`return;`
`91`	`104`	`}`
	`105`	`+`
	`106`	`+ throw std::logic_error(std::format("Unhandled combination found for MxNxK matmul: m='{}', n='{}', k='{}'", m, n, k));`
`92`	`107`	`}`