chores: test case

Integer-Ctrl · Integer-Ctrl · commit f2e998df12a5 · 2025-06-04T16:42:45.000Z
diff --git a/src/main/TensorOperation.cpp b/src/main/TensorOperation.cpp
@@ -316,7 +316,8 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
   if (dim_sizes.size() != dim_types.size() || dim_sizes.empty() || dim_types.empty())
   {
     hasSetupError = true;
-    std::cerr << "Error: Dimension sizes and types must match and cannot be empty." << std::endl;
+    std::cerr << "Error: Dimension sizes and types must match and cannot be empty, but got dim_sizes: " << dim_sizes.size() << ", dim_types"
+              << dim_types.size() << std::endl;
     return error_t::err_wrong_dimension;
   }
 
@@ -327,7 +328,9 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
              (isUnary(prim_last_touch) || prim_last_touch == prim_t::none) && strides_in1.empty()))))
   {
     hasSetupError = true;
-    std::cerr << "Error: Strides must match the number of dimensions." << std::endl;
+    std::cerr << "Error: Strides must match the number of dimensions, but got dim_sizes: " << dim_sizes.size()
+              << ", strides_in0: " << strides_in0.size() << ", strides_in1: " << strides_in1.size()
+              << ", strides_out:" << strides_out.size() << std::endl;
     return error_t::err_wrong_dimension;  // Strides must match the number of dimensions
   }
 
@@ -355,27 +358,32 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
   if (dtype != dtype_t::fp32)
   {
     hasSetupError = true;
+    std::cerr << "Error: data type must be fp32, but got " << static_cast<uint32_t>(dtype) << std::endl;
     return error_t::err_wrong_dtype;
   }
 
   // Validate execution type order: shared -> seq -> prim
   if (!isSortedConfiguration(exec_types))
   {
     hasSetupError = true;
+    std::cerr << "Error: Expected the execution types to be sorted in the order: (shared*, sequential*, primitive*)" << std::endl;
     return error_t::err_invalid_execution_order;
   }
 
   if (!isValidPrimConfig(dim_types, exec_types, strides_in0, strides_out))
   {
     hasSetupError = true;
-    std::cerr << "1: Invalid primitive configuration detected" << std::endl;
+    std::cerr << "Error: Invalid primitive configuration detected. Expected one primitive for m and one primitive for n to exist"
+              << std::endl;
     return error_t::err_invalid_primitive_configuration;
   }
 
   if (!isValidKDim(dim_types, exec_types, strides_in1, prim_main))
   {
     hasSetupError = true;
-    std::cerr << "2: Invalid primitive configuration detected" << std::endl;
+    std::cerr << "Error: Invalid primitive configuration detected. Expected to find zero primitive k dimension for unary, one primitive k "
+                 "dimension for gemm, two primitive k dimension."
+              << std::endl;
     return error_t::err_invalid_primitive_configuration;
   }
 
@@ -384,7 +392,7 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
     if (!isValidStride(dim_types, strides_in0, stride_t::out) || !isValidStride(dim_types, strides_out, stride_t::out))
     {
       hasSetupError = true;
-      std::cerr << "3: Invalid stride configuration detected for unary" << std::endl;
+      std::cerr << "Error: Invalid stride configuration detected for unary. Expected k-dimension to have a stride of zero." << std::endl;
       return error_t::err_invalid_strides;
     }
   }
@@ -394,7 +402,9 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
         !isValidStride(dim_types, strides_out, stride_t::out))
     {
       hasSetupError = true;
-      std::cerr << "3: Invalid stride configuration detected for brgemm" << std::endl;
+      std::cerr << "Error: Invalid stride configuration detected for brgemm. Expected for in0 to have n-dimension stride of zero, in1 to "
+                   "have m-dimension stride of zero and out to have k-dimension stride of zero."
+                << std::endl;
       return error_t::err_invalid_strides;
     }
   }
@@ -426,12 +436,14 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
       if (error != Unary::error_t::success)
       {
         hasSetupError = true;
+        std::cerr << "Error: while generating the first touch unary: " << static_cast<uint32_t>(error) << std::endl;
         return error_t::err_invalid_first_touch_configuration;
       }
     }
     else
     {
       hasSetupError = true;
+      std::cerr << "Error: Invalid type for the first touch primitive, only support zero, copy, relu." << std::endl;
       return error_t::err_wrong_first_touch_primitive;
     }
   }
@@ -451,18 +463,32 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
         release_assert(indexPrimBatch != -1, "Expected a valid index for the Batch dimension but found none.");
         release_assert(indexPrimK != -1, "Expected a valid index for the Batch dimension but found none.");
 
-        std::get<Brgemm>(main_kernel)
-          .generate(dim_sizes[indexPrimM], dim_sizes[indexPrimN], dim_sizes[indexPrimK], dim_sizes[indexPrimBatch], 0, 0, 0,
-                    Brgemm::dtype_t::fp32);
+        Brgemm::error_t error = std::get<Brgemm>(main_kernel)
+                                  .generate(dim_sizes[indexPrimM], dim_sizes[indexPrimN], dim_sizes[indexPrimK], dim_sizes[indexPrimBatch],
+                                            0, 0, 0, Brgemm::dtype_t::fp32);
+        if (error != Brgemm::error_t::success)
+        {
+          hasSetupError = true;
+          std::cerr << "Error: while generating the main brgemm: " << static_cast<uint32_t>(error) << std::endl;
+          return error_t::err_invalid_main_configuration;
+        }
       }
       else if (prim_main == prim_t::gemm)
       {
         indexPrimK = findMatch(dim_types, exec_types, dim_t::k, exec_t::prim);
 
         release_assert(indexPrimK != -1, "Expected a valid index for the K dimension but found none.");
 
-        std::get<Brgemm>(main_kernel)
-          .generate(dim_sizes[indexPrimM], dim_sizes[indexPrimN], dim_sizes[indexPrimK], 1, 0, 0, 0, Brgemm::dtype_t::fp32);
+        Brgemm::error_t error =
+          std::get<Brgemm>(main_kernel)
+            .generate(dim_sizes[indexPrimM], dim_sizes[indexPrimN], dim_sizes[indexPrimK], 1, 0, 0, 0, Brgemm::dtype_t::fp32);
+
+        if (error != Brgemm::error_t::success)
+        {
+          hasSetupError = true;
+          std::cerr << "Error: while generating the main gemm: " << static_cast<uint32_t>(error) << std::endl;
+          return error_t::err_invalid_main_configuration;
+        }
       }
       else
       {
@@ -479,12 +505,14 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
       if (error != Unary::error_t::success)
       {
         hasSetupError = true;
+        std::cerr << "Error: while generating the main unary: " << static_cast<uint32_t>(error) << std::endl;
         return error_t::err_invalid_main_configuration;
       }
     }
     else
     {
       hasSetupError = true;
+      std::cerr << "Error: Invalid type for the main primitive, only support zero, copy, relu, gemm, brgemm." << std::endl;
       return error_t::err_wrong_main_primitive;
     }
   }
@@ -501,12 +529,14 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp
       if (error != Unary::error_t::success)
       {
         hasSetupError = true;
-        return error_t::err_invalid_main_configuration;
+        std::cerr << "Error: while generating the last touch unary: " << static_cast<uint32_t>(error) << std::endl;
+        return error_t::err_invalid_last_touch_configuration;
       }
     }
     else
     {
       hasSetupError = true;
+      std::cerr << "Error: Invalid type for the last touch primitive, only support zero, copy, relu." << std::endl;
       return error_t::err_wrong_last_touch_primitive;
     }
   }
diff --git a/src/test/TensorOperation.test.cpp b/src/test/TensorOperation.test.cpp
@@ -625,6 +625,95 @@ TEST_CASE("Test tensor operation with first touch: unary (zero, relu, copy) & ma
  * =================================================================================================
  */
 
+TEST_CASE("Test tensor operation with outer loop with main kernel: unary (zero, relu, copy)", "[tensor_operation][unary][correctness]")
+{
+  using namespace mini_jit;
+
+  auto type = GENERATE(TensorOperation::prim_t::zero, TensorOperation::prim_t::relu, TensorOperation::prim_t::copy);  // test body runs once per listed unary primitive
+
+  CAPTURE(type);
+
+  constexpr TensorOperation::dim_t dim_types[]{TensorOperation::dim_t::n, TensorOperation::dim_t::k, TensorOperation::dim_t::c,
+                                               TensorOperation::dim_t::m, TensorOperation::dim_t::k, TensorOperation::dim_t::m,
+                                               TensorOperation::dim_t::m, TensorOperation::dim_t::n};
+  constexpr TensorOperation::exec_t exec_types[]{TensorOperation::exec_t::seq,  TensorOperation::exec_t::seq, TensorOperation::exec_t::seq,
+                                                 TensorOperation::exec_t::seq,  TensorOperation::exec_t::seq, TensorOperation::exec_t::seq,
+                                                 TensorOperation::exec_t::prim, TensorOperation::exec_t::prim};
+  constexpr int64_t dim_sizes[]{2, 3, 5, 8, 13, 21, 16, 16};  // dims 0-5 are seq loops; dims 6 and 7 (16 x 16) are the prim m/n tile
+  constexpr int64_t strides_in0[]{16 * 16 * 1 * 13 * 8 * 1 * 3,
+                                  0,  // k-dim
+                                  16 * 16 * 1 * 13 * 8,
+                                  16 * 16 * 1 * 13,  // NOTE(review): 3328 is smaller than the extent of dim 5 (21 * 256 = 5376), so tiles overlap in memory - confirm intended
+                                  0,  // k-dim
+                                  16 * 16,
+                                  1,
+                                  16};
+  constexpr int64_t strides_in1[]{0, 0, 0, 0, 0, 0, 0, 0};  // all zero: no second input is used (execute below receives nullptr)
+  constexpr int64_t strides_out[]{16 * 16 * 1 * 13 * 8 * 1 * 3,
+                                  0,  // k-dim
+                                  16 * 16 * 1 * 13 * 8,
+                                  16 * 16 * 1 * 13,  // same values as strides_in0, so input and output share one layout
+                                  0,  // k-dim
+                                  16 * 16,
+                                  1,
+                                  16};
+
+  GenerationTest test(16, 16, 16, 1, 16 * 16 * 21 * 13 * 8 * 5 * 3 * 2, 0, 16 * 16 * 21 * 13 * 8 * 5 * 3 * 2);  // NOTE(review): the two large arguments are the product of all dim_sizes; presumably buffer sizes - confirm against GenerationTest
+  test.SetUp(TestInfill::Random);
+
+  mini_jit::TensorOperation tensor_op;
+  TensorOperation::error_t err = tensor_op.setup(
+    TensorOperation::dtype_t::fp32, TensorOperation::prim_t::none, type, TensorOperation::prim_t::none, std::span{dim_types},
+    std::span{exec_types}, std::span{dim_sizes}, std::span{strides_in0}, std::span{strides_in1}, std::span{strides_out});  // only the main primitive is set; first and last touch are prim_t::none
+
+  REQUIRE(err == TensorOperation::error_t::success);
+
+  tensor_op.execute(test.matrix_a.data(), nullptr, test.matrix_c.data());
+
+  UnaryType test_type = UnaryType::None;
+  switch (type)  // map the generated primitive onto the reference implementation's unary type
+  {
+  case TensorOperation::prim_t::zero:
+    test_type = UnaryType::Zero;
+    break;
+  case TensorOperation::prim_t::copy:
+    test_type = UnaryType::Identity;
+    break;
+  case TensorOperation::prim_t::relu:
+    test_type = UnaryType::ReLu;
+    break;
+  default:
+    FAIL("Could not parse the unary type!");
+    break;
+  }
+
+  for (size_t i0 = 0; i0 < dim_sizes[0]; i0++)  // reference result: one naive 16 x 16 unary per combination of the six seq indices
+  {
+    for (size_t i1 = 0; i1 < dim_sizes[1]; i1++)
+    {
+      for (size_t i2 = 0; i2 < dim_sizes[2]; i2++)
+      {
+        for (size_t i3 = 0; i3 < dim_sizes[3]; i3++)
+        {
+          for (size_t i4 = 0; i4 < dim_sizes[4]; i4++)
+          {
+            for (size_t i5 = 0; i5 < dim_sizes[5]; i5++)
+            {
+              uint64_t offset_a = i0 * strides_in0[0] + i1 * strides_in0[1] + i2 * strides_in0[2] + i3 * strides_in0[3] +
+                                  i4 * strides_in0[4] + i5 * strides_in0[5];  // dims 6 and 7 are handled inside the tile, so only dims 0-5 contribute
+              uint64_t offset_c = i0 * strides_out[0] + i1 * strides_out[1] + i2 * strides_out[2] + i3 * strides_out[3] +
+                                  i4 * strides_out[4] + i5 * strides_out[5];
+              test.naive_unary_M_N(test.matrix_a.data() + offset_a, test.matrix_c_verify.data() + offset_c, 16, 16, false, test_type);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  test.verify_matmul(test.matrix_c_verify.data(), test.matrix_c.data(), test.matrix_c.size());  // presumably compares the reference buffer with the kernel output - helper not visible here
+}
+
 TEST_CASE("Test tensor operation with outer loop with main kernel: gemm", "[tensor_operation][gemm][correctness]")
 {
   using namespace mini_jit;
@@ -1187,6 +1276,96 @@ TEST_CASE("Test tensor operation with outer loop with first touch: unary (zero,
  * #################################################################################################
  */
 
+TEST_CASE("Test parallel tensor operation with outer loop with main kernel: unary (zero, relu, copy)",
+          "[tensor_operation][unary][correctness]")
+{
+  using namespace mini_jit;
+
+  auto type = GENERATE(TensorOperation::prim_t::zero, TensorOperation::prim_t::relu, TensorOperation::prim_t::copy);  // test body runs once per listed unary primitive
+
+  CAPTURE(type);
+
+  constexpr TensorOperation::dim_t dim_types[]{TensorOperation::dim_t::n, TensorOperation::dim_t::m, TensorOperation::dim_t::c,
+                                               TensorOperation::dim_t::m, TensorOperation::dim_t::k, TensorOperation::dim_t::m,
+                                               TensorOperation::dim_t::m, TensorOperation::dim_t::n};
+  constexpr TensorOperation::exec_t exec_types[]{
+    TensorOperation::exec_t::shared, TensorOperation::exec_t::shared, TensorOperation::exec_t::shared, TensorOperation::exec_t::seq,
+    TensorOperation::exec_t::seq,    TensorOperation::exec_t::seq,    TensorOperation::exec_t::prim,   TensorOperation::exec_t::prim};  // dims 0-2 shared, dims 3-5 seq, dims 6-7 prim
+  constexpr int64_t dim_sizes[]{2, 3, 5, 8, 13, 21, 16, 16};  // dims 6 and 7 (16 x 16) are the prim m/n tile
+  constexpr int64_t strides_in0[]{16 * 16 * 1 * 13 * 8 * 1 * 3,
+                                  16 * 16 * 1 * 13 * 8 * 1,  // m-dim
+                                  16 * 16 * 1 * 13 * 8,  // NOTE(review): same value as the m-dim stride above, so (i1, i2) index pairs alias while both dims are shared - confirm intended
+                                  16 * 16 * 1 * 13,
+                                  0,  // k-dim
+                                  16 * 16,
+                                  1,
+                                  16};
+  constexpr int64_t strides_in1[]{0, 0, 0, 0, 0, 0, 0, 0};  // all zero: no second input is used (execute below receives nullptr)
+  constexpr int64_t strides_out[]{16 * 16 * 1 * 13 * 8 * 1 * 3,
+                                  16 * 16 * 1 * 13 * 8 * 1,  // m-dim
+                                  16 * 16 * 1 * 13 * 8,  // same values as strides_in0, so input and output share one layout
+                                  16 * 16 * 1 * 13,
+                                  0,  // k-dim
+                                  16 * 16,
+                                  1,
+                                  16};
+
+  GenerationTest test(16, 16, 16, 1, 16 * 16 * 21 * 13 * 8 * 5 * 3 * 2, 0, 16 * 16 * 21 * 13 * 8 * 5 * 3 * 2);  // NOTE(review): the two large arguments are the product of all dim_sizes; presumably buffer sizes - confirm against GenerationTest
+  test.SetUp(TestInfill::Random);
+
+  mini_jit::TensorOperation tensor_op;
+  TensorOperation::error_t err = tensor_op.setup(
+    TensorOperation::dtype_t::fp32, TensorOperation::prim_t::none, type, TensorOperation::prim_t::none, std::span{dim_types},
+    std::span{exec_types}, std::span{dim_sizes}, std::span{strides_in0}, std::span{strides_in1}, std::span{strides_out});  // only the main primitive is set; first and last touch are prim_t::none
+
+  REQUIRE(err == TensorOperation::error_t::success);
+
+  tensor_op.execute(test.matrix_a.data(), nullptr, test.matrix_c.data());
+
+  UnaryType test_type = UnaryType::None;
+  switch (type)  // map the generated primitive onto the reference implementation's unary type
+  {
+  case TensorOperation::prim_t::zero:
+    test_type = UnaryType::Zero;
+    break;
+  case TensorOperation::prim_t::copy:
+    test_type = UnaryType::Identity;
+    break;
+  case TensorOperation::prim_t::relu:
+    test_type = UnaryType::ReLu;
+    break;
+  default:
+    FAIL("Could not parse the unary type!");
+    break;
+  }
+
+  for (size_t i0 = 0; i0 < dim_sizes[0]; i0++)  // reference result: one naive 16 x 16 unary per combination of the six non-prim (shared + seq) indices
+  {
+    for (size_t i1 = 0; i1 < dim_sizes[1]; i1++)
+    {
+      for (size_t i2 = 0; i2 < dim_sizes[2]; i2++)
+      {
+        for (size_t i3 = 0; i3 < dim_sizes[3]; i3++)
+        {
+          for (size_t i4 = 0; i4 < dim_sizes[4]; i4++)
+          {
+            for (size_t i5 = 0; i5 < dim_sizes[5]; i5++)
+            {
+              uint64_t offset_a = i0 * strides_in0[0] + i1 * strides_in0[1] + i2 * strides_in0[2] + i3 * strides_in0[3] +
+                                  i4 * strides_in0[4] + i5 * strides_in0[5];  // dims 6 and 7 are handled inside the tile, so only dims 0-5 contribute
+              uint64_t offset_c = i0 * strides_out[0] + i1 * strides_out[1] + i2 * strides_out[2] + i3 * strides_out[3] +
+                                  i4 * strides_out[4] + i5 * strides_out[5];
+              test.naive_unary_M_N(test.matrix_a.data() + offset_a, test.matrix_c_verify.data() + offset_c, 16, 16, false, test_type);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  test.verify_matmul(test.matrix_c_verify.data(), test.matrix_c.data(), test.matrix_c.size());  // presumably compares the reference buffer with the kernel output - helper not visible here
+}
+
 TEST_CASE("Test parallel tensor operation with outer loop with main kernel: gemm", "[tensor_operation][gemm][correctness]")
 {
   using namespace mini_jit;
@@ -1606,9 +1785,9 @@ TEST_CASE(
     TensorOperation::dim_t::m, TensorOperation::dim_t::k, TensorOperation::dim_t::m, TensorOperation::dim_t::n, TensorOperation::dim_t::k};
 
   constexpr TensorOperation::exec_t exec_types[]{
-    TensorOperation::exec_t::seq,  TensorOperation::exec_t::seq, TensorOperation::exec_t::seq,  TensorOperation::exec_t::seq,
-    TensorOperation::exec_t::seq,  TensorOperation::exec_t::seq, TensorOperation::exec_t::prim, TensorOperation::exec_t::prim,
-    TensorOperation::exec_t::prim, TensorOperation::exec_t::prim};
+    TensorOperation::exec_t::shared, TensorOperation::exec_t::shared, TensorOperation::exec_t::shared, TensorOperation::exec_t::seq,
+    TensorOperation::exec_t::seq,    TensorOperation::exec_t::seq,    TensorOperation::exec_t::prim,   TensorOperation::exec_t::prim,
+    TensorOperation::exec_t::prim,   TensorOperation::exec_t::prim};
 
   constexpr int64_t dim_sizes[]{2, 3, 5, 8, 13, 21, 3, 16, 16, 16};
   constexpr int64_t strides_in0[]{0,                              // n-dim

Original file line number	Diff line number	Diff line change
`@@ -316,7 +316,8 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`316`	`316`	`if (dim_sizes.size() != dim_types.size() \|\| dim_sizes.empty() \|\| dim_types.empty())`
`317`	`317`	`{`
`318`	`318`	`hasSetupError = true;`
`319`		`- std::cerr << "Error: Dimension sizes and types must match and cannot be empty." << std::endl;`
	`319`	`+ std::cerr << "Error: Dimension sizes and types must match and cannot be empty, but got dim_sizes: " << dim_sizes.size() << ", dim_types"`
	`320`	`+ << dim_types.size() << std::endl;`
`320`	`321`	`return error_t::err_wrong_dimension;`
`321`	`322`	`}`
`322`	`323`
`@@ -327,7 +328,9 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`327`	`328`	`(isUnary(prim_last_touch) \|\| prim_last_touch == prim_t::none) && strides_in1.empty()))))`
`328`	`329`	`{`
`329`	`330`	`hasSetupError = true;`
`330`		`- std::cerr << "Error: Strides must match the number of dimensions." << std::endl;`
	`331`	`+ std::cerr << "Error: Strides must match the number of dimensions, but got dim_sizes: " << dim_sizes.size()`
	`332`	`+ << ", strides_in0: " << strides_in0.size() << ", strides_in1: " << strides_in1.size()`
	`333`	`+ << ", strides_out:" << strides_out.size() << std::endl;`
`331`	`334`	`return error_t::err_wrong_dimension; // Strides must match the number of dimensions`
`332`	`335`	`}`
`333`	`336`
`@@ -355,27 +358,32 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`355`	`358`	`if (dtype != dtype_t::fp32)`
`356`	`359`	`{`
`357`	`360`	`hasSetupError = true;`
	`361`	`+ std::cerr << "Error: data type must be fp32, but got " << static_cast<uint32_t>(dtype) << std::endl;`
`358`	`362`	`return error_t::err_wrong_dtype;`
`359`	`363`	`}`
`360`	`364`
`361`	`365`	`// Validate execution type order: shared -> seq -> prim`
`362`	`366`	`if (!isSortedConfiguration(exec_types))`
`363`	`367`	`{`
`364`	`368`	`hasSetupError = true;`
	`369`	`+ std::cerr << "Error: Expected the execution types to be sorted in the order: (shared*, sequential*, primitive*)" << std::endl;`
`365`	`370`	`return error_t::err_invalid_execution_order;`
`366`	`371`	`}`
`367`	`372`
`368`	`373`	`if (!isValidPrimConfig(dim_types, exec_types, strides_in0, strides_out))`
`369`	`374`	`{`
`370`	`375`	`hasSetupError = true;`
`371`		`- std::cerr << "1: Invalid primitive configuration detected" << std::endl;`
	`376`	`+ std::cerr << "Error: Invalid primitive configuration detected. Expected one primitive for m and one primitive for n to exist"`
	`377`	`+ << std::endl;`
`372`	`378`	`return error_t::err_invalid_primitive_configuration;`
`373`	`379`	`}`
`374`	`380`
`375`	`381`	`if (!isValidKDim(dim_types, exec_types, strides_in1, prim_main))`
`376`	`382`	`{`
`377`	`383`	`hasSetupError = true;`
`378`		`- std::cerr << "2: Invalid primitive configuration detected" << std::endl;`
	`384`	`+ std::cerr << "Error: Invalid primitive configuration detected. Expected to find zero primitive k dimension for unary, one primitive k "`
	`385`	`+ "dimension for gemm, two primitive k dimension."`
	`386`	`+ << std::endl;`
`379`	`387`	`return error_t::err_invalid_primitive_configuration;`
`380`	`388`	`}`
`381`	`389`
`@@ -384,7 +392,7 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`384`	`392`	`if (!isValidStride(dim_types, strides_in0, stride_t::out) \|\| !isValidStride(dim_types, strides_out, stride_t::out))`
`385`	`393`	`{`
`386`	`394`	`hasSetupError = true;`
`387`		`- std::cerr << "3: Invalid stride configuration detected for unary" << std::endl;`
	`395`	`+ std::cerr << "Error: Invalid stride configuration detected for unary. Expected k-dimension to have a stride of zero." << std::endl;`
`388`	`396`	`return error_t::err_invalid_strides;`
`389`	`397`	`}`
`390`	`398`	`}`
`@@ -394,7 +402,9 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`394`	`402`	`!isValidStride(dim_types, strides_out, stride_t::out))`
`395`	`403`	`{`
`396`	`404`	`hasSetupError = true;`
`397`		`- std::cerr << "3: Invalid stride configuration detected for brgemm" << std::endl;`
	`405`	`+ std::cerr << "Error: Invalid stride configuration detected for brgemm. Expected for in0 to have n-dimension stride of zero, in1 to "`
	`406`	`+ "have m-dimension stride of zero and out to have k-dimension stride of zero."`
	`407`	`+ << std::endl;`
`398`	`408`	`return error_t::err_invalid_strides;`
`399`	`409`	`}`
`400`	`410`	`}`
`@@ -426,12 +436,14 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`426`	`436`	`if (error != Unary::error_t::success)`
`427`	`437`	`{`
`428`	`438`	`hasSetupError = true;`
	`439`	`+ std::cerr << "Error: while generating the first touch unary: " << static_cast<uint32_t>(error) << std::endl;`
`429`	`440`	`return error_t::err_invalid_first_touch_configuration;`
`430`	`441`	`}`
`431`	`442`	`}`
`432`	`443`	`else`
`433`	`444`	`{`
`434`	`445`	`hasSetupError = true;`
	`446`	`+ std::cerr << "Error: Invalid type for the first touch primitive, only support zero, copy, relu." << std::endl;`
`435`	`447`	`return error_t::err_wrong_first_touch_primitive;`
`436`	`448`	`}`
`437`	`449`	`}`
`@@ -451,18 +463,32 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`451`	`463`	`release_assert(indexPrimBatch != -1, "Expected a valid index for the Batch dimension but found none.");`
`452`	`464`	`release_assert(indexPrimK != -1, "Expected a valid index for the Batch dimension but found none.");`
`453`	`465`
`454`		`- std::get<Brgemm>(main_kernel)`
`455`		`- .generate(dim_sizes[indexPrimM], dim_sizes[indexPrimN], dim_sizes[indexPrimK], dim_sizes[indexPrimBatch], 0, 0, 0,`
`456`		`- Brgemm::dtype_t::fp32);`
	`466`	`+ Brgemm::error_t error = std::get<Brgemm>(main_kernel)`
	`467`	`+ .generate(dim_sizes[indexPrimM], dim_sizes[indexPrimN], dim_sizes[indexPrimK], dim_sizes[indexPrimBatch],`
	`468`	`+ 0, 0, 0, Brgemm::dtype_t::fp32);`
	`469`	`+ if (error != Brgemm::error_t::success)`
	`470`	`+ {`
	`471`	`+ hasSetupError = true;`
	`472`	`+ std::cerr << "Error: while generating the main brgemm: " << static_cast<uint32_t>(error) << std::endl;`
	`473`	`+ return error_t::err_invalid_main_configuration;`
	`474`	`+ }`
`457`	`475`	`}`
`458`	`476`	`else if (prim_main == prim_t::gemm)`
`459`	`477`	`{`
`460`	`478`	`indexPrimK = findMatch(dim_types, exec_types, dim_t::k, exec_t::prim);`
`461`	`479`
`462`	`480`	`release_assert(indexPrimK != -1, "Expected a valid index for the K dimension but found none.");`
`463`	`481`
`464`		`- std::get<Brgemm>(main_kernel)`
`465`		`- .generate(dim_sizes[indexPrimM], dim_sizes[indexPrimN], dim_sizes[indexPrimK], 1, 0, 0, 0, Brgemm::dtype_t::fp32);`
	`482`	`+ Brgemm::error_t error =`
	`483`	`+ std::get<Brgemm>(main_kernel)`
	`484`	`+ .generate(dim_sizes[indexPrimM], dim_sizes[indexPrimN], dim_sizes[indexPrimK], 1, 0, 0, 0, Brgemm::dtype_t::fp32);`
	`485`	`+`
	`486`	`+ if (error != Brgemm::error_t::success)`
	`487`	`+ {`
	`488`	`+ hasSetupError = true;`
	`489`	`+ std::cerr << "Error: while generating the main gemm: " << static_cast<uint32_t>(error) << std::endl;`
	`490`	`+ return error_t::err_invalid_main_configuration;`
	`491`	`+ }`
`466`	`492`	`}`
`467`	`493`	`else`
`468`	`494`	`{`
`@@ -479,12 +505,14 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`479`	`505`	`if (error != Unary::error_t::success)`
`480`	`506`	`{`
`481`	`507`	`hasSetupError = true;`
	`508`	`+ std::cerr << "Error: while generating the main unary: " << static_cast<uint32_t>(error) << std::endl;`
`482`	`509`	`return error_t::err_invalid_main_configuration;`
`483`	`510`	`}`
`484`	`511`	`}`
`485`	`512`	`else`
`486`	`513`	`{`
`487`	`514`	`hasSetupError = true;`
	`515`	`+ std::cerr << "Error: Invalid type for the main primitive, only support zero, copy, relu, gemm, brgemm." << std::endl;`
`488`	`516`	`return error_t::err_wrong_main_primitive;`
`489`	`517`	`}`
`490`	`518`	`}`
`@@ -501,12 +529,14 @@ mini_jit::TensorOperation::error_t mini_jit::TensorOperation::setup(dtype_t dtyp`
`501`	`529`	`if (error != Unary::error_t::success)`
`502`	`530`	`{`
`503`	`531`	`hasSetupError = true;`
`504`		`- return error_t::err_invalid_main_configuration;`
	`532`	`+ std::cerr << "Error: while generating the last touch unary: " << static_cast<uint32_t>(error) << std::endl;`
	`533`	`+ return error_t::err_invalid_last_touch_configuration;`
`505`	`534`	`}`
`506`	`535`	`}`
`507`	`536`	`else`
`508`	`537`	`{`
`509`	`538`	`hasSetupError = true;`
	`539`	`+ std::cerr << "Error: Invalid type for the last touch primitive, only support zero, copy, relu." << std::endl;`
`510`	`540`	`return error_t::err_wrong_last_touch_primitive;`
`511`	`541`	`}`
`512`	`542`	`}`