llvm · rengolin · Nov 7, 2024 · Oct 10, 2024 · Oct 10, 2024 · Nov 7, 2024
@@ -50,37 +50,6 @@ def log(*args):
 }
 """
 
-matmul_boiler = """
-func.func @main() -> f32 attributes {llvm.emit_c_interface} {
-  %v0 = arith.constant 0.0 : f32
-  %v1 = arith.constant -1 : i8
-  %v2 = arith.constant 2.0 : f32
-
-  %A = memref.alloc() : memref<4x16xi8>
-  %B = memref.alloc() : memref<16x8xf32>
-  %C0 = memref.alloc() : memref<4x8xf32>
-  %C1 = memref.alloc() : memref<4x8xf32>
-  linalg.fill ins(%v1 : i8) outs(%A : memref<4x16xi8>)
-  linalg.fill ins(%v2 : f32) outs(%B : memref<16x8xf32>)
-  linalg.fill ins(%v0 : f32) outs(%C0 : memref<4x8xf32>)
-  linalg.fill ins(%v0 : f32) outs(%C1 : memref<4x8xf32>)
-
-  call @matmul_signed_on_buffers(%A, %B, %C0) :
-    (memref<4x16xi8>, memref<16x8xf32>, memref<4x8xf32>) -> ()
-  call @matmul_unsigned_on_buffers(%A, %B, %C1) :
-    (memref<4x16xi8>, memref<16x8xf32>, memref<4x8xf32>) -> ()
-
-  %c0 = arith.constant 0 : index
-  %res0 = memref.load %C0[%c0, %c0] : memref<4x8xf32>
-  %res1 = memref.load %C1[%c0, %c0] : memref<4x8xf32>
-
-  %0 = arith.addf %res0, %res1 : f32
-
-  // TODO: FFI-based solution to allow testing and printing with python code.
-  return %0 : f32
-}
-"""
-
 fill_boiler = """
 func.func @main() -> i32 attributes {llvm.emit_c_interface} {
   %O0 = memref.alloc() : memref<i32>
@@ -296,90 +265,6 @@ def elemwise_log_mul_on_buffers(lhs, rhs, out):
 test_elemwise_generic()
 
 
-def test_matmul_builtin():
-    with Context() as ctx, Location.unknown():
-        module = Module.create()
-        f32 = F32Type.get()
-        i8 = IntegerType.get_signless(8)
-        with InsertionPoint(module.body):
-
-            @func.FuncOp.from_py_func(
-                MemRefType.get((4, 16), i8),
-                MemRefType.get((16, 8), f32),
-                MemRefType.get((4, 8), f32),
-            )
-            def matmul_signed_on_buffers(lhs, rhs, out):
-                linalg.matmul(lhs, rhs, outs=[out])
-
-            @func.FuncOp.from_py_func(
-                MemRefType.get((4, 16), i8),
-                MemRefType.get((16, 8), f32),
-                MemRefType.get((4, 8), f32),
-            )
-            def matmul_unsigned_on_buffers(lhs, rhs, out):
-                linalg.matmul(lhs, rhs, outs=[out], cast=TypeFn.cast_unsigned)
-
-        execution_engine = ExecutionEngine(transform(module, matmul_boiler))
-
-        # TODO: FFI-based solution to allow testing and printing with python code.
-        # Prepare arguments: one result f32.
-        # Arguments must be passed as pointers.
-        c_float_p = ctypes.c_float * 1
-        res = c_float_p(-1.0)
-        execution_engine.invoke("main", res)
-
-        log("RESULT: ", res[0])
-        # matmul_signed_on_buffers: -1 * 2.0 * 16 = -32
-        # matmul_unsigned_on_buffers: (2^8-1) * 2.0 * 16 = 8160
-        # CHECK: RESULT: 8128
-
-
-test_matmul_builtin()
-
-
-def test_matmul_generic():
-    with Context() as ctx, Location.unknown():
-        module = Module.create()
-        f32 = F32Type.get()
-        i8 = IntegerType.get_signless(8)
-        with InsertionPoint(module.body):
-
-            @func.FuncOp.from_py_func(
-                MemRefType.get((4, 16), i8),
-                MemRefType.get((16, 8), f32),
-                MemRefType.get((4, 8), f32),
-            )
-            def matmul_signed_on_buffers(lhs, rhs, out):
-                linalg.matmul(lhs, rhs, outs=[out], emit_generic=True)
-
-            @func.FuncOp.from_py_func(
-                MemRefType.get((4, 16), i8),
-                MemRefType.get((16, 8), f32),
-                MemRefType.get((4, 8), f32),
-            )
-            def matmul_unsigned_on_buffers(lhs, rhs, out):
-                linalg.matmul(
-                    lhs, rhs, outs=[out], cast=TypeFn.cast_unsigned, emit_generic=True
-                )
-
-        execution_engine = ExecutionEngine(transform(module, matmul_boiler))
-
-        # TODO: FFI-based solution to allow testing and printing with python code.
-        # Prepare arguments: one result f32.
-        # Arguments must be passed as pointers.
-        c_float_p = ctypes.c_float * 1
-        res = c_float_p(-1.0)
-        execution_engine.invoke("main", res)
-
-        log("RESULT: ", res[0])
-        # matmul_signed_on_buffers = -1 * 2.0 * 16 = -32
-        # matmul_unsigned_on_buffers = (2^8-1) * 2.0 * 16 = 8160
-        # CHECK: RESULT: 8128
-
-
-test_matmul_generic()
-
-
 def test_fill_builtin():
     with Context() as ctx, Location.unknown():
         module = Module.create()

@@ -99,26 +99,28 @@ def basic(target: any_op_t()):
 # CHECK-LABEL: TEST: test_apply_patterns
 @construct_and_print_in_module
 def test_apply_patterns(module_):
-    M, N, K = 3, 5, 3
+    b, M, N, K = 1, 3, 5, 3
 
-    # CHECK-LABEL:   func.func @matmul(
-    # CHECK-SAME:                      %[[VAL_0:.*]]: tensor<3x5xf32>, %[[VAL_1:.*]]: tensor<5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
+    # CHECK-LABEL:   func.func @batch_reduce_matmul(
+    # CHECK-SAME:                      %[[VAL_0:.*]]: tensor<1x3x5xf32>,
+    # CHECK-SAME:                      %[[VAL_1:.*]]: tensor<1x5x3xf32>,
+    # CHECK-SAME:                      %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
     # CHECK:           %[[VAL_3:.*]] = arith.constant 1 : i32
     # CHECK:           %[[VAL_4:.*]] = arith.addi %[[VAL_3]], %[[VAL_3]] : i32
-    # CHECK:           %[[VAL_5:.*]] = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%[[VAL_0]], %[[VAL_1]] : tensor<3x5xf32>, tensor<5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
+    # CHECK:           %[[VAL_5:.*]] = linalg.batch_reduce_matmul ins(%[[VAL_0]], %[[VAL_1]] : tensor<1x3x5xf32>, tensor<1x5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
     # CHECK:           return %[[VAL_5]] : tensor<3x3xf32>
     # CHECK:         }
     @func.func(
-        T.tensor(M, N, T.f32()), T.tensor(N, K, T.f32()), T.tensor(M, K, T.f32())
+        T.tensor(b, M, N, T.f32()), T.tensor(b, N, K, T.f32()), T.tensor(M, K, T.f32())
     )
-    def matmul(A, B, C):
+    def batch_reduce_matmul(A, B, C):
         i = arith.constant(T.i32(), 1)
         v = arith.addi(i, i)
-        return linalg.matmul(A, B, outs=[C])
+        return linalg.batch_reduce_matmul(A, B, outs=[C])
 
     # CHECK-LABEL:   module attributes {transform.with_named_sequence} {
     # CHECK:           transform.named_sequence @__transform_main(%[[VAL_0:.*]]: !transform.any_op) {
-    # CHECK:             %[[VAL_1:.*]] = transform.structured.match ops{["linalg.matmul"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
+    # CHECK:             %[[VAL_1:.*]] = transform.structured.match ops{["linalg.batch_reduce_matmul"]} in %[[VAL_0]] : (!transform.any_op) -> !transform.any_op
     # CHECK:             %[[VAL_2:.*]] = transform.get_parent_op %[[VAL_1]] {op_name = "func.func"} : (!transform.any_op) -> !pdl.operation
     # CHECK:             transform.apply_patterns to %[[VAL_2]] {
     # CHECK:               transform.apply_patterns.canonicalization
@@ -132,7 +134,7 @@ def matmul(A, B, C):
     def mod():
         @named_sequence("__transform_main", [any_op_t()], [])
         def basic(variant_op: any_op_t()):
-            matmul = structured_match(any_op_t(), variant_op, ops=["linalg.matmul"])
+            matmul = structured_match(any_op_t(), variant_op, ops=["linalg.batch_reduce_matmul"])
             top_func = get_parent_op(pdl.op_t(), matmul, op_name="func.func")
 
             @apply_patterns(top_func)
@@ -147,9 +149,9 @@ def pats():
     pm = PassManager.parse("builtin.module(transform-interpreter)")
     pm.run(module_.operation)
 
-    # CHECK-LABEL:   func.func @matmul(
-    # CHECK-SAME:                      %[[VAL_0:.*]]: tensor<3x5xf32>, %[[VAL_1:.*]]: tensor<5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
-    # CHECK:           %[[VAL_3:.*]] = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%[[VAL_0]], %[[VAL_1]] : tensor<3x5xf32>, tensor<5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
+    # CHECK-LABEL:   func.func @batch_reduce_matmul(
+    # CHECK-SAME:                      %[[VAL_0:.*]]: tensor<1x3x5xf32>, %[[VAL_1:.*]]: tensor<1x5x3xf32>, %[[VAL_2:.*]]: tensor<3x3xf32>) -> tensor<3x3xf32> {
+    # CHECK:           %[[VAL_3:.*]] = linalg.batch_reduce_matmul ins(%[[VAL_0]], %[[VAL_1]] : tensor<1x3x5xf32>, tensor<1x5x3xf32>) outs(%[[VAL_2]] : tensor<3x3xf32>) -> tensor<3x3xf32>
     # CHECK:           return %[[VAL_3]] : tensor<3x3xf32>
     # CHECK:         }
     print(module_)