[μKernels]: AMX - optimal register allocation (#1076)

arun-thmn · web-flow · commit 075fdaa61eb1 · 2025-07-23T17:42:03.000+05:30
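Relaxes the `AMX` lowering pass (`VectorContractToAMX`) to support optimal register allocation: the output tile no longer has to be square (M == N); M and N only need to be divisible by 16. The inner tile-multiply loop now iterates over the B tiles instead of the A tiles, and tests are added for 48x16 (3x1) and 16x48 (1x3) register blocking.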
diff --git a/lib/TPP/Transforms/VectorContractToAMX.cpp b/lib/TPP/Transforms/VectorContractToAMX.cpp
@@ -388,7 +388,7 @@ static SmallVector<Value> createTileMuls(OpBuilder &builder, Location loc,
   SmallVector<Value> results;
   int numIterArgs = 0;
   for (unsigned i = 0; i < aLoadTiles.size(); i++) {
-    for (unsigned j = 0; j < aLoadTiles.size(); j++) {
+    for (unsigned j = 0; j < bLoadTiles.size(); j++) {
       auto amx =
           resType.getElementType().isFloat()
               ? builder.create<amx::TileMulFOp>(loc, resType, aLoadTiles[i],
@@ -515,10 +515,10 @@ struct VectorContractToAMXPattern
     auto accType = cast<ShapedType>(accDefiningOp.getType());
     int64_t M = accType.getDimSize(0);
     int64_t N = accType.getDimSize(1);
-    // M and N must be equal and divisible by 16.
-    if (M != N || M % 16 != 0 || N % 16 != 0)
+    // M and N must be divisible by 16.
+    if (M % 16 != 0 || N % 16 != 0)
       return rewriter.notifyMatchFailure(
-          op, "Output matrix dimensions must be equal and divisible by 16");
+          op, "Output matrix dimensions must be divisible by 16");
 
     auto accSubview = accDefiningOp.getBase();
     Location loc = op.getLoc();
diff --git a/test/BF16/Integration/amx/vector-contract-to-amx-gemm-mp.mlir b/test/BF16/Integration/amx/vector-contract-to-amx-gemm-mp.mlir
@@ -0,0 +1,32 @@
+// RUN: tpp-run -e optimal_blocking --entry-point-result=void -print --splat-to-random --init-type normal  -seed 123  %s > %t.1
+// RUN: tpp-opt %s  --tile-brgemm-linalg="registerBlocking=48,16,32"  --loop-invariant-code-motion --vectorization-pass --hoist-vector-transfer --vector-contract-to-amx | tpp-run -e optimal_blocking --entry-point-result=void -print  --splat-to-random --init-type normal  -seed 123  > %t.2
+// RUN: fpcmp -r 0.001 %t.1 %t.2
+
+func.func @optimal_blocking(%arg0: memref<1x48x16x2xbf16>, %arg1: memref<1x16x16x2xbf16>, %arg2: memref<48x16xf32>) -> memref<48x16xf32> {
+    linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)>], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : memref<1x48x16x2xbf16>, memref<1x16x16x2xbf16>) outs(%arg2 : memref<48x16xf32>) {
+    ^bb0(%in: bf16, %in_1: bf16, %out: f32):
+      %a = arith.extf %in : bf16 to f32
+      %b = arith.extf %in_1 : bf16 to f32
+      %1 = arith.mulf %a, %b : f32
+      %2 = arith.addf %out, %1 : f32
+      linalg.yield %2 : f32
+    }
+  return %arg2 : memref<48x16xf32>
+}
+
+
+// RUN: tpp-run -e optimal_blocking_1x3 --entry-point-result=void -print --splat-to-random --init-type normal  -seed 123  %s > %t.1
+// RUN: tpp-opt %s  --tile-brgemm-linalg="registerBlocking=16,48,32"  --loop-invariant-code-motion --vectorization-pass --hoist-vector-transfer --vector-contract-to-amx | tpp-run -e optimal_blocking_1x3 --entry-point-result=void -print  --splat-to-random --init-type normal  -seed 123  > %t.2
+// RUN: fpcmp -r 0.001 %t.1 %t.2
+
+func.func @optimal_blocking_1x3(%arg0: memref<1x16x16x2xbf16>, %arg1: memref<1x16x48x2xbf16>, %arg2: memref<16x48xf32>) -> memref<16x48xf32> {
+    linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)>], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : memref<1x16x16x2xbf16>, memref<1x16x48x2xbf16>) outs(%arg2 : memref<16x48xf32>) {
+    ^bb0(%in: bf16, %in_1: bf16, %out: f32):
+      %a = arith.extf %in : bf16 to f32
+      %b = arith.extf %in_1 : bf16 to f32
+      %1 = arith.mulf %a, %b : f32
+      %2 = arith.addf %out, %1 : f32
+      linalg.yield %2 : f32
+    }
+  return %arg2 : memref<16x48xf32>
+}
diff --git a/test/Passes/pass-vector-contract-to-amx.mlir b/test/Passes/pass-vector-contract-to-amx.mlir
@@ -204,6 +204,41 @@ func.func @entry(%arg0: memref<8x32x32x32xbf16>, %arg1: memref<2x32x16x32x2xbf16
 
 // -----
 
+func.func @optimal_register_blocking_3x1(%arg0: memref<1x48x16x2xbf16>, %arg1: memref<1x16x16x2xbf16>, %arg2: memref<48x16xf32>) -> memref<48x16xf32> {
+  %0 = ub.poison : f32
+  %1 = ub.poison : bf16
+  %c0 = arith.constant 0 : index
+  %c48 = arith.constant 48 : index
+  %c16 = arith.constant 16 : index
+  %c1 = arith.constant 1 : index
+  scf.for %arg3 = %c0 to %c48 step %c48 {
+    scf.for %arg4 = %c0 to %c16 step %c16 {
+      %subview = memref.subview %arg2[%arg3, %arg4] [48, 16] [1, 1] : memref<48x16xf32> to memref<48x16xf32, strided<[16, 1], offset: ?>>
+      %2 = vector.transfer_read %subview[%c0, %c0], %0 {in_bounds = [true, true]} : memref<48x16xf32, strided<[16, 1], offset: ?>>, vector<48x16xf32>
+      %3 = scf.for %arg5 = %c0 to %c1 step %c1 iter_args(%arg6 = %2) -> (vector<48x16xf32>) {
+        %4 = scf.for %arg7 = %c0 to %c16 step %c16 iter_args(%arg8 = %arg6) -> (vector<48x16xf32>) {
+          %subview_0 = memref.subview %arg0[%arg5, %arg3, %arg7, 0] [1, 48, 16, 2] [1, 1, 1, 1] : memref<1x48x16x2xbf16> to memref<1x48x16x2xbf16, strided<[1536, 32, 2, 1], offset: ?>>
+          %subview_1 = memref.subview %arg1[%arg5, %arg7, %arg4, 0] [1, 16, 16, 2] [1, 1, 1, 1] : memref<1x16x16x2xbf16> to memref<1x16x16x2xbf16, strided<[512, 32, 2, 1], offset: ?>>
+          %5 = vector.transfer_read %subview_0[%c0, %c0, %c0, %c0], %1 {in_bounds = [true, true, true, true]} : memref<1x48x16x2xbf16, strided<[1536, 32, 2, 1], offset: ?>>, vector<1x48x16x2xbf16>
+          %6 = vector.transfer_read %subview_1[%c0, %c0, %c0, %c0], %1 {in_bounds = [true, true, true, true]} : memref<1x16x16x2xbf16, strided<[512, 32, 2, 1], offset: ?>>, vector<1x16x16x2xbf16>
+          %7 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2, d3, d4) -> (d0, d2, d4, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d0, d4, d3, d1)>, affine_map<(d0, d1, d2, d3, d4) -> (d2, d3)>], iterator_types = ["reduction", "reduction", "parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %arg8 : vector<1x48x16x2xbf16>, vector<1x16x16x2xbf16> into vector<48x16xf32>
+          scf.yield %7 : vector<48x16xf32>
+        }
+        scf.yield %4 : vector<48x16xf32>
+      }
+      vector.transfer_write %3, %subview[%c0, %c0] {in_bounds = [true, true]} : vector<48x16xf32>, memref<48x16xf32, strided<[16, 1], offset: ?>>
+    }
+  }
+  return %arg2 : memref<48x16xf32>
+}
+
+// CHECK-LABEL:   func.func @optimal_register_blocking_3x1
+// CHECK-COUNT-3:     amx.tile_load
+// CHECK-COUNT-3:     amx.tile_mulf
+// CHECK-COUNT-3:     amx.tile_store
+
+// -----
+
 // This tests shows the lowering of a mixed precision vector.contract
 // (i8 x i8 -> i32) to AMX dialect.
 func.func @entry(%arg0: memref<4x16x64x64xi8>, %arg1: memref<16x16x16x64x4xi8>, %arg2: memref<4x16x64x64xi32>) {