[mlir][amx] Add write side effect to AMX tile creation ops (#155403)

arun-thmn · web-flow · commit 448811dfc2ab · 2025-08-29T13:41:05.000+02:00
Adds `MemWrite` side effect to `amx.tile_zero` and `amx.tile_load` ops.

The memory write models the hardware populating AMX tiles with the
specified values through the tile zero and tile load ops.
Making the side effect explicit allows multiple op instances to be used
as a compilation hint to use different AMX tile registers. This can
prevent a less efficient lowering through tile store-load copies, as
opposed to directly populating the tiles with values.

To illustrate the trade-off:
Without explicit side effects, `CSE` merges two `amx.tile_zero` ops into
a single op, which lowers to a copy for the second tile:
```
  tilezero        %tmm0
  tilestored      %tmm0, -2032(%rbp,%rbx) # 1024-byte Folded Spill
  tileloadd       -2032(%rbp,%rbx), %tmm1 # 1024-byte Folded Reload
```
By keeping the two `amx.tile_zero` ops and, thus, lowering to two
separate intrinsic invocations, the two tile registers are zeroed out
directly without the additional round trip through memory:
```
  tilezero        %tmm0
  tilezero        %tmm1
```
The same principle applies to `amx.tile_load` ops.
diff --git a/mlir/include/mlir/Dialect/AMX/AMX.td b/mlir/include/mlir/Dialect/AMX/AMX.td
@@ -142,14 +142,17 @@ class AMX_Op<string mnemonic, list<Trait> traits = []> :
 // Tile reset.
 //
 
-def TileZeroOp : AMX_Op<"tile_zero", [Pure,
-    AMXIntrinsicOpInterface
+def TileZeroOp : AMX_Op<"tile_zero", [
+    AMXIntrinsicOpInterface,
+    MemoryEffects<[MemWrite]>
   ]> {
   let summary = "tile zero operation";
   let description = [{
     Zeroes the destination tile, with the shape defined by the 2-dim
     vector type of the result. This is eventually lowered into the
     "tilezero" instruction with the corresponding tile configuration.
+    The op has a memory write effect, so each "tile_zero" operation serves
+    as a compilation hint to use a separate tile register.
 
     Example:
 
@@ -179,15 +182,17 @@ def TileZeroOp : AMX_Op<"tile_zero", [Pure,
 // Tile memory operations.
 //
 
-def TileLoadOp : AMX_Op<"tile_load", [Pure,
-    AMXIntrinsicOpInterface
+def TileLoadOp : AMX_Op<"tile_load", [
+    AMXIntrinsicOpInterface,
+    MemoryEffects<[MemWrite]>
   ]> {
   let summary = "tile load operation";
   let description = [{
     Loads a tile from memory defined by a base and indices, with the
     shape defined by the 2-dim vector type of the result. This is
     eventually lowered into the "tileloadd" instruction with the
-    corresponding tile configuration.
+    corresponding tile configuration. The op has a memory write effect, so
+    each "tile_load" operation serves as a hint to use a separate tile register.
 
     Example:
 
diff --git a/mlir/test/Dialect/AMX/side-effects.mlir b/mlir/test/Dialect/AMX/side-effects.mlir
@@ -0,0 +1,32 @@
+// RUN: mlir-opt %s -cse -convert-vector-to-llvm="enable-amx" | FileCheck %s
+
+// With memory side effects on the ops, CSE is expected not to fold multiple
+// identical "tile_load" and "tile_zero" ops.
+// CHECK-LABEL: do_not_fold_tiles(
+// CHECK: llvm.call_intrinsic "llvm.x86.tilezero.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tilezero.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tileloadd64.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tileloadd64.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tileloadd64.internal"
+// CHECK: llvm.call_intrinsic "llvm.x86.tileloadd64.internal"
+func.func @do_not_fold_tiles(%arg0: memref<2x32x32xbf16>, %arg1: memref<2x16x32xbf16>) -> memref<16x32xf32> {
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c16 = arith.constant 16 : index
+  %alloca = memref.alloca() : memref<16x32xf32>
+  %0 = amx.tile_zero : !amx.tile<16x16xf32>  // Initial value of the first loop-carried tile.
+  %1 = amx.tile_zero : !amx.tile<16x16xf32>  // Textually identical to %0 on purpose: CSE must keep both.
+  %2:2 = scf.for %arg2 = %c0 to %c2 step %c1 iter_args(%arg3 = %0, %arg4 = %1) -> (!amx.tile<16x16xf32>, !amx.tile<16x16xf32>) {
+    %3 = amx.tile_load %arg0[%arg2, %c0, %c0] : memref<2x32x32xbf16> into !amx.tile<16x32xbf16>
+    %4 = amx.tile_load %arg0[%arg2, %c16, %c0] : memref<2x32x32xbf16> into !amx.tile<16x32xbf16>
+    %5 = amx.tile_load %arg1[%arg2, %c0, %c0] : memref<2x16x32xbf16> into !amx.tile<16x32xbf16>
+    %6 = amx.tile_load %arg1[%arg2, %c0, %c0] : memref<2x16x32xbf16> into !amx.tile<16x32xbf16>  // Same operands as %5 on purpose: CSE must keep both.
+    %7 = amx.tile_mulf %3, %5, %arg3 : !amx.tile<16x32xbf16>, !amx.tile<16x32xbf16>, !amx.tile<16x16xf32>
+    %8 = amx.tile_mulf %4, %6, %arg4 : !amx.tile<16x32xbf16>, !amx.tile<16x32xbf16>, !amx.tile<16x16xf32>
+    scf.yield %7, %8 : !amx.tile<16x16xf32>, !amx.tile<16x16xf32>
+  }
+  amx.tile_store %alloca[%c0, %c0], %2#0 : memref<16x32xf32>, !amx.tile<16x16xf32>
+  amx.tile_store %alloca[%c0, %c16], %2#1 : memref<16x32xf32>, !amx.tile<16x16xf32>
+  return %alloca : memref<16x32xf32>
+}