[TritonGEN] Update 2D block verifier (#4644)

whitneywhtsang · web-flow · commit f4764a997d80 · 2025-07-09T03:07:38.000Z
Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path_invalid.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path_invalid.mlir
@@ -5,7 +5,7 @@ module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-war
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
     %22 = tt.make_tensor_ptr %arg0, [%arg1, %arg1], [%arg1, %c1_i64], [%arg2, %c0_i32] {order = array<i32: 1, 0>} : <tensor<2x32xf32>>
-    // expected-error @+2 {{tile_width for 32 bit elements should be equal to 8 or 16}}
+    // expected-error @+2 {{expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
     // expected-error @+1 {{failed to legalize operation 'ttig.prefetch'}}
     ttig.prefetch %22 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr<tensor<2x32xf32>>
     tt.return
@@ -19,7 +19,7 @@ module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-war
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
     %22 = tt.make_tensor_ptr %arg0, [%arg1, %arg1], [%arg1, %c1_i64], [%arg2, %c0_i32] {order = array<i32: 1, 0>} : <tensor<2x32xf32>>
-    // expected-error @+2 {{expecting tile_width to be between 1 and 16}}
+    // expected-error @+2 {{expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
     // expected-error @+1 {{failed to legalize operation 'tt.load'}}
     %res = tt.load %22 {DotIdx = 0 : i32, boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<2x32xf32>>
     tt.return
diff --git a/test/TritonGEN/tritongen-invalid.mlir b/test/TritonGEN/tritongen-invalid.mlir
@@ -170,7 +170,7 @@ llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height
 // -----
 
 llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
-  // expected-error @+1 {{'triton_gen.2Dblockload' op tile_width * v_blocks should be less than or equal to 64 for 8 bit elements}}
+  // expected-error @+1 {{'triton_gen.2Dblockload' op expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
   %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<32xi16>
   llvm.return
 }
@@ -523,23 +523,15 @@ llvm.func @matrix_2Dblockprefetch(%ptr : !llvm.ptr, %base_width : i32, %base_hei
 // -----
 
 llvm.func @matrix_2Dblockprefetch(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
-  // expected-error @+1 {{'triton_gen.2Dblockprefetch' op tile_width for 16 bit elements should be equal to 16}}
-  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=32, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
-  llvm.return
-}
-
-// -----
-
-llvm.func @matrix_2Dblockprefetch(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
-  // expected-error @+1 {{'triton_gen.2Dblockprefetch' op tile_width for 8 bit elements should be equal to 16 or 32}}
-  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=8, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
+  // expected-error @+1 {{'triton_gen.2Dblockprefetch' op expecting elem_size_in_bits * tile_width * v_blocks <= 512}}
+  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=32, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
   llvm.return
 }
 
 // -----
 
 llvm.func @matrix_2Dblockprefetch(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
-  // expected-error @+1 {{'triton_gen.2Dblockprefetch' op tile_width for 32 bit elements should be equal to 8 or 16}}
-  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=32, tile_height=8, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
+  // expected-error @+1 {{'triton_gen.2Dblockprefetch' op expecting tile_width to be between 4 and 64}}
+  triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=1, tile_height=32, v_blocks=1, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32)
   llvm.return
 }
diff --git a/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp b/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp
@@ -86,6 +86,40 @@ static LogicalResult verify2DBlockAddressPayloadRestriction(Op op) {
   return success();
 }
 
+/// Verifies the tile_width / v_blocks limits shared by the 2D block load and
+/// prefetch verifiers; emits an op error on the first limit that is violated.
+template <typename Op> static LogicalResult verify2DBlockHWRestriction(Op op) {
+  static_assert(llvm::is_one_of<Op, TritonGEN::Matrix2DBlockLoadOp,
+                                TritonGEN::Matrix2DBlockPrefetchOp>::value,
+                "Unexpected template parameter");
+  uint32_t elemSizeInBits = op.getElemSizeInBits();
+  uint32_t tileWidth = op.getTileWidth();
+  uint32_t vBlocks = op.getVBlocks();
+  if (elemSizeInBits * tileWidth * vBlocks > 512)
+    return op.emitOpError(
+        "expecting elem_size_in_bits * tile_width * v_blocks <= 512");
+  switch (elemSizeInBits) {
+  case 8:
+    if (tileWidth < 4 || tileWidth > 64)
+      return op.emitOpError("expecting tile_width to be between 4 and 64");
+    break;
+  case 16:
+    if (tileWidth < 2 || tileWidth > 32)
+      return op.emitOpError("expecting tile_width to be between 2 and 32");
+    break;
+  case 32:
+    if (tileWidth < 1 || tileWidth > 16)
+      return op.emitOpError("expecting tile_width to be between 1 and 16");
+    if (vBlocks != 1 && vBlocks != 2)
+      return op.emitOpError("v_blocks for 32 bit elements should be 1 or 2");
+    break;
+  default:
+    // A verifier must diagnose an unsupported size rather than assume it away.
+    return op.emitOpError("expecting elem_size_in_bits to be 8, 16, or 32");
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // gen.matrix.dpas
 //===----------------------------------------------------------------------===//
@@ -202,49 +236,8 @@ verify2DBlockLoadHWRestriction(TritonGEN::Matrix2DBlockLoadOp op) {
     return op.emitOpError(
         "transpose and vnni_transform are mutually exclusive");
 
-  if (!op.getTranspose() && !op.getVnniTransform()) {
-    uint32_t tileWidth = op.getTileWidth();
-    uint32_t vBlocks = op.getVBlocks();
-    switch (op.getElemSizeInBits()) {
-    case 8:
-      if (tileWidth < 4 || tileWidth > 64)
-        return op.emitOpError("expecting tile_width to be between 4 and 64");
-      if (tileWidth * vBlocks > 64)
-        return op.emitOpError(
-            "tile_width * v_blocks should be less than or equal "
-            "to 64 for 8 bit elements");
-      break;
-    case 16:
-      if (tileWidth < 2 || tileWidth > 32)
-        return op.emitOpError("expecting tile_width to be between 2 and 32");
-      if (tileWidth * vBlocks > 32)
-        return op.emitOpError(
-            "tile_width * v_blocks should be less than or equal "
-            "to 32 for 16 bit elements");
-      break;
-    case 32:
-      if (tileWidth < 1 || tileWidth > 16)
-        return op.emitOpError("expecting tile_width to be between 1 and 16");
-      if (vBlocks != 1 && vBlocks != 2)
-        return op.emitOpError("expecting v_blocks to be 1 or 2");
-      if (tileWidth * vBlocks > 16)
-        return op.emitOpError(
-            "tile_width * v_blocks should be less than or equal "
-            "to 16 for 32 bit elements");
-      break;
-    case 64:
-      if (tileWidth < 1 || tileWidth > 8)
-        return op.emitOpError("expecting tile_width to be between 1 and 8");
-      if (vBlocks != 1)
-        return op.emitOpError("expecting v_blocks to be 1");
-      break;
-    default:
-      return op.emitOpError(
-          "expecting elem_size_in_bits to be 8, 16, 32, or 64");
-    }
-
-    return success();
-  }
+  if (!op.getTranspose() && !op.getVnniTransform())
+    return verify2DBlockHWRestriction(op);
 
   if (op.getTranspose()) {
     assert(!op.getVnniTransform() &&
@@ -411,26 +404,5 @@ LogicalResult TritonGEN::Matrix2DBlockPrefetchOp::verify() {
   if (verify2DBlockAddressPayloadRestriction(*this).failed())
     return failure();
 
-  uint32_t tileWidth = getTileWidth();
-  switch (getElemSizeInBits()) {
-  case 8:
-    if (tileWidth != 16 && tileWidth != 32)
-      return emitOpError("tile_width for 8 bit elements should be equal to "
-                         "16 or 32");
-    break;
-  case 16:
-    if (tileWidth != 16)
-      return emitOpError("tile_width for 16 bit elements should be equal "
-                         "to 16");
-    break;
-  case 32:
-    if (tileWidth != 8 && tileWidth != 16)
-      return emitOpError(
-          "tile_width for 32 bit elements should be equal to 8 or 16");
-    break;
-  default:
-    llvm_unreachable("unexpected element size");
-  }
-
-  return success();
+  return verify2DBlockHWRestriction(*this);
 }