[LoadStoreOpToLLVM] Check pitch HW restriction before generating 2d block load (#4829)

whitneywhtsang · web-flow · commit 94312430fb21 · 2025-08-02T11:54:42.000-04:00
Prevent the generation of 2D block loads when the pitch is a compile-time constant that violates the hardware restriction (the pitch must be at least 64 bytes). BMG CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16685703178 Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/python/test/unit/intel/test_block_load.py b/python/test/unit/intel/test_block_load.py
@@ -16,9 +16,6 @@
 @pytest.mark.xfail(not torch.xpu.get_device_capability()['has_subgroup_2d_block_io'],
                    reason="Block loads not supported on this architecture")
 def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pathlib.Path):
-    if transpose and N == 8:
-        pytest.xfail("Pitch = 8 is not allowed by block IO")
-
     # modify the layouts to ensure the correct OCL/SPIRV intrinsic is called for each datatype
     if dtype_str == "int8":
         A_width = 2
diff --git a/test/TritonIntelGPU/blockptr_load.mlir b/test/TritonIntelGPU/blockptr_load.mlir
@@ -137,7 +137,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32,
     // CHECK:           %[[OFFSET_1:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[WIDTH_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[HEIGHT_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
-    // CHECK:           %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
+    // CHECK:           %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue %[[VAL_12]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[COL_STRIDE_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[BASE:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[ROW_STRIDE_i32:.*]] = llvm.trunc %[[ROW_STRIDE_i64]] : i64 to i32
@@ -200,7 +200,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32,
     // CHECK:           %[[OFFSET_1:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[WIDTH_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[HEIGHT_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
-    // CHECK:           %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
+    // CHECK:           %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue %[[VAL_11]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[COL_STRIDE_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[BASE:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
     // CHECK:           %[[ROW_STRIDE_i32:.*]] = llvm.trunc %[[ROW_STRIDE_i64]] : i64 to i32
diff --git a/test/TritonIntelGPU/subgroup-2d-block-io.mlir b/test/TritonIntelGPU/subgroup-2d-block-io.mlir
@@ -8,7 +8,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 16 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -29,7 +29,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 16 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -50,7 +50,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 16 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -71,7 +71,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 16 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -92,7 +92,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 32 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -113,7 +113,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 32 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -134,7 +134,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 32 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -155,7 +155,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 32 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -176,7 +176,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 64 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -197,7 +197,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 64 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -218,7 +218,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 64 : i64
-        %N_i64 = arith.constant 16 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -239,7 +239,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 64 : i64
-        %N_i64 = arith.constant 32 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -260,7 +260,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 64 : i64
-        %N_i64 = arith.constant 32 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -281,7 +281,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 64 : i64
-        %N_i64 = arith.constant 32 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -302,7 +302,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 128 : i64
-        %N_i64 = arith.constant 32 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -323,7 +323,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 256 : i64
-        %N_i64 = arith.constant 32 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -344,7 +344,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 256 : i64
-        %N_i64 = arith.constant 32 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
@@ -365,7 +365,7 @@ module attributes {ttig.min_sg_size = 16 : i32, ttig.support_bf16_conversion, tt
     tt.func public @subgroup_2d_block_load(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16: i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16: i32}) attributes {noinline = false} {
         %0 = tt.get_program_id x : i32
         %M_i64 = arith.constant 256 : i64
-        %N_i64 = arith.constant 32 : i64
+        %N_i64 = arith.constant 64 : i64
         %c1_i64 = arith.constant 1 : i64
         %c0_i32 = arith.constant 0 : i32
 
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -13,6 +13,7 @@
 
 #include "intel/include/Dialect/TritonIntelGPU/IR/Attributes.h"
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
+#include "intel/include/Utils/Utility.h"
 #include "triton/Tools/LinearLayout.h"
 #include <optional>
 #include <triton/Tools/Sys/GetEnv.hpp>
@@ -1540,6 +1541,21 @@ struct LoadOpToBlockIOConversion
       pitch = b.trunc(i32_ty, colStride);
       std::swap(baseWidth, baseHeight);
     }
+    // HW requires the pitch to be at least 64 bytes.
+    std::function<Value(Value)> skipTrunc = [&](Value v) {
+      if (dyn_cast_or_null<LLVM::TruncOp>(v.getDefiningOp()))
+        return skipTrunc(v.getDefiningOp()->getOperand(0));
+      return v;
+    };
+    if (Operation *op = skipTrunc(pitch).getDefiningOp()) {
+      std::optional<int64_t> pitchConst =
+          mlir::triton::intel::getFoldedConstantValue(op);
+      if (pitchConst.has_value()) {
+        if ((*pitchConst * elemSizeInBits / 8) < 64)
+          return failure();
+      }
+    }
+
     baseWidth = b.trunc(i32_ty, baseWidth);
     baseHeight = b.trunc(i32_ty, baseHeight);
 
diff --git a/third_party/intel/lib/Utils/Utility.cpp b/third_party/intel/lib/Utils/Utility.cpp
@@ -119,16 +119,7 @@ std::optional<int64_t> getFoldedConstantValue(Operation *op) {
   if (results.size() != 1)
     return std::nullopt;
 
-  std::optional<int64_t> intAttr = getIntAttr(results[0]);
-  if (intAttr.has_value())
-    return intAttr.value();
-
-  auto val = cast<Value>(results[0]);
-  auto constOp = val.getDefiningOp<arith::ConstantOp>();
-  if (!constOp)
-    return std::nullopt;
-
-  return getIntAttr(constOp.getValue());
+  return getConstantIntValue(results[0]);
 }
 
 bool isConstant(Value val, int64_t expected) {