[GEMM] Allow 2D block io when M == 1 (#4540)

whitneywhtsang · web-flow · commit ccd14a95b84e · 2025-06-19T11:23:15.000-04:00
When M is 1, every element of `offs_am` is 0, so `a_ptrs` has stride `[0, 1]`:

```
offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
```

This PR changes the MaterializeBlockPointer pass to allow a stride of 0. This further improves GEMM tensor-of-pointers performance by 8%.

![Screenshot 2025-06-19 102705](https://github.com/user-attachments/assets/d5197a85-a188-4e1f-8db6-9549bc566564)

CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/15748650337

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/test/TritonIntelGPU/materialize-block-pointer.mlir b/test/TritonIntelGPU/materialize-block-pointer.mlir
@@ -135,3 +135,20 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+// COM: Ensure pointer with stride [0, 1] is considered as row major.
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 4], warpsPerCTA = [32, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32, ttig.support_sg_2d_block} {
+  tt.func public @tensor_of_ptr(%arg0: !tt.ptr<bf16> {tt.divisibility = 16 : i32}) {
+    %18 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %19 = tt.expand_dims %18 {axis = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked>
+    %20 = tt.splat %arg0 : !tt.ptr<bf16> -> tensor<1x32x!tt.ptr<bf16>, #blocked>
+    %21 = tt.addptr %20, %19 : tensor<1x32x!tt.ptr<bf16>, #blocked>, tensor<1x32xi32, #blocked>
+    %22 = tt.broadcast %21 : tensor<1x32x!tt.ptr<bf16>, #blocked> -> tensor<256x32x!tt.ptr<bf16>, #blocked>
+    // CHECK: tt.load {{.*}} {ttig.block_io = "row_major"}
+    %50 = tt.load %22 : tensor<256x32x!tt.ptr<bf16>, #blocked>
+    tt.return
+  }
+}
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -139,7 +139,7 @@ struct LoadStoreConversionBase {
       const triton::intel::ModuleAxisInfoAnalysis &axisAnalysisPass)
       : targetInfo(targetInfo), axisAnalysisPass(axisAnalysisPass) {}
 
-  unsigned getStride(Value ptr, unsigned dim) const {
+  int getStride(Value ptr, unsigned dim) const {
     AxisInfo *axisInfo =
         const_cast<triton::intel::ModuleAxisInfoAnalysis &>(axisAnalysisPass)
             .getAxisInfo(ptr);
@@ -349,8 +349,12 @@ struct BlockIOConversionBase : public LoadStoreConversionBase {
     Location loc = ptr.getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
 
-    unsigned stride = getStride(ptr, 0);
-    if (stride != -1)
+    int stride = getStride(ptr, 0);
+    // If the stride is 0, we assume a minimum pitch of 64 bytes.
+    constexpr int MIN_PITCH = 64;
+    if (stride == 0)
+      return b.i32_val(MIN_PITCH);
+    else if (stride != -1)
       return b.i32_val(stride * elemSizeInBits / 8);
 
     // ptrs[{0, 0}] and ptrs[{1, 0}] are currently used to calculate the
@@ -685,7 +689,9 @@ struct PrefetchOpConversion
     if (!rowStrideInBytes)
       return failure();
 
-    Value baseHeight = b.i32_val(tileHeightInElem);
+    // If the stride is 0, we want to load only the first row.
+    int stride = getStride(op.getPtr(), 0);
+    Value baseHeight = b.i32_val(stride == 0 ? 1 : tileHeightInElem);
     Value offsetBaseX = b.i32_val(0);
     Value offsetBaseY = b.i32_val(0);
 
@@ -1140,7 +1146,10 @@ struct LoadOpToBlockIOConversion
     if (!pitch)
       return failure();
 
-    Value baseHeight = b.i32_val(tileHeight);
+    // If the stride is 0, we want to load only the first row.
+    int stride = getStride(ptr, 0);
+    Value baseHeight = b.i32_val(stride == 0 ? 1 : tileHeight);
+
     StringAttr kRegister = str_attr("register");
     StringAttr kLane = str_attr("lane");
     StringAttr kWarp = str_attr("warp");
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp
@@ -186,9 +186,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       }
 
       // Value -1 is used to represent the unknown stride.
-      if (axisInfo->getStride(otherDim) <= 0) {
-        LDBG("Found unknown or non positive stride: "
-             << axisInfo->getStride(otherDim));
+      if (axisInfo->getStride(otherDim) < 0) {
+        LDBG("Found unknown stride: " << axisInfo->getStride(otherDim));
         return false;
       }
 

Original file line number	Diff line number	Diff line change
`@@ -186,9 +186,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass`
`186`	`186`	`}`
`187`	`187`
`188`	`188`	`// Value -1 is used to represent the unknown stride.`
`189`		`- if (axisInfo->getStride(otherDim) <= 0) {`
`190`		`- LDBG("Found unknown or non positive stride: "`
`191`		`- << axisInfo->getStride(otherDim));`
	`189`	`+ if (axisInfo->getStride(otherDim) < 0) {`
	`190`	`+ LDBG("Found unknown stride: " << axisInfo->getStride(otherDim));`
`192`	`191`	`return false;`
`193`	`192`	`}`
`194`	`193`