[MaterializeBlockPointer] Handle i64 element type (#4759)

whitneywhtsang · web-flow · commit 9a77cdb41d36 · 2025-07-21T20:42:48.000-04:00
Inductor CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16429761342 (pass)

Fixes #4725

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/test/TritonIntelGPU/materialize-block-pointer.mlir b/test/TritonIntelGPU/materialize-block-pointer.mlir
@@ -185,3 +185,26 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.th
     tt.return
   }
 }
+
+// -----
+
+// COM: Ensure the materialize block pointer pass handles the i64 element type.
+
+#dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
+#dot_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>
+module attributes {"ttg.num-ctas" = 1 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32, ttig.support_sg_2d_block} {
+  // CHECK-LABEL: tt.func public @materialize_block_pointer(
+  tt.func public @materialize_block_pointer(%arg0: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %pitch: i64 {tt.divisibility = 16 : i32}) {
+    %c0_i32 = arith.constant 0 : i32
+    %c0_i64 = arith.constant 0 : i64
+    %c1_i64 = arith.constant 1 : i64
+
+    // CHECK: tt.load {{.*}} {ttig.block_io = "row_major"}
+    %0 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%pitch, %c1_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x32xi64, #dot_a>>
+    %1 = tt.load %0 : !tt.ptr<tensor<64x32xi64, #dot_a>>
+    // CHECK: tt.store {{.*}} {ttig.block_io = "row_major"}
+    tt.store %0, %1 : !tt.ptr<tensor<64x32xi64, #dot_a>>
+
+    tt.return
+  }
+}
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp
@@ -116,7 +116,7 @@ struct TritonIntelGPUMaterializeBlockPointerPass
         Value pitch =
             strides[(strideOneDimVal == rank - 1) ? rank - 2 : rank - 1];
         LDBG("Pitch: " << pitch);
-        if (!ttgi::isDivisible(pitch, 128 / elementWidth))
+        if (!ttgi::isDivisible(pitch, llvm::divideCeil(128, elementWidth)))
           return;
 
         const bool isRowMajor = (strideOneDimVal == rank - 1);
@@ -336,7 +336,7 @@ struct TritonIntelGPUMaterializeBlockPointerPass
     // Analyze the shape of the stride one dimension to ensure it satisfies HW
     // constraints.
     Value baseWidth = tt::intel::getFinalValue(shape[strideOneDimVal]);
-    unsigned divisor = std::ceil(32 / elementWidth);
+    unsigned divisor = llvm::divideCeil(32, elementWidth);
     if (!ttgi::isDivisible(baseWidth, divisor)) {
       LLVM_DEBUG({
         llvm::dbgs() << "baseWidth does not satisfies HW constraint: ";