Fix issue in prefetching column major matrix. (#4611)

chengjunlu · web-flow · commit c104666d02c2 · 2025-07-14T09:03:57.000-04:00
The prefetch lowering used the wrong shape to compute the tiling shape for
column-major matrices: the tensor shape was swapped to row-major, but the
tensor type was not rebuilt with the swapped shape.

---------

Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir
@@ -73,10 +73,10 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
     // CHECK:     %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32
     // CHECK:     %[[SUB_GROUP_ID_EXT:.*]] = llvm.zext %[[SUB_GROUP_ID_RAW]] : i32 to i64
     // CHECK:     %[[SUB_GROUP_ID:.*]] = llvm.trunc %[[SUB_GROUP_ID_EXT]] : i64 to i32
-    // CHECK:     %[[VAL_18:.*]] = llvm.mlir.constant(2 : i32) : i32
+    // CHECK:     %[[VAL_18:.*]] = llvm.mlir.constant(1 : i32) : i32
     // CHECK:     %[[VAL_19:.*]] = llvm.urem %[[SUB_GROUP_ID]], %[[VAL_18]] : i32
     // CHECK:     %[[VAL_20:.*]] = llvm.udiv %[[SUB_GROUP_ID]], %[[VAL_18]] : i32
-    // CHECK:     %[[CST_8:.*]] = llvm.mlir.constant(4 : i32) : i32
+    // CHECK:     %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32
     // CHECK:     %[[VAL_22:.*]] = llvm.urem %[[VAL_20]], %[[CST_8]] : i32
     // CHECK:     %[[VAL_23:.*]] = llvm.udiv %[[VAL_20]], %[[CST_8]] : i32
     // CHECK:     %[[OFFSET_0:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
@@ -94,20 +94,20 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
     // CHECK:     %[[VAL_24:.*]] = llvm.mul %[[COL_STRIDE_i64]], %[[CST_2]] : i64
     // CHECK:     %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32
     // CHECK:     %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:     %[[CST_32:.*]] = llvm.mlir.constant(16 : i32) : i32
+    // CHECK:     %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
     // CHECK:     %[[VAL_26:.*]] = llvm.mul %[[VAL_19]], %[[CST_32]] : i32
     // CHECK:     %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32
     // CHECK:     %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32
     // CHECK:     %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32
     // CHECK:     %[[COL_MAJOR_OFFSET_X:.*]] = llvm.add %[[VAL_28]], %[[OFFSET_1]] : i32
     // CHECK:     %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK:     %[[CST_2:.*]] = llvm.mlir.constant(4 : i32) : i32
+    // CHECK:     %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32
     // CHECK:     %[[VAL_30:.*]] = llvm.mul %[[VAL_22]], %[[CST_2]] : i32
     // CHECK:     %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32
     // CHECK:     %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32
     // CHECK:     %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32
     // CHECK:     %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.add %[[VAL_32]], %[[OFFSET_0]] : i32
-    // CHECK:     triton_gen.2Dblockprefetch %[[BASE_]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSET_X]], %[[COL_MAJOR_OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 4, v_blocks = 1, cache_control = L1C_L3C}
+    // CHECK:     triton_gen.2Dblockprefetch %[[BASE_]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSET_X]], %[[COL_MAJOR_OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 2, v_blocks = 2, cache_control = L1C_L3C}
     %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x16xf16>>
     ttig.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, ttig.block_io = "column_major"} : !tt.ptr<tensor<32x16xf16>>
 
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -559,6 +559,10 @@ struct PrefetchOpConversion
       // Swap the shape to make it row major and then get the tiling
       // size base on row major shape.
       std::swap(tensorShape[0], tensorShape[1]);
+
+      // Create the new tensor type with swapped row and col.
+      tensorType = RankedTensorType::get(
+          tensorShape, tensorType.getElementType(), tensorType.getEncoding());
     }
 
     unsigned numWarps = triton::gpu::lookupNumWarps(op);