@@ -706,8 +706,6 @@ struct PrefetchOpConversion
     Attribute blockIOAttr =
         op->getAttr(TritonIntelGPUDialect::getBlockIOAttrName());
     if (!blockIOAttr) {
-      // TODO: Fallback to gather semantic prefetching. Simply erase the
-      // prefetching op which is not supported for now.
       rewriter.eraseOp(op);
       return success();
     }
@@ -727,33 +725,20 @@ struct PrefetchOpConversion
       // Swap the shape to make it row major and then get the tiling
       // size base on row major shape.
       std::swap(tensorShape[0], tensorShape[1]);
-
-      // Create the new tensor type with swapped row and col.
-      tensorType = RankedTensorType::get(
-          tensorShape, tensorType.getElementType(), tensorType.getEncoding());
     }
-
     unsigned numWarps = triton::gpu::lookupNumWarps(op);
 
-    SmallVector<unsigned, 2> shapePerWarp =
-        get2DPrefetchShapePerWarp(tensorType);
+    auto [tileHeightInElem, tileWidthInElem, warpsM, warpsN] =
+        get2DPrefetchWarpsPerCTA(tensorShape, eltTy, numWarps);
 
-    SmallVector<unsigned, 2> warpsPerCTA =
-        getWarpsPerCTA(tensorShape, shapePerWarp, numWarps);
+    auto llEncoding = getLinearLayout(
+        tensorShape, {tileHeightInElem, tileWidthInElem}, {warpsM, warpsN});
 
-    // To adjust the row shape per warp to fit the tensor shape and avoid
-    // duplication in prefetching.
-    unsigned factor =
-        mlir::ceil(shapePerWarp[0] * warpsPerCTA[0], (unsigned)tensorShape[0]);
-    shapePerWarp[0] = mlir::ceil(shapePerWarp[0], factor);
-
-    SmallVector<int64_t> numReps = {
-        mlir::ceil<int64_t>(tensorShape[0], shapePerWarp[0] * warpsPerCTA[0]),
-        mlir::ceil<int64_t>(tensorShape[1], shapePerWarp[1] * warpsPerCTA[1])};
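+    // Each tile is one 2D block prefetch; the tiles are divided evenly over
+    // all warps. Note: this is integer division, so it assumes the tensor
+    // size is a multiple of tileSizeInElem * numWarps.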
+    unsigned tileSizeInElem = tileHeightInElem * tileWidthInElem;
+    unsigned numTilesPerWarp =
+        (tensorShape[0] * tensorShape[1]) / (tileSizeInElem * numWarps);
 
     unsigned elemSizeInBits = eltTy.getIntOrFloatBitWidth();
-    unsigned tileWidthInElem = shapePerWarp[1];
-    unsigned tileHeightInElem = shapePerWarp[0];
     unsigned vBlocks = 1;
     switch (elemSizeInBits) {
     case 8:
@@ -774,12 +759,6 @@ struct PrefetchOpConversion
       break;
     }
 
-    Value warpId = rewriter.create<arith::IndexCastOp>(
-        loc, i32_ty,
-        rewriter.create<mlir::gpu::SubgroupIdOp>(loc, /*upperBound=*/nullptr));
-    SmallVector<Value> multiDimWarpId =
-        mlir::LLVM::delinearize(rewriter, loc, warpId, warpsPerCTA, {1, 0});
-
     auto [base, baseWidth, baseHeight, rowStride, colStride, offsetBaseX,
           offsetBaseY] =
         getValuesFromBlockPointerStruct(adaptor.getPtr(), rewriter);
@@ -788,6 +767,7 @@ struct PrefetchOpConversion
       // Swap the width/height and strides to the row major.
       std::swap(baseWidth, baseHeight);
       std::swap(colStride, rowStride);
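+      // The base offsets from the block pointer are also in the original
+      // (column-major) order, so swap them to keep X/Y consistent with the
+      // row-major view used below.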
+      std::swap(offsetBaseX, offsetBaseY);
     }
 
     baseWidth = b.mul(baseWidth, b.i64_val(eltTy.getIntOrFloatBitWidth() / 8));
@@ -799,46 +779,43 @@ struct PrefetchOpConversion
         b.mul(rowStride, b.i64_val(eltTy.getIntOrFloatBitWidth() / 8));
     rowStrideInBytes = b.trunc(i32_ty, rowStrideInBytes);
 
-    for (int row = 0; row < numReps[0]; ++row) {
-      for (int col = 0; col < numReps[1]; ++col) {
-        Value offsetX, offsetY;
-        offsetX = b.add(
-            // the offset of this warp.
-            b.mul(multiDimWarpId[1], b.i32_val(shapePerWarp[1])),
-            // add the replica offset with a warp stride.
-            b.i32_val(col * warpsPerCTA[1] * shapePerWarp[1]));
-        // Round the offset into to the tensor shape
-        offsetX = b.urem(offsetX, b.i32_val(tensorShape[1]));
-        offsetX = b.add(offsetX, offsetBaseX);
-        offsetY = b.add(
-            // the offset of this warp.
-            b.mul(multiDimWarpId[0], b.i32_val(shapePerWarp[0])),
-            // add the replica offset with a warp stride.
-            b.i32_val(row * warpsPerCTA[0] * shapePerWarp[0]));
-        // Round the offset into to the tensor shape
-        offsetY = b.urem(offsetY, b.i32_val(tensorShape[0]));
-        offsetY = b.add(offsetY, offsetBaseY);
-
-        auto newOp = rewriter.create<TritonGEN::Matrix2DBlockPrefetchOp>(
-            loc,
-            /*ptr*/ base,
-            /*base_width*/ baseWidth,
-            /*base_height*/ baseHeight,
-            /*base_pitch*/ rowStrideInBytes,
-            /*x*/ offsetX,
-            /*y*/ offsetY,
-            /*elem_size_in_bits*/ elemSizeInBits,
-            /*tile_width*/ tileWidthInElem,
-            /*tile_height*/ tileHeightInElem,
-            /*v_blocks*/ vBlocks,
-            /*cache_opt*/ TritonGEN::LoadCacheControl::L1C_L3C);
-        if (failed(newOp.verify())) {
-          // delete the op so that the verifier will not abort the pass
-          // pipeline later, as we can fail this path and try a different
-          // approach.
-          rewriter.eraseOp(newOp);
-          return failure();
-        }
+    MLIRContext *ctx = getContext();
+    StringAttr kOffset = S("offset");
+    StringAttr kWarp = S("warp");
+    StringAttr kBlock = S("block");
+
+    Value warpId = rewriter.create<arith::IndexCastOp>(
+        loc, i32_ty,
+        rewriter.create<mlir::gpu::SubgroupIdOp>(loc,
+                                                 /*upperBound=*/nullptr));
+
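+    // Walk the tiles owned by this warp. The linear layout maps the flat
+    // element offset and the warp id to (row, col) coordinates in the tensor:
+    // result dim 0 is the row (Y) and result dim 1 the column (X).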
+    for (unsigned tile = 0; tile < numTilesPerWarp; ++tile) {
+      unsigned off = tile * tileSizeInElem;
+      auto offsets = applyLinearLayout(
+          loc, rewriter, llEncoding,
+          {{kOffset, b.i32_val(off)}, {kWarp, warpId}, {kBlock, b.i32_val(0)}});
+      Value offsetX = b.add(offsets[1].second, offsetBaseX);
+      Value offsetY = b.add(offsets[0].second, offsetBaseY);
+
+      auto newOp = rewriter.create<TritonGEN::Matrix2DBlockPrefetchOp>(
+          loc,
+          /*ptr*/ base,
+          /*base_width*/ baseWidth,
+          /*base_height*/ baseHeight,
+          /*base_pitch*/ rowStrideInBytes,
+          /*x*/ offsetX,
+          /*y*/ offsetY,
+          /*elem_size_in_bits*/ elemSizeInBits,
+          /*tile_width*/ tileWidthInElem,
+          /*tile_height*/ tileHeightInElem,
+          /*v_blocks*/ vBlocks,
+          /*cache_opt*/ TritonGEN::LoadCacheControl::L1C_L3C);
+      if (failed(newOp.verify())) {
+        // delete the op so that the verifier will not abort the pass
+        // pipeline later, as we can fail this path and try a different
+        // approach.
+        rewriter.eraseOp(newOp);
+        return failure();
+      }
     }
 
@@ -1050,6 +1027,58 @@ struct PrefetchOpConversion
     rewriter.eraseOp(op);
     return success();
   }
+
+private:
+  // The tensor shape has to be in row-major order.
+  // Returns:
+  //   the prefetch tile shape per op in {M, N},
+  //   followed by the warps per CTA in {M, N}.
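+  //
+  // For example, a 256x64 fp16 tensor with 8 warps gives
+  // numColsPerPrefOps = min(64, 64/2) = 32, repNumN = 2, warpsNumN = 2,
+  // warpsNumM = 4, and 256/4 = 64 rows per warp clamped to 32, so the
+  // result is {32, 32, 4, 2}.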
+  std::tuple<unsigned, unsigned, unsigned, unsigned>
+  get2DPrefetchWarpsPerCTA(const ArrayRef<int64_t> tensorShape, Type eltTy,
+                           unsigned numWarps) const {
+    unsigned rank = tensorShape.size();
+    assert(rank >= 2 && "Only rank >= 2 tensor is supported for now");
+    unsigned dimM = rank - 2, dimN = rank - 1;
+    unsigned elemSizeInBits = eltTy.getIntOrFloatBitWidth();
+    unsigned elemSizeInBytes = elemSizeInBits / 8;
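+    // A single 2D block prefetch message covers at most 64 bytes per row, so
+    // cap the number of columns handled by one prefetch op accordingly.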
+    constexpr unsigned maxBytesPerRow = 64;
+    unsigned numColsPerPrefOps =
+        std::min<unsigned>(tensorShape[dimN], maxBytesPerRow / elemSizeInBytes);
+
+    unsigned repNumN =
+        mlir::ceil((unsigned)tensorShape[dimN], numColsPerPrefOps);
+    unsigned warpsNumN = std::min(numWarps, repNumN);
+    unsigned warpsNumM = mlir::ceil(numWarps, warpsNumN);
+
+    // Split the rows across the warps along M so the tensor is covered
+    // without duplicated prefetches, and clamp to the 32-row tile height.
+    unsigned rowNumPerWarp = mlir::ceil<unsigned>(tensorShape[dimM], warpsNumM);
+    unsigned numRowsPerPrefOps = std::min<unsigned>(rowNumPerWarp, 32);
+    SmallVector<unsigned, 2> tilePerPrefOps{numRowsPerPrefOps,
+                                            numColsPerPrefOps};
+
+    return {numRowsPerPrefOps, numColsPerPrefOps, warpsNumM, warpsNumN};
+  }
+
+  // Get the linear layout for the cooperative prefetching.
+  LinearLayout getLinearLayout(const ArrayRef<int64_t> tensorShape,
+                               const ArrayRef<unsigned> tileShape,
+                               const ArrayRef<unsigned> warpsPerCTA) const {
+    MLIRContext *ctx = getContext();
+    unsigned rank = warpsPerCTA.size();
+    assert(rank >= 2 && "Only rank >= 2 tensor is supported for now");
+    SmallVector<unsigned> order(rank);
+    for (size_t i = 0; i < warpsPerCTA.size(); ++i) {
+      // The fastest-changing dim comes first.
+      order[i] = rank - i - 1;
+    }
+    LinearLayout ctaLayout = identityStandardND(S("offset"), tileShape, order) *
+                             identityStandardND(S("warp"), warpsPerCTA, order);
+
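+    // combineCtaCgaWithShape repeats the per-CTA (tile x warp) layout as many
+    // times as needed to cover the full tensorShape; the CGA ("block")
+    // dimension is trivial here.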
+    return combineCtaCgaWithShape(std::move(ctaLayout),
+                                  CTALayoutAttr::getDefault(ctx, rank),
+                                  tensorShape);
+  }
 };
 
 struct LoadOpToBlockIOConversion