Skip to content

Commit 4e4cbd0

Browse files
committed
Replace the 8xi32 2D block-load tensor-descriptor payload with a plain i64 base pointer.
1 parent e510643 commit 4e4cbd0

File tree

5 files changed

+106
-110
lines changed

5 files changed

+106
-110
lines changed

mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp

Lines changed: 26 additions & 105 deletions
Original file line numberDiff line numberDiff line change
@@ -48,15 +48,6 @@ namespace {
4848
static constexpr int32_t systolicDepth{8};
4949
static constexpr int32_t executionSize{16};
5050

51-
// Offsets to individual fields of the 8xi32 layout nd tensor descriptor.
52-
enum class NdTdescOffset : uint32_t {
53-
BasePtr = 0, // Base pointer (i64)
54-
BaseShapeW = 2, // Base shape width (i32)
55-
BaseShapeH = 3, // Base shape height (i32)
56-
TensorOffsetW = 4, // Tensor offset W (i32)
57-
TensorOffsetH = 5 // Tensor offset H (i32)
58-
};
59-
6051
static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) {
6152
switch (xeGpuMemspace) {
6253
case xegpu::MemorySpace::Global:
@@ -177,92 +168,14 @@ class CreateNdDescToXeVMPattern
177168
if (mixedOffsets.size() != 0)
178169
return rewriter.notifyMatchFailure(op, "Offsets not supported.");
179170
auto loc = op.getLoc();
180-
auto source = op.getSource();
181-
// Op is lowered to a code sequence that populates payload.
182-
// Payload is a 8xi32 vector. Offset to individual fields are defined in
183-
// NdTdescOffset enum.
184-
Type payloadElemTy = rewriter.getI32Type();
185-
VectorType payloadTy = VectorType::get(8, payloadElemTy);
186-
Type i64Ty = rewriter.getI64Type();
187-
// 4xi64 view is used for inserting the base pointer.
188-
VectorType payloadI64Ty = VectorType::get(4, i64Ty);
189-
// Initialize payload to zero.
190-
Value payload = arith::ConstantOp::create(
191-
rewriter, loc,
192-
DenseElementsAttr::get(payloadTy, IntegerAttr::get(payloadElemTy, 0)));
193-
194-
Value baseAddr;
195-
Value baseShapeW;
196-
Value baseShapeH;
197-
Value offsetW;
198-
Value offsetH;
199171

200-
// Source can be a memref or a pointer (ui64, ui32, i64 or i32).
201-
SmallVector<OpFoldResult> mixedSizes = op.getMixedSizes();
202-
auto srcRank = mixedSizes.size();
203-
if (srcRank < 2)
204-
return rewriter.notifyMatchFailure(op, "Expected at least 2D source.");
205-
206-
auto sourceTy = source.getType();
207-
auto sourceMemrefTy = dyn_cast<MemRefType>(sourceTy);
208-
// If source is a memref, we need to extract the aligned pointer as index.
209-
// Pointer type is passed as i32 or i64 by type converter.
210-
if (sourceMemrefTy) {
211-
if (!sourceMemrefTy.hasStaticShape()) {
212-
return rewriter.notifyMatchFailure(op, "Expected static memref shape.");
213-
}
214-
baseAddr =
215-
memref::ExtractAlignedPointerAsIndexOp::create(rewriter, loc, source);
216-
} else {
217-
baseAddr = adaptor.getSource();
218-
}
219-
// Utility for creating offset values from op fold result.
220-
auto createOffset = [&](OpFoldResult ofr) -> Value {
221-
Value val = getValueOrCreateConstantIntOp(rewriter, loc, ofr);
222-
val = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy, val);
223-
return val;
224-
};
225-
// Offsets are not supported (0 is used).
226-
offsetW = arith::ConstantIntOp::create(rewriter, loc, payloadElemTy, 0);
227-
offsetH = arith::ConstantIntOp::create(rewriter, loc, payloadElemTy, 0);
228-
// Get shape values from op fold results.
229-
baseShapeW = createOffset(mixedSizes[srcRank - 1]);
230-
if (srcRank == 2) {
231-
baseShapeH = createOffset(mixedSizes[0]);
232-
} else {
233-
// Generate compute chain for height (product of sizes of all but the last
234-
// dimension).
235-
baseShapeH = getProductOfSizes(rewriter, loc, mixedSizes, 0, srcRank - 1);
236-
baseShapeH = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy,
237-
baseShapeH);
238-
}
239-
if (sourceMemrefTy) {
240-
// Cast index to i64.
241-
baseAddr = arith::IndexCastUIOp::create(rewriter, loc, i64Ty, baseAddr);
242-
} else if (baseAddr.getType() != i64Ty) {
172+
Value baseAddr = adaptor.getSource();
173+
Type i64Ty = rewriter.getI64Type();
174+
if (baseAddr.getType() != i64Ty) {
243175
// Pointer type may be i32. Cast to i64 if needed.
244176
baseAddr = arith::ExtUIOp::create(rewriter, loc, i64Ty, baseAddr);
245177
}
246-
// Populate payload.
247-
Value payLoadAsI64 =
248-
vector::BitCastOp::create(rewriter, loc, payloadI64Ty, payload);
249-
payLoadAsI64 =
250-
vector::InsertOp::create(rewriter, loc, baseAddr, payLoadAsI64,
251-
static_cast<int>(NdTdescOffset::BasePtr));
252-
payload = vector::BitCastOp::create(rewriter, loc, payloadTy, payLoadAsI64);
253-
payload =
254-
vector::InsertOp::create(rewriter, loc, baseShapeW, payload,
255-
static_cast<int>(NdTdescOffset::BaseShapeW));
256-
payload =
257-
vector::InsertOp::create(rewriter, loc, baseShapeH, payload,
258-
static_cast<int>(NdTdescOffset::BaseShapeH));
259-
payload = vector::InsertOp::create(
260-
rewriter, loc, offsetW, payload,
261-
static_cast<int>(NdTdescOffset::TensorOffsetW));
262-
payload = vector::InsertOp::create(
263-
rewriter, loc, offsetH, payload,
264-
static_cast<int>(NdTdescOffset::TensorOffsetH));
265-
rewriter.replaceOp(op, payload);
178+
rewriter.replaceOp(op, baseAddr);
266179
return success();
267180
}
268181
};
@@ -291,7 +204,6 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
291204
auto loc = op.getLoc();
292205
auto ctxt = rewriter.getContext();
293206

294-
auto tdesc = adaptor.getTensorDesc();
295207
auto tdescTy = op.getTensorDescType();
296208
if (tdescTy.getRank() != 2)
297209
return rewriter.notifyMatchFailure(op, "Expected 2D tensor descriptor.");
@@ -301,15 +213,27 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
301213
return rewriter.notifyMatchFailure(
302214
op, "Expected element type bit width to be multiple of 8.");
303215

304-
VectorType payloadI64Ty = VectorType::get(4, rewriter.getI64Type());
305-
Value payLoadAsI64 =
306-
vector::BitCastOp::create(rewriter, loc, payloadI64Ty, tdesc);
307-
Value basePtr = vector::ExtractOp::create(
308-
rewriter, loc, payLoadAsI64, static_cast<int>(NdTdescOffset::BasePtr));
309-
Value baseShapeW = vector::ExtractOp::create(
310-
rewriter, loc, tdesc, static_cast<int>(NdTdescOffset::BaseShapeW));
311-
Value baseShapeH = vector::ExtractOp::create(
312-
rewriter, loc, tdesc, static_cast<int>(NdTdescOffset::BaseShapeH));
216+
Value basePtr = adaptor.getTensorDesc();
217+
// Utility for creating offset values from op fold result.
218+
Type payloadElemTy = rewriter.getIntegerType(32);
219+
auto createOffset = [&](OpFoldResult ofr) -> Value {
220+
Value val = getValueOrCreateConstantIntOp(rewriter, loc, ofr);
221+
val = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy, val);
222+
return val;
223+
};
224+
auto srcRank = mixedSizes.size();
225+
// Get shape values from op fold results.
226+
Value baseShapeW = createOffset(mixedSizes[srcRank - 1]);
227+
Value baseShapeH;
228+
if (srcRank == 2) {
229+
baseShapeH = createOffset(mixedSizes[0]);
230+
} else {
231+
// Generate compute chain for height (product of sizes of all but the last
232+
// dimension).
233+
baseShapeH = getProductOfSizes(rewriter, loc, mixedSizes, 0, srcRank - 1);
234+
baseShapeH = getValueOrCreateCastToIndexLike(rewriter, loc, payloadElemTy,
235+
baseShapeH);
236+
}
313237
// Offsets are provided by the op.
314238
// convert them to i32.
315239
// Offset computation assumes base memory layout is row major.
@@ -979,10 +903,7 @@ struct ConvertXeGPUToXeVMPass
979903
return VectorType::get(sum, elemType);
980904
});
981905
typeConverter.addConversion([&](xegpu::TensorDescType type) -> Type {
982-
if (type.isScattered())
983-
return IntegerType::get(&getContext(), 64);
984-
auto i32Type = IntegerType::get(&getContext(), 32);
985-
return VectorType::get(8, i32Type);
906+
return IntegerType::get(&getContext(), 64);
986907
});
987908
// Convert MemDescType into flattened MemRefType for SLM
988909
typeConverter.addConversion([&](xegpu::MemDescType type) -> Type {

mlir/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
1+
// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
22

33
gpu.module @load_store_check {
44
gpu.func @load_store(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {

mlir/test/Conversion/XeGPUToXeVM/loadstore_nd_high_base_rank.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
1+
// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
22

33
gpu.module @load_store_check {
44
// CHECK: fail
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// RUN: mlir-opt -convert-xegpu-to-xevm -canonicalize %s | FileCheck %s
2+
3+
gpu.module @load_store_check {
4+
gpu.func @load_store(%src: ui64, %dst: ui32) kernel {
5+
// CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
6+
// CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
7+
// CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
8+
// CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32>
9+
// CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32>
10+
// CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32>
11+
// CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32>
12+
// CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32>
13+
%c8 = arith.constant 8 : index
14+
%c16 = arith.constant 16 : index
15+
%c1 = arith.constant 1 : index
16+
%src_tdesc = xegpu.create_nd_tdesc %src, shape:[%c8, %c16], strides:[%c16, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32>
17+
18+
19+
//CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64>
20+
//CHECK: %[[LD_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64>
21+
//CHECK: %[[LD_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32>
22+
//CHECK: %[[LD_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32>
23+
//CHECK: %[[LD_TILE_W64:.*]] = arith.constant 0 : i64
24+
//CHECK: %[[LD_TILE_W:.*]] = arith.trunci %[[LD_TILE_W64]] : i64 to i32
25+
//CHECK: %[[LD_TILE_H64:.*]] = arith.constant 0 : i64
26+
//CHECK: %[[LD_TILE_H:.*]] = arith.trunci %[[LD_TILE_H64]] : i64 to i32
27+
//CHECK: %[[LD_LLVMPTR:.*]] = llvm.inttoptr %[[LD_INTPTR]] : i64 to !llvm.ptr<1>
28+
//CHECK: %[[LD_SIZEOF_F32:.*]] = arith.constant 4 : i32
29+
//CHECK: %[[LD_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[LD_BASE_W]], %[[LD_SIZEOF_F32]] : i32
30+
//CHECK: %[[LD_LOADED_I32:.*]] = xevm.blockload2d %[[LD_LLVMPTR]], %[[LD_BASE_ROW_IN_BYTES]],
31+
//CHECK-SAME: %[[LD_BASE_H]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_TILE_W]], %[[LD_TILE_H]]
32+
//CHECK-SAME: <{cache_control = #xevm.load_cache_control<L1c_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
33+
//CHECK-SAME: pack_register = false, tile_height = 8 : i32, tile_width = 16 : i32, transpose = false,
34+
//CHECK-SAME: v_blocks = 1 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
35+
%loaded = xegpu.load_nd %src_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
36+
: !xegpu.tensor_desc<8x16xf32> -> vector<8xf32>
37+
//CHECK: %[[LD_LOADED_F32:.*]] = vector.bitcast %[[LD_LOADED_I32]] : vector<8xi32> to vector<8xf32>
38+
39+
%tid_x = gpu.thread_id x
40+
%tid_x_i32 = arith.index_cast %tid_x : index to i32
41+
%tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32
42+
//CHECK: %[[LOADED_F32_MODIFIED:.*]] = vector.insert %{{.*}}, %[[LD_LOADED_F32]] [0] : f32 into vector<8xf32>
43+
%loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32>
44+
45+
// CHECK: %[[PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64
46+
// CHECK: %[[CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64>
47+
// CHECK: %[[DESC_0:.*]] = vector.insert %[[PTR_AS_I64]], %[[CREATE_DESC_I64]] [0] : i64 into vector<4xi64>
48+
// CHECK: %[[DESC_1:.*]] = vector.bitcast %[[DESC_0]] : vector<4xi64> to vector<8xi32>
49+
// CHECK: %[[DESC_2:.*]] = vector.insert {{.*}}, %[[DESC_1]] [2] : i32 into vector<8xi32>
50+
// CHECK: %[[DESC_3:.*]] = vector.insert {{.*}}, %[[DESC_2]] [3] : i32 into vector<8xi32>
51+
// CHECK: %[[DESC_4:.*]] = vector.insert {{.*}}, %[[DESC_3]] [4] : i32 into vector<8xi32>
52+
// CHECK: %[[DESC:.*]] = vector.insert {{.*}}, %[[DESC_4]] [5] : i32 into vector<8xi32>
53+
%dst_tdesc = xegpu.create_nd_tdesc %dst, shape:[%c8, %c16], strides:[%c16, %c1] : ui32 -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
54+
55+
//CHECK: %[[DESC_I64:.*]] = vector.bitcast %[[DESC]] : vector<8xi32> to vector<4xi64>
56+
//CHECK: %[[INTPTR:.*]] = vector.extract %[[DESC_I64]][0] : i64 from vector<4xi64>
57+
//CHECK: %[[BASE_W:.*]] = vector.extract %[[DESC]][2] : i32 from vector<8xi32>
58+
//CHECK: %[[BASE_H:.*]] = vector.extract %[[DESC]][3] : i32 from vector<8xi32>
59+
//CHECK: %[[TILE_W64:.*]] = arith.constant 0 : i64
60+
//CHECK: %[[TILE_W:.*]] = arith.trunci %[[TILE_W64]] : i64 to i32
61+
//CHECK: %[[TILE_H64:.*]] = arith.constant 0 : i64
62+
//CHECK: %[[TILE_H:.*]] = arith.trunci %[[TILE_H64]] : i64 to i32
63+
//CHECK: %[[LLVMPTR:.*]] = llvm.inttoptr %[[INTPTR]] : i64 to !llvm.ptr<1>
64+
//CHECK: %[[SIZEOF_F32:.*]] = arith.constant 4 : i32
65+
//CHECK: %[[BASE_ROW_IN_BYTES:.*]] = arith.muli %[[BASE_W]], %[[SIZEOF_F32]] : i32
66+
//CHECK: %[[FLAT_VALUE_I32:.*]] = vector.bitcast %[[LOADED_F32_MODIFIED]] : vector<8xf32> to vector<8xi32>
67+
//CHECK: xevm.blockstore2d %[[LLVMPTR]], %[[BASE_ROW_IN_BYTES]], %[[BASE_H]], %[[BASE_ROW_IN_BYTES]],
68+
//CHECK-SAME: %[[TILE_W]], %[[TILE_H]], %[[FLAT_VALUE_I32]]
69+
//CHECK-SAME: <{cache_control = #xevm.store_cache_control<L1wb_L2uc_L3uc>, elem_size_in_bits = 32 : i32,
70+
//CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
71+
xegpu.store_nd %loaded_modified, %dst_tdesc[0, 0] <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<uncached>}>
72+
: vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<memory_space = global>>
73+
gpu.return
74+
}
75+
}

mlir/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
// RUN: mlir-opt -convert-xegpu-to-xevm -split-input-file %s | FileCheck %s
1+
// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
22

3-
gpu.module @fence_check {
4-
gpu.func @fence(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
3+
gpu.module @prefetch_nd_check {
4+
gpu.func @prefetch_nd(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel {
55
%srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32>
66
%dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32>
77

0 commit comments

Comments
 (0)