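Address more review comments: rename addOffset/IsValidStoreMatrixParams, use i32 address arithmetic and flatten the matrix load/store lowering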

Jianhui-Li · Jianhui-Li · commit b1857a275d7e · 2025-10-14T01:05:09.000Z
diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp
@@ -365,10 +365,11 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
 
 // Add a builder that creates
 // offset * elemByteSize + baseAddr
-static Value addOffset(ConversionPatternRewriter &rewriter, Location loc,
-                       Value baseAddr, Value offset, int64_t elemByteSize) {
+static Value addOffsetToBaseAddr(ConversionPatternRewriter &rewriter,
+                                 Location loc, Value baseAddr, Value offset,
+                                 int64_t elemByteSize) {
   Value byteSize = arith::ConstantIntOp::create(
-      rewriter, loc, rewriter.getI64Type(), elemByteSize);
+      rewriter, loc, baseAddr.getType(), elemByteSize);
   Value byteOffset = arith::MulIOp::create(rewriter, loc, offset, byteSize);
   Value newAddr = arith::AddIOp::create(rewriter, loc, baseAddr, byteOffset);
   return newAddr;
@@ -443,7 +444,8 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
       // If offset is provided, we add them to the base pointer.
       // Offset is in number of elements, we need to multiply by
       // element byte size.
-      basePtrI64 = addOffset(rewriter, loc, basePtrI64, offset, elemByteSize);
+      basePtrI64 =
+          addOffsetToBaseAddr(rewriter, loc, basePtrI64, offset, elemByteSize);
     }
     // Convert base pointer (i64) to LLVM pointer type.
     Value basePtrLLVM =
@@ -516,7 +518,7 @@ class CreateMemDescOpPattern final
   LogicalResult
   matchAndRewrite(xegpu::CreateMemDescOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    TypedValue<MemRefType> src = op.getSource();
+
     auto resTy = cast<xegpu::MemDescType>(op.getResult().getType());
 
     // Create the result MemRefType with the same shape, element type, and
@@ -525,7 +527,7 @@ class CreateMemDescOpPattern final
 
     Value zero = arith::ConstantIndexOp::create(rewriter, op.getLoc(), 0);
     auto viewOp = memref::ViewOp::create(rewriter, op.getLoc(), newResTy,
-                                         Value(src), zero, ValueRange());
+                                         op.getSource(), zero, ValueRange());
     rewriter.replaceOp(op, viewOp);
     return success();
   }
@@ -587,88 +589,74 @@ class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> {
     Value basePtrLLVM = memref::ExtractAlignedPointerAsIndexOp::create(
         rewriter, loc, basePtrStruct);
 
-    // Convert base pointer (ptr) to i64
-    Value basePtrI64 = arith::IndexCastUIOp::create(
-        rewriter, loc, rewriter.getI64Type(), basePtrLLVM);
+    // Convert base pointer (index) to i32
+    Value basePtrI32 = arith::IndexCastUIOp::create(
+        rewriter, loc, rewriter.getI32Type(), basePtrLLVM);
 
     Value linearOffset = mdescTy.getLinearOffsets(rewriter, loc, offsets);
     linearOffset = arith::IndexCastUIOp::create(
-        rewriter, loc, rewriter.getI64Type(), linearOffset);
-    basePtrI64 =
-        addOffset(rewriter, loc, basePtrI64, linearOffset, elemByteSize);
+        rewriter, loc, rewriter.getI32Type(), linearOffset);
+    basePtrI32 = addOffsetToBaseAddr(rewriter, loc, basePtrI32, linearOffset,
+                                     elemByteSize);
 
-    // convert base pointer (i64) to LLVM pointer type
+    // convert base pointer (i32) to LLVM pointer type
     basePtrLLVM =
-        LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI64);
+        LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI32);
 
-    // if the size of valOrResVecTy is 1, it lowers to a scalar load/store
-    // operation. LLVM load/store does not support vector of size 1, so we need
-    // to handle this case separately.
-    if (valOrResVecTy.getNumElements() == 1) {
-      Type scalarTy = valOrResVecTy.getElementType();
-      if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
-        Value loadOp =
-            LLVM::LoadOp::create(rewriter, loc, scalarTy, basePtrLLVM);
-        rewriter.replaceOp(op, loadOp);
-      } else {
-        LLVM::StoreOp::create(rewriter, loc, adaptor.getData(), basePtrLLVM);
-        rewriter.eraseOp(op);
-      }
-      return success();
-    } else {
+    if (op.getSubgroupBlockIoAttr()) {
       // if the attribute 'subgroup_block_io' is set to true, it lowers to
       // xevm.blockload
-      auto subgroupBlockIoAttr = op.getSubgroupBlockIoAttr();
-      bool subgroup_block_io = static_cast<bool>(subgroupBlockIoAttr);
-
-      // BlockLoadOp only supports integer types, so we need to bitcast
-      // Get integer type with matching bit width
-      Type elemTy = valOrResVecTy.getElementType();
-      int64_t bitWidth = elemTy.getIntOrFloatBitWidth();
-      Type intElemTy = rewriter.getIntegerType(bitWidth);
+
+      Type intElemTy = rewriter.getIntegerType(elemBitWidth);
       VectorType intVecTy =
           VectorType::get(valOrResVecTy.getShape(), intElemTy);
 
-      if (subgroup_block_io) {
-        if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
-          Value loadOp =
-              xevm::BlockLoadOp::create(rewriter, loc, intVecTy, basePtrLLVM);
-          if (intVecTy != valOrResVecTy) {
-            loadOp =
-                vector::BitCastOp::create(rewriter, loc, valOrResVecTy, loadOp);
-          }
-          rewriter.replaceOp(op, loadOp);
-        } else {
-          Value dataToStore = adaptor.getData();
-          if (valOrResVecTy != intVecTy) {
-            dataToStore =
-                vector::BitCastOp::create(rewriter, loc, intVecTy, dataToStore);
-          }
-          xevm::BlockStoreOp::create(rewriter, loc, basePtrLLVM, dataToStore,
-                                     nullptr);
-          rewriter.eraseOp(op);
+      if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
+        Value loadOp =
+            xevm::BlockLoadOp::create(rewriter, loc, intVecTy, basePtrLLVM);
+        if (intVecTy != valOrResVecTy) {
+          loadOp =
+              vector::BitCastOp::create(rewriter, loc, valOrResVecTy, loadOp);
         }
+        rewriter.replaceOp(op, loadOp);
       } else {
-        // if the result is 1D vector, if the vector direction is Column, then
-        // the
-        //  memory descriptor should be treated as column major
-        auto chipOpt = xegpu::getChipStr(op);
-        if (!chipOpt || (*chipOpt != "pvc" && *chipOpt != "bmg")) {
-          // the lowering only works for pvc and bmg
-          return rewriter.notifyMatchFailure(
-              op, "The lowering is specific to pvc or bmg.");
+        Value dataToStore = adaptor.getData();
+        if (valOrResVecTy != intVecTy) {
+          dataToStore =
+              vector::BitCastOp::create(rewriter, loc, intVecTy, dataToStore);
         }
+        xevm::BlockStoreOp::create(rewriter, loc, basePtrLLVM, dataToStore,
+                                   nullptr);
+        rewriter.eraseOp(op);
+      }
+      return success();
+    }
 
-        if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
-          Value loadOp =
-              LLVM::LoadOp::create(rewriter, loc, valOrResVecTy, basePtrLLVM);
-          rewriter.replaceOp(op, loadOp);
-        } else {
-          LLVM::StoreOp::create(rewriter, loc, adaptor.getData(), basePtrLLVM);
-          rewriter.eraseOp(op);
-        }
+    if (valOrResVecTy.getNumElements() >= 1) {
+      auto chipOpt = xegpu::getChipStr(op);
+      if (!chipOpt || (*chipOpt != "pvc" && *chipOpt != "bmg")) {
+        // the lowering for chunk load/store only works for pvc and bmg
+        return rewriter.notifyMatchFailure(
+            op, "The lowering is specific to pvc or bmg.");
       }
     }
+
+    if constexpr (std::is_same_v<OpType, xegpu::LoadMatrixOp>) {
+      // if the size of valOrResVecTy is 1, it lowers to a scalar load/store
+      // operation. LLVM load/store does not support vector of size 1, so we
+      // need to handle this case separately.
+      auto scalarTy = valOrResVecTy.getElementType();
+      LLVM::LoadOp loadOp;
+      if (valOrResVecTy.getNumElements() == 1)
+        loadOp = LLVM::LoadOp::create(rewriter, loc, scalarTy, basePtrLLVM);
+      else
+        loadOp =
+            LLVM::LoadOp::create(rewriter, loc, valOrResVecTy, basePtrLLVM);
+      rewriter.replaceOp(op, loadOp);
+    } else {
+      LLVM::StoreOp::create(rewriter, loc, adaptor.getData(), basePtrLLVM);
+      rewriter.eraseOp(op);
+    }
     return success();
   }
 };
@@ -715,8 +703,8 @@ class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> {
                 op, "Expected element type bit width to be multiple of 8.");
           elemByteSize = elemBitWidth / 8;
         }
-        basePtrI64 =
-            addOffset(rewriter, loc, basePtrI64, offsets, elemByteSize);
+        basePtrI64 = addOffsetToBaseAddr(rewriter, loc, basePtrI64, offsets,
+                                         elemByteSize);
       }
     }
     // Default memory space is global.
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -174,9 +174,9 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy,
 }
 
 LogicalResult
-IsValidStoreMatrixParams(VectorType dataTy, MemDescType mdescTy,
-                         UnitAttr subgroup_block_io,
-                         function_ref<InFlightDiagnostic()> emitError) {
+IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy,
+                      UnitAttr subgroup_block_io,
+                      function_ref<InFlightDiagnostic()> emitError) {
 
   if (!dataTy) {
     if (subgroup_block_io)
@@ -1107,8 +1107,8 @@ LogicalResult LoadMatrixOp::verify() {
   UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
   MemDescType mdescTy = getMemDesc().getType();
 
-  return IsValidStoreMatrixParams(resTy, mdescTy, subgroup_block_io,
-                                  [&]() { return emitError(); });
+  return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io,
+                               [&]() { return emitError(); });
 }
 
 //===----------------------------------------------------------------------===//
@@ -1131,8 +1131,8 @@ LogicalResult StoreMatrixOp::verify() {
   auto dataTy = dyn_cast<VectorType>(getData().getType());
   UnitAttr subgroup_block_io = getSubgroupBlockIoAttr();
   MemDescType mdescTy = getMemDesc().getType();
-  return IsValidStoreMatrixParams(dataTy, mdescTy, subgroup_block_io,
-                                  [&]() { return emitError(); });
+  return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io,
+                               [&]() { return emitError(); });
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir b/mlir/test/Conversion/XeGPUToXeVM/loadstore_matrix.mlir
@@ -11,8 +11,8 @@ gpu.module @test_kernel [#xevm.target<chip = "pvc">] {
     //CHECK: %[[TID:.*]] = gpu.thread_id x
     //CHECK: %[[C1:.*]] = arith.constant 1 : index
     //CHECK: %[[MUL1:.*]] = arith.muli %[[TID]], %[[C1]] : index
-    //CHECK: %[[C4:.*]] = arith.constant 4 : i64
-    //CHECK: %[[MUL2:.*]] = arith.muli {{.*}}, %[[C4]] : i64
+    //CHECK: %[[C4:.*]] = arith.constant 4 : i32
+    //CHECK: %[[MUL2:.*]] = arith.muli {{.*}}, %[[C4]] : i32
     //CHECK: llvm.load {{.*}} : !llvm.ptr<3> -> f32
 
     %tid_x = gpu.thread_id x
@@ -80,7 +80,7 @@ gpu.module @test_kernel [#xevm.target<chip = "pvc">] {
     %c19 = arith.constant 19: index
     
     //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[view]] : memref<2048xf16, 3> -> index
-    //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i64
+    //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i32
     //CHECK: %[[c16:.*]] = arith.constant 16 : index
     //CHECK: %[[offsetx_0:.*]] = arith.divsi %[[c19]], %[[c16]] : index
     //CHECK: %[[offsetx_1:.*]] = arith.remsi %[[c19]], %[[c16]] : index
@@ -164,7 +164,7 @@ gpu.module @test_kernel [#xevm.target<chip = "pvc">] {
     %c48 = arith.constant 48 : index
 
     //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[view]] : memref<2048xf16, 3> -> index
-    //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i64
+    //CHECK: %[[basePtrI64:.*]] = arith.index_castui %[[intptr]] : index to i32
     //CHECK: %[[offset0:.*]] = arith.divsi %[[c16]], %[[c16]] : index
     //CHECK: %[[offset1:.*]] = arith.remsi %[[c16]], %[[c16]] : index
     //CHECK: %[[offset2:.*]] = arith.divsi %[[c48]], %[[c16]] : index
@@ -180,11 +180,11 @@ gpu.module @test_kernel [#xevm.target<chip = "pvc">] {
     //CHECK: %[[c1:.*]] = arith.constant 1 : index
     //CHECK: %[[mul3:.*]] = arith.muli %[[offset3]], %[[c1]] : index
     //CHECK: %[[linearOffset:.*]] = arith.addi %[[mul3]], %[[add2]] : index
-    //CHECK: %[[linearOffsetI64:.*]] = arith.index_castui %[[linearOffset]] : index to i64
-    //CHECK: %[[c2:.*]] = arith.constant 2 : i64
-    //CHECK: %[[byteOffset:.*]] = arith.muli %[[linearOffsetI64]], %[[c2]] : i64
-    //CHECK: %[[finalPtr:.*]] = arith.addi %[[basePtrI64]], %[[byteOffset]] : i64
-    //CHECK: %[[ptr:.*]] = llvm.inttoptr %[[finalPtr]] : i64 to !llvm.ptr<3>
+    //CHECK: %[[linearOffsetI64:.*]] = arith.index_castui %[[linearOffset]] : index to i32
+    //CHECK: %[[c2:.*]] = arith.constant 2 : i32
+    //CHECK: %[[byteOffset:.*]] = arith.muli %[[linearOffsetI64]], %[[c2]] : i32
+    //CHECK: %[[finalPtr:.*]] = arith.addi %[[basePtrI64]], %[[byteOffset]] : i32
+    //CHECK: %[[ptr:.*]] = llvm.inttoptr %[[finalPtr]] : i32 to !llvm.ptr<3>
     //CHECK: %[[loadedI16:.*]] = xevm.blockload %[[ptr]] : (!llvm.ptr<3>) -> vector<8xi16>
     //CHECK: %[[loaded:.*]] = vector.bitcast %[[loadedI16]] : vector<8xi16> to vector<8xf16>