Commit e240e47

Temp save.
1 parent 687e831 commit e240e47

5 files changed: +150 additions, −41 deletions

mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp

Lines changed: 94 additions & 34 deletions
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Conversion/XeGPUToXeVM/XeGPUToXeVM.h"
+#include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/LLVMIR/XeVMDialect.h"
 
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
@@ -426,18 +427,6 @@ class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern<OpType> {
   }
 };
 
-template <
-    typename OpType,
-    typename = std::enable_if_t<llvm::is_one_of<
-        OpType, xegpu::LoadGatherOp, xegpu::StoreScatterOp, xegpu::CreateDescOp,
-        xegpu::UpdateOffsetOp, xegpu::PrefetchOp>::value>>
-int64_t getElemByteSize(OpType op) {
-  // Get the element byte size from the tensor descriptor.
-  auto elemBitWidth =
-      op.getTensorDesc().getType().getElementType().getIntOrFloatBitWidth();
-  return elemBitWidth / 8;
-}
-
 // Add a builder that creates
 // offset * elemByteSize + baseAddr
 auto addOffset = [](ConversionPatternRewriter &rewriter, Location loc,
@@ -456,23 +445,23 @@ class CreateDescToXeVMPattern
   LogicalResult
   matchAndRewrite(xegpu::CreateDescOp op, xegpu::CreateDescOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    auto eTy = op.getTensorDescType().getElementType();
+    if (eTy.getIntOrFloatBitWidth() % 8 != 0) {
+      return rewriter.notifyMatchFailure(op,
+          "Expected element type bit width to be multiple of 8.");
+    }
     auto loc = op.getLoc();
+    // offsets are provided as scalar i64 by type converter.
     auto offsets = adaptor.getOffsets();
-    // Source type can be a 1D memref or ui64
-    // Using "op" instead of "adaptor" since we want to access memref type
-    // instead of LLVM struct type.
-    auto memrefTy = dyn_cast<MemRefType>(op.getSource().getType());
-    Value subGroupAddr;
-    if (memrefTy) {
-      subGroupAddr = memref::ExtractAlignedPointerAsIndexOp::create(
-          rewriter, loc, op.getSource());
-      subGroupAddr = arith::IndexCastUIOp::create(
-          rewriter, loc, rewriter.getI64Type(), subGroupAddr);
-    } else {
-      subGroupAddr = adaptor.getSource();
-    }
+    // Source type can be a 1D memref or pointer type (ui64, ui32, i64 or i32).
+    // But type converter will convert them to integer types.
+    Value addr = adaptor.getSource();
+    // ui32 or i32 are passed as i32 so they need to be casted to i64.
+    if (addr.getType() != rewriter.getI64Type())
+      addr = arith::IndexCastUIOp::create(
+          rewriter, loc, rewriter.getI64Type(), addr);
     auto laneAddr =
-        addOffset(rewriter, loc, subGroupAddr, offsets, getElemByteSize(op));
+        addOffset(rewriter, loc, addr, offsets, getElemByteSize(op));
     rewriter.replaceOp(op, laneAddr);
     return success();
   }
@@ -485,11 +474,18 @@ class UpdateOffsetToXeVMPattern
   matchAndRewrite(xegpu::UpdateOffsetOp op,
                   xegpu::UpdateOffsetOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    auto eTy = op.getTensorDescType().getElementType();
+    if (eTy.getIntOrFloatBitWidth() % 8 != 0) {
+      return rewriter.notifyMatchFailure(op,
+          "Expected element type bit width to be multiple of 8.");
+    }
     auto loc = op.getLoc();
-    Value newOffsetForLane =
+    // scatter descriptor is provided as scalar i64 by type converter.
+    // offsets are provided as scalar i64 by type converter.
+    Value newOffset =
         addOffset(rewriter, loc, adaptor.getTensorDesc(), adaptor.getOffsets(),
                   getElemByteSize(op));
-    rewriter.replaceOp(op, newOffsetForLane);
+    rewriter.replaceOp(op, newOffset);
     return success();
   }
 };
@@ -505,19 +501,38 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
     auto loc = op.getLoc();
     auto ctxt = rewriter.getContext();
     auto tdescTy = op.getTensorDescType();
-    auto ptrTypeLLVM = LLVM::LLVMPointerType::get(
-        ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace()));
+    LLVM::LLVMPointerType ptrTypeLLVM = LLVM::LLVMPointerType::get(
+        ctxt, getNumericXeVMAddrSpace(xegpu::MemorySpace::Global));
+    if (tdescTy)
+      ptrTypeLLVM = LLVM::LLVMPointerType::get(
+          ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace()));
     Value basePtrI64;
     if constexpr (std::is_same_v<OpType, xegpu::LoadGatherOp>) {
       basePtrI64 = adaptor.getSource();
+      if (auto memRefTy = dyn_cast<MemRefType>(op.getSource().getType())) {
+        auto addrSpace = memRefTy.getMemorySpaceAsInt();
+        if (addrSpace != 0)
+          ptrTypeLLVM = LLVM::LLVMPointerType::get(ctxt, addrSpace);
+      }
     } else {
       basePtrI64 = adaptor.getDest();
+      if (auto memRefTy = dyn_cast<MemRefType>(op.getDest().getType())) {
+        auto addrSpace = memRefTy.getMemorySpaceAsInt();
+        if (addrSpace != 0)
+          ptrTypeLLVM = LLVM::LLVMPointerType::get(ctxt, addrSpace);
+      }
     }
+    if (basePtrI64.getType() != rewriter.getI64Type()) {
+      basePtrI64 = arith::IndexCastUIOp::create(rewriter, loc, rewriter.getI64Type(),
+                                                basePtrI64);
+    }
+    basePtrI64.dump();
     Value offsets = adaptor.getOffsets();
+    offsets.dump();
     Value mask = adaptor.getMask();
+    mask.dump();
     if (offsets) {
-      VectorType offsetsVecTy = dyn_cast<VectorType>(offsets.getType());
-      if (offsetsVecTy) {
+      if (dyn_cast<VectorType>(offsets.getType())){
        // Offset needs be scalar.
        return rewriter.notifyMatchFailure(op,
            "Expected offsets to be a scalar.");
@@ -526,8 +541,10 @@ class LoadStoreToXeVMPattern : public OpConversionPattern<OpType> {
            addOffset(rewriter, loc, basePtrI64, offsets, getElemByteSize(op));
       }
     }
+    basePtrI64.dump();
     Value basePtrLLVM =
         LLVM::IntToPtrOp::create(rewriter, loc, ptrTypeLLVM, basePtrI64);
+    basePtrLLVM.dump();
     VectorType srcOrDstVecTy = op.getValueType();
     VectorType srcOrDstFlatVecTy = VectorType::get(
         srcOrDstVecTy.getNumElements(), srcOrDstVecTy.getElementType());
@@ -597,6 +614,10 @@ class PrefetchToXeVMPattern : public OpConversionPattern<xegpu::PrefetchOp> {
         ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace()));
     Value basePtrI64 = adaptor.getSource();
     Value offsets = adaptor.getOffsets();
+    if (basePtrI64.getType() != rewriter.getI64Type()) {
+      basePtrI64 = arith::IndexCastUIOp::create(rewriter, loc, rewriter.getI64Type(),
+                                                basePtrI64);
+    }
     if (offsets) {
       VectorType offsetsVecTy = dyn_cast<VectorType>(offsets.getType());
       if (offsetsVecTy) {
@@ -836,6 +857,26 @@ struct ConvertXeGPUToXeVMPass
       auto i32Type = IntegerType::get(&getContext(), 32);
       return VectorType::get(8, i32Type);
     });
+    typeConverter.addConversion([&](MemRefType type) -> Type {
+      // Convert MemRefType to i64 type.
+      return IntegerType::get(&getContext(), 64);
+    });
+
+    auto memrefMaterializationCast = [](OpBuilder &builder, Type type,
+                                        ValueRange inputs,
+                                        Location loc) -> Value {
+      if (inputs.size() != 1)
+        return {};
+      auto input = inputs.front();
+      if (auto memrefTy = dyn_cast<MemRefType>(input.getType())) {
+
+        Value addr = memref::ExtractAlignedPointerAsIndexOp::create(
+            builder, loc, input);
+        return arith::IndexCastUIOp::create(builder, loc, type,
+                                            addr).getResult();
+      }
+      return {};
+    };
 
     auto ui64MaterializationCast = [](OpBuilder &builder, Type type,
                                       ValueRange inputs,
@@ -847,7 +888,22 @@ struct ConvertXeGPUToXeVMPass
         Value cast =
             index::CastUOp::create(builder, loc, builder.getIndexType(), input)
                 .getResult();
-        return arith::IndexCastOp::create(builder, loc, type, cast).getResult();
+        return arith::IndexCastUIOp::create(builder, loc, type, cast).getResult();
+      }
+      return {};
+    };
+
+    auto ui32MaterializationCast = [](OpBuilder &builder, Type type,
+                                      ValueRange inputs,
+                                      Location loc) -> Value {
+      if (inputs.size() != 1)
+        return {};
+      auto input = inputs.front();
+      if (input.getType() == builder.getIntegerType(32, false)) {
+        Value cast =
+            index::CastUOp::create(builder, loc, builder.getIndexType(), input)
+                .getResult();
+        return arith::IndexCastUIOp::create(builder, loc, type, cast).getResult();
       }
       return {};
     };
@@ -864,15 +920,19 @@ struct ConvertXeGPUToXeVMPass
           Value cast =
               vector::ExtractOp::create(builder, loc, input, 0).getResult();
           if (vecTy.getElementType() == builder.getIndexType())
-            cast = arith::IndexCastOp::create(builder, loc, type, cast)
+            cast = arith::IndexCastUIOp::create(builder, loc, type, cast)
                        .getResult();
           return cast;
         }
       }
       return {};
     };
+    typeConverter.addSourceMaterialization(memrefMaterializationCast);
     typeConverter.addSourceMaterialization(ui64MaterializationCast);
+    typeConverter.addSourceMaterialization(ui32MaterializationCast);
     typeConverter.addSourceMaterialization(vector1DMaterializationCast);
+    typeConverter.addTargetMaterialization(memrefMaterializationCast);
+    typeConverter.addTargetMaterialization(ui32MaterializationCast);
     typeConverter.addTargetMaterialization(ui64MaterializationCast);
     typeConverter.addTargetMaterialization(vector1DMaterializationCast);
     ConversionTarget target(getContext());
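
Taken together, these changes route every scattered-descriptor source (1-D memref, ui64/ui32, i64/i32) through a single i64 address: the new MemRefType conversion plus the memref/ui32 materialization casts produce the base pointer, and the signed arith.index_cast uses are switched to the unsigned arith.index_castui. As a rough illustration only (not part of the commit; value names are made up), a memref-backed descriptor is expected to lower along these lines, mirroring the CHECK lines of update_offset.mlir below:

    // %src is the kernel's memref<128xf32> argument; %offset is a vector<1xindex>.
    %intptr = memref.extract_aligned_pointer_as_index %src : memref<128xf32> -> index
    %base   = arith.index_castui %intptr : index to i64
    %off    = vector.extract %offset[0] : index from vector<1xindex>
    %off64  = arith.index_castui %off : index to i64
    %c4     = arith.constant 4 : i64          // element byte size of f32
    %scaled = arith.muli %off64, %c4 : i64
    %addr   = arith.addi %base, %scaled : i64 // addOffset: base + offset * elemByteSize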

mlir/test/Conversion/XeGPUToXeVM/create_nd_tdesc.mlir

Lines changed: 2 additions & 2 deletions
@@ -6,8 +6,8 @@ gpu.module @create_nd_tdesc {
   // CHECK-SAME: %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index
   gpu.func @create_nd_tdesc(%src: memref<8x16xf32, 1>, %ptr: ui64, %shape1: index, %shape2: index,
       %stride1: index, %stride2: index) kernel {
-    // CHECK: %[[VAR0:.*]] = index.castu %[[ARG1]] : ui64 to index
-    // CHECK: %[[VAR1:.*]] = arith.index_cast %[[VAR0]] : index to i64
+    // CHECK: %[[VAR0:.*]] = index.castu %[[ARG1]] : ui64 to index
+    // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
     // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<8xi32>
     // CHECK: %[[C0_I32:.*]] = arith.constant 0 : i32
     // CHECK: %[[C0_I32_0:.*]] = arith.constant 0 : i32

mlir/test/Conversion/XeGPUToXeVM/loadstoreprefetch.mlir

Lines changed: 2 additions & 2 deletions
@@ -5,10 +5,10 @@ gpu.module @test {
   // CHECK-SAME: %[[ARG0:.*]]: ui64
   gpu.func @load_gather_ui64_src_constant_offset(%src: ui64) {
     // CHECK: %[[VAR0:.*]] = index.castu %[[ARG0]] : ui64 to index
-    // CHECK: %[[VAR1:.*]] = arith.index_cast %[[VAR0]] : index to i64
+    // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
     // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
     // CHECK: %[[VAR2:.*]] = vector.extract %[[CST]][0] : index from vector<1xindex>
-    // CHECK: %[[VAR3:.*]] = arith.index_cast %[[VAR2]] : index to i64
+    // CHECK: %[[VAR3:.*]] = arith.index_castui %[[VAR2]] : index to i64
     %0 = arith.constant dense<0> : vector<1xindex>
     // CHECK: %[[CST_0:.*]] = arith.constant dense<true> : vector<1xi1>
     // CHECK: %[[VAR4:.*]] = vector.extract %[[CST_0]][0] : i1 from vector<1xi1>

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+// RUN: mlir-opt -convert-xegpu-to-xevm %s | FileCheck %s
+
+gpu.module @materializecast {
+  // CHECK-LABEL: gpu.func @materialize_memref
+  // CHECK-SAME: %[[ARG0:.*]]: memref<128xf32>
+  gpu.func @materialize_memref(%src: memref<128xf32>) kernel {
+    // CHECK: XXX
+    %offset = arith.constant dense<0> : vector<1xindex>
+    %src_tdesc = xegpu.create_tdesc %src, %offset : memref<128xf32>, vector<1xindex>
+        -> !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr<>>
+    gpu.return
+  }
+  // CHECK-LABEL: gpu.func @materialize_ui64
+  // CHECK-SAME: %[[ARG0:.*]]: ui64
+  gpu.func @materialize_ui64(%src: ui64) kernel {
+    // CHECK: XXX
+    %offset = arith.constant dense<0> : vector<1xindex>
+    %src_tdesc = xegpu.create_tdesc %src, %offset : ui64, vector<1xindex>
+        -> !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr<>>
+    gpu.return
+  }
+  // CHECK-LABEL: gpu.func @materialize_ui32
+  // CHECK-SAME: %[[ARG0:.*]]: ui32
+  gpu.func @materialize_ui32(%src: ui32) kernel {
+    %offset = arith.constant dense<0> : vector<1xindex>
+    //%src_tdesc = xegpu.create_tdesc %src, %offset : ui32, vector<1xindex>
+    //    -> !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr<>>
+    gpu.return
+  }
+  // CHECK-LABEL: gpu.func @materialize_single_index_vector
+  // CHECK-SAME: %[[ARG0:.*]]: memref<128xf32>
+  gpu.func @materialize_single_index_vector(%src: memref<128xf32>) kernel {
+    // CHECK: XXX
+    %offset = arith.constant dense<0> : vector<1xindex>
+    %src_tdesc = xegpu.create_tdesc %src, %offset : memref<128xf32>, vector<1xindex>
+        -> !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr<>>
+    gpu.return
+  }
+  // CHECK-LABEL: gpu.func @materialize_single_elem_vector
+  // CHECK-SAME: %[[ARG0:.*]]: vector<1xi1>
+  gpu.func @materialize_single_elem_vector(%src: memref<128xf32>) kernel {
+    // CHECK: XXX
+    %mask = arith.constant dense<1>: vector<1xi1>
+    %offset = arith.constant dense<0> : vector<1xindex>
+    %0 = xegpu.load %src[%offset], %mask <{chunk_size=8, l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>
+        : memref<128xf32>, vector<1xindex>, vector<1xi1> -> vector<1x8xf32>
+    gpu.return
+  }
+}
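
The "// CHECK: XXX" lines in this new test are placeholders left by the temp save. For the ui64 case, the ui64MaterializationCast added above goes through the index dialect, so the filled-in checks would presumably match IR of roughly this shape (illustrative sketch only, mirroring the existing loadstoreprefetch.mlir checks; value names are made up):

    %idx  = index.castu %src : ui64 to index
    %base = arith.index_castui %idx : index to i64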

mlir/test/Conversion/XeGPUToXeVM/update_offset.mlir

Lines changed: 3 additions & 3 deletions
@@ -4,12 +4,12 @@ gpu.module @update_offset {
   // CHECK-LABEL: gpu.func @update_offset
   // CHECK-SAME: %[[ARG0:.*]]: memref<128xf32>
   gpu.func @update_offset(%src: memref<128xf32>) kernel {
+    // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<128xf32> -> index
+    // CHECK: %[[VAR2:.*]] = arith.index_castui %[[INTPTR]] : index to i64
     // CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex>
     %offset = arith.constant dense<0> : vector<1xindex>
     // CHECK: %[[VAR0:.*]] = vector.extract %[[CST]][0] : index from vector<1xindex>
-    // CHECK: %[[VAR1:.*]] = arith.index_cast %[[VAR0]] : index to i64
-    // CHECK: %[[INTPTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<128xf32> -> index
-    // CHECK: %[[VAR2:.*]] = arith.index_castui %[[INTPTR]] : index to i64
+    // CHECK: %[[VAR1:.*]] = arith.index_castui %[[VAR0]] : index to i64
     // CHECK: %[[C4_I64:.*]] = arith.constant 4 : i64
     // CHECK: %[[VAR3:.*]] = arith.muli %[[VAR1]], %[[C4_I64]] : i64
     // CHECK: %[[VAR4:.*]] = arith.addi %[[VAR2]], %[[VAR3]] : i64
