From 3cde3ac7ec7665708eed6bdd93b17a5f937c5ef4 Mon Sep 17 00:00:00 2001 From: Garra1980 Date: Thu, 4 Sep 2025 00:23:33 +0200 Subject: [PATCH 1/2] [LLVM Pulldown] Bump to rev cdf30f0bc2362f8ac0b0a946372d7114229c34fd --- build_tools/llvm_version.txt | 2 +- include/imex/Conversion/Passes.h | 1 - include/imex/Conversion/Passes.td | 17 - .../imex/Conversion/XeGPUToXeVM/XeGPUToXeVM.h | 30 - lib/Conversion/CMakeLists.txt | 3 +- lib/Conversion/XeGPUToXeVM/CMakeLists.txt | 24 - lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp | 759 ------------------ test/Conversion/XeGPUToXeVM/dpas.mlir | 15 - test/Conversion/XeGPUToXeVM/fence.mlir | 15 - test/Conversion/XeGPUToXeVM/loadstore_nd.mlir | 96 --- test/Conversion/XeGPUToXeVM/prefetch_nd.mlir | 36 - .../Dialect/XeGPU/SIMT/lit.local.cfg | 2 +- .../SIMT/loadstore_scatter_chunk_size_1.mlir | 11 +- ...atter_chunk_size_1_non_contig_offsets.mlir | 13 +- ...re_scatter_chunk_size_1_update_offset.mlir | 22 +- .../SIMT/loadstore_scatter_chunk_size_2.mlir | 13 +- 16 files changed, 31 insertions(+), 1028 deletions(-) delete mode 100644 include/imex/Conversion/XeGPUToXeVM/XeGPUToXeVM.h delete mode 100644 lib/Conversion/XeGPUToXeVM/CMakeLists.txt delete mode 100644 lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp delete mode 100644 test/Conversion/XeGPUToXeVM/dpas.mlir delete mode 100644 test/Conversion/XeGPUToXeVM/fence.mlir delete mode 100644 test/Conversion/XeGPUToXeVM/loadstore_nd.mlir delete mode 100644 test/Conversion/XeGPUToXeVM/prefetch_nd.mlir diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt index 55d3649ae..70268252b 100644 --- a/build_tools/llvm_version.txt +++ b/build_tools/llvm_version.txt @@ -1 +1 @@ -b44e47a68f9b49a6283b1beaab3af55fa39e8907 +cdf30f0bc2362f8ac0b0a946372d7114229c34fd \ No newline at end of file diff --git a/include/imex/Conversion/Passes.h b/include/imex/Conversion/Passes.h index 3598c7ab8..1efd47ab0 100644 --- a/include/imex/Conversion/Passes.h +++ b/include/imex/Conversion/Passes.h @@ -26,7 +26,6 @@ #include #include #include -#include #include namespace imex { diff --git a/include/imex/Conversion/Passes.td b/include/imex/Conversion/Passes.td index 32b1a06ff..e4c0cecca 100644 --- a/include/imex/Conversion/Passes.td +++ b/include/imex/Conversion/Passes.td @@ -466,21 +466,4 @@ def ConvertArithToVC : Pass<"convert-arith-to-vc", "::mlir::gpu::GPUModuleOp"> { let constructor = "imex::createConvertArithToVCPass()"; } -//===----------------------------------------------------------------------===// -// XeGPUToXeVM -//===----------------------------------------------------------------------===// - -def ConvertXeGPUToXeVMPass : Pass<"convert-xegpu-to-xevm"> { - let summary = "Convert XeGPU to XeVM dialect"; - let dependentDialects = [ - "::mlir::xegpu::XeGPUDialect", - "::mlir::xevm::XeVMDialect", - "::mlir::vector::VectorDialect", - "::mlir::memref::MemRefDialect", - "::mlir::arith::ArithDialect", - ]; -} - - - #endif // _IMEX_CONVERSION_PASSES_TD_INCLUDED_ diff --git a/include/imex/Conversion/XeGPUToXeVM/XeGPUToXeVM.h b/include/imex/Conversion/XeGPUToXeVM/XeGPUToXeVM.h deleted file mode 100644 index b76f4c9ae..000000000 --- a/include/imex/Conversion/XeGPUToXeVM/XeGPUToXeVM.h +++ /dev/null @@ -1,30 +0,0 @@ -//===-- XeGPUToXeVM.h - Convert XeVM to LLVM dialect -------------*- C++ -//-*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_CONVERSION_XEGPUTOXEVM_XEGPUTOXEVMPASS_H_ -#define MLIR_CONVERSION_XEGPUTOXEVM_XEGPUTOXEVMPASS_H_ - -#include - -namespace mlir { -class DialectRegistry; -class LLVMTypeConverter; -class RewritePatternSet; -class Pass; -} // namespace mlir - -namespace imex { -#define GEN_PASS_DECL_CONVERTXEGPUTOXEVMPASS -#include "imex/Conversion/Passes.h.inc" - -void populateXeGPUToXeVMConversionPatterns( - mlir::RewritePatternSet &patterns, mlir::LLVMTypeConverter &typeConverter); - -} // namespace imex - -#endif // MLIR_CONVERSION_XEGPUTOXEVM_XEGPUTOXEVMPASS_H_ diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 98dee4ddc..eee3f57ab 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -7,5 +7,4 @@ add_subdirectory(GPUToGPUX) add_subdirectory(GPUXToLLVM) add_subdirectory(MathToVC) add_subdirectory(XeTileToXeGPU) -add_subdirectory(XeGPUToVC) -add_subdirectory(XeGPUToXeVM) +add_subdirectory(XeGPUToVC) \ No newline at end of file diff --git a/lib/Conversion/XeGPUToXeVM/CMakeLists.txt b/lib/Conversion/XeGPUToXeVM/CMakeLists.txt deleted file mode 100644 index 43e232c45..000000000 --- a/lib/Conversion/XeGPUToXeVM/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -add_imex_conversion_library(MLIRXeGPUToXeVM - XeGPUToXeVM.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/imex/Conversion/XeGPUToXeVM - - DEPENDS - IMEXConversionPassIncGen - - LINK_COMPONENTS - Core - - LINK_LIBS PUBLIC - MLIRFuncDialect - MLIRGPUDialect - MLIRLLVMCommonConversion - MLIRLLVMDialect - MLIRXeVMDialect - MLIRVectorDialect - MLIRArithDialect - MLIRXeGPUDialect - MLIRPass - MLIRTransforms -) diff --git a/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp deleted file mode 100644 index 498c11da1..000000000 --- a/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ /dev/null @@ -1,759 +0,0 @@ -//===-- XeVMToLLVM.cpp - XeVM to LLVM dialect conversion --------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "imex/Conversion/XeGPUToXeVM/XeGPUToXeVM.h" -#include "mlir/Dialect/LLVMIR/XeVMDialect.h" - -#include "mlir/Conversion/LLVMCommon/Pattern.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/SCF/Transforms/Patterns.h" -#include "mlir/Dialect/XeGPU/IR/XeGPU.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "llvm/Support/FormatVariadic.h" - -#include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/Types.h" - -#include "llvm/ADT/TypeSwitch.h" - -#define DEBUG_TYPE "xegpu-to-xevm" - -namespace imex { -#define GEN_PASS_DEF_CONVERTXEGPUTOXEVMPASS -#include "imex/Conversion/Passes.h.inc" -} // namespace imex - -using namespace mlir; -using namespace xevm; -using namespace xegpu; - -namespace { - -enum class NdDescI32Layout : uint32_t { - BasePtr = 0, - BaseShapeW = 2, - BaseShapeH = 3, - TensorOffsetW = 4, - TensorOffsetH = 5 -}; - -static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) { - switch (xeGpuMemspace) { - case xegpu::MemorySpace::Global: - return static_cast(xevm::AddrSpace::GLOBAL); - case xegpu::MemorySpace::SLM: - return static_cast(xevm::AddrSpace::SHARED); - } - llvm_unreachable("Unknown XeGPU memory space."); -} - -template -std::tuple checkAllLinear(SmallVector denseAttr) { - assert(!denseAttr.empty()); - const int32_t intercept{static_cast(denseAttr[0])}; - if (denseAttr.size() < 2) - return {true, 0, intercept}; - const T slope{denseAttr[1] - denseAttr[0]}; - for (size_t i = 1; i < denseAttr.size(); ++i) - if (denseAttr[i] - denseAttr[i - 1] != slope) - return {false, 0, 0}; - return {true, static_cast(slope), intercept}; -} - -mlir::VectorType encodeVectorTypeTo(mlir::VectorType currentVecType, - mlir::Type toElemType) { - auto elemType = currentVecType.getElementType(); - auto currentBitWidth = elemType.getIntOrFloatBitWidth(); - auto newBitWidth = toElemType.getIntOrFloatBitWidth(); - const int size = - currentVecType.getNumElements() * currentBitWidth / newBitWidth; - return mlir::VectorType::get(size, toElemType); -} - -xevm::LoadCacheControl -translateLoadXeGPUCacheHint(std::optional L1hint, - std::optional L3hint) { - auto L1hintVal = - L1hint.has_value() ? L1hint.value() : xegpu::CachePolicy::UNCACHED; - auto L3hintVal = - L3hint.has_value() ? L3hint.value() : xegpu::CachePolicy::UNCACHED; - switch (L1hintVal) { - case xegpu::CachePolicy::CACHED: - if (L3hintVal == xegpu::CachePolicy::CACHED) - return xevm::LoadCacheControl::L1C_L2UC_L3C; - else if (L3hintVal == xegpu::CachePolicy::UNCACHED) - return xevm::LoadCacheControl::L1C_L2UC_L3UC; - else - llvm_unreachable("Unsupported cache control."); - case xegpu::CachePolicy::UNCACHED: - if (L3hintVal == xegpu::CachePolicy::CACHED) - return xevm::LoadCacheControl::L1UC_L2UC_L3C; - else if (L3hintVal == xegpu::CachePolicy::UNCACHED) - return xevm::LoadCacheControl::L1UC_L2UC_L3UC; - else - llvm_unreachable("Unsupported cache control."); - case xegpu::CachePolicy::STREAMING: - if (L3hintVal == xegpu::CachePolicy::CACHED) - return xevm::LoadCacheControl::L1S_L2UC_L3C; - else if (L3hintVal == xegpu::CachePolicy::UNCACHED) - return xevm::LoadCacheControl::L1S_L2UC_L3UC; - else - llvm_unreachable("Unsupported cache control."); - case xegpu::CachePolicy::READ_INVALIDATE: - return xevm::LoadCacheControl::INVALIDATE_READ; - default: - llvm_unreachable("Unsupported cache control."); - } -} - -xevm::StoreCacheControl -translateStoreXeGPUCacheHint(std::optional L1hint, - std::optional L3hint) { - auto L1hintVal = - L1hint.has_value() ? L1hint.value() : xegpu::CachePolicy::UNCACHED; - auto L3hintVal = - L3hint.has_value() ? L3hint.value() : xegpu::CachePolicy::UNCACHED; - switch (L1hintVal) { - case xegpu::CachePolicy::UNCACHED: - if (L3hintVal == xegpu::CachePolicy::UNCACHED) - return xevm::StoreCacheControl::L1UC_L2UC_L3UC; - else if (L3hintVal == xegpu::CachePolicy::WRITE_BACK) - return xevm::StoreCacheControl::L1UC_L2UC_L3WB; - else - llvm_unreachable("Unsupported cache control."); - case xegpu::CachePolicy::STREAMING: - if (L3hintVal == xegpu::CachePolicy::UNCACHED) - return xevm::StoreCacheControl::L1S_L2UC_L3UC; - else if (L3hintVal == xegpu::CachePolicy::WRITE_BACK) - return xevm::StoreCacheControl::L1S_L2UC_L3WB; - else - llvm_unreachable("Unsupported cache control."); - case xegpu::CachePolicy::WRITE_BACK: - if (L3hintVal == xegpu::CachePolicy::UNCACHED) - return xevm::StoreCacheControl::L1WB_L2UC_L3UC; - else if (L3hintVal == xegpu::CachePolicy::WRITE_BACK) - return xevm::StoreCacheControl::L1WB_L2UC_L3WB; - else - llvm_unreachable("Unsupported cache control."); - case xegpu::CachePolicy::WRITE_THROUGH: - if (L3hintVal == xegpu::CachePolicy::UNCACHED) - return xevm::StoreCacheControl::L1WT_L2UC_L3UC; - else if (L3hintVal == xegpu::CachePolicy::WRITE_BACK) - return xevm::StoreCacheControl::L1WT_L2UC_L3WB; - else - llvm_unreachable("Unsupported cache control."); - default: - llvm_unreachable("Unsupported cache control."); - } -} - -class CreateNdDescToXeVMPattern - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::CreateNdDescOp op, - xegpu::CreateNdDescOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto source = op.getSource(); - Type payloadElemTy = rewriter.getI32Type(); - Type i64Ty = rewriter.getI64Type(); - VectorType payloadTy = VectorType::get(8, payloadElemTy); - VectorType payloadI64Ty = VectorType::get(4, i64Ty); - Value payload = rewriter.create( - loc, - DenseElementsAttr::get(payloadTy, IntegerAttr::get(payloadElemTy, 0))); - - Value baseAddr; - Value baseShapeW; - Value baseShapeH; - Value offsetW; - Value offsetH; - auto convertToValue = [&](OpFoldResult ofr) -> Value { - Value val; - if (auto v = llvm::dyn_cast_if_present(ofr)) { - val = rewriter.create(loc, i64Ty, v); - val = rewriter.create(loc, payloadElemTy, val); - } else { - int32_t off = llvm::cast(cast(ofr)).getInt(); - val = rewriter.create(loc, payloadElemTy, off); - } - return val; - }; - - int rank = op.getMixedOffsets().size(); - if (rank != 2) { - op.emitError() << "Expected 2D offsets, got " << rank << "D offsets."; - return mlir::failure(); - } - offsetW = convertToValue(op.getMixedOffsets()[rank - 1]); - offsetH = convertToValue(op.getMixedOffsets()[rank - 2]); - - if (auto sourceTy = source.getType(); isa(sourceTy)) { - baseAddr = - rewriter.create(loc, source); - baseAddr = rewriter.create(loc, i64Ty, baseAddr); - auto sourceMemrefTy = cast(sourceTy); - if (!sourceMemrefTy.hasStaticShape()) { - op.emitError() << "Expected static memref shape."; - return mlir::failure(); - } - auto rank = sourceMemrefTy.getRank(); - baseShapeW = rewriter.create( - loc, payloadElemTy, sourceMemrefTy.getDimSize(rank - 1)); - baseShapeH = rewriter.create( - loc, payloadElemTy, sourceMemrefTy.getDimSize(rank - 2)); - } else if (isa(sourceTy)) { - baseAddr = source; - baseShapeW = convertToValue(op.getMixedSizes()[rank - 1]); - baseShapeH = convertToValue(op.getMixedSizes()[rank - 2]); - } else { - op.emitError() << "Unknown source type."; - return mlir::failure(); - } - - Value payLoadAsI64 = - rewriter.create(loc, payloadI64Ty, payload); - payLoadAsI64 = rewriter.create( - loc, baseAddr, payLoadAsI64, - static_cast(NdDescI32Layout::BasePtr)); - payload = rewriter.create(loc, payloadTy, payLoadAsI64); - payload = rewriter.create( - loc, baseShapeW, payload, - static_cast(NdDescI32Layout::BaseShapeW)); - payload = rewriter.create( - loc, baseShapeH, payload, - static_cast(NdDescI32Layout::BaseShapeH)); - payload = rewriter.create( - loc, offsetW, payload, - static_cast(NdDescI32Layout::TensorOffsetW)); - payload = rewriter.create( - loc, offsetH, payload, - static_cast(NdDescI32Layout::TensorOffsetH)); - rewriter.replaceOp(op, payload); - return success(); - } -}; - -class UpdateNdOffsetToXeVMPattern - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::UpdateNdOffsetOp op, - xegpu::UpdateNdOffsetOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto offsets = op.getOffsets(); - auto tdesc = adaptor.getTensorDesc(); - for (size_t offsetDim = 0; offsetDim < offsets.size(); offsetDim++) { - auto offset = offsets[offsetDim]; - if (auto cst = - dyn_cast_if_present(offset.getDefiningOp())) - if (auto attr = dyn_cast_if_present(cst.getValue()); - attr && !attr.getInt()) - continue; - const int offsetPos = - static_cast(offsetDim ? NdDescI32Layout::TensorOffsetW - : NdDescI32Layout::TensorOffsetH); - auto oldOffset = - rewriter.create(loc, tdesc, offsetPos); - offset = rewriter.create(loc, rewriter.getI32Type(), - offset); - auto newOffset = rewriter.create(loc, oldOffset, offset); - tdesc = - rewriter.create(loc, newOffset, tdesc, offsetPos); - } - rewriter.replaceOp(op, tdesc); - return success(); - } -}; - -template ::value>> -class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto ctxt = rewriter.getContext(); - - auto tdesc = adaptor.getTensorDesc(); - auto tdescTy = op.getTensorDescType(); - - VectorType payloadI64Ty = VectorType::get(4, rewriter.getI64Type()); - Value payLoadAsI64 = - rewriter.create(loc, payloadI64Ty, tdesc); - Value basePtr = rewriter.create( - loc, payLoadAsI64, static_cast(NdDescI32Layout::BasePtr)); - Value baseShapeW = rewriter.create( - loc, tdesc, static_cast(NdDescI32Layout::BaseShapeW)); - Value baseShapeH = rewriter.create( - loc, tdesc, static_cast(NdDescI32Layout::BaseShapeH)); - Value offsetW = rewriter.create( - loc, tdesc, static_cast(NdDescI32Layout::TensorOffsetW)); - Value offsetH = rewriter.create( - loc, tdesc, static_cast(NdDescI32Layout::TensorOffsetH)); - auto ptrTypeLLVM = LLVM::LLVMPointerType::get( - ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace())); - Value basePtrLLVM = - rewriter.create(loc, ptrTypeLLVM, basePtr); - auto elemType = tdescTy.getElementType(); - const uint32_t elemBitSize = elemType.getIntOrFloatBitWidth(); - Value elemByteSize = rewriter.create( - loc, rewriter.getI32Type(), elemBitSize / 8); - Value surfaceW = - rewriter.create(loc, baseShapeW, elemByteSize); - - auto tileW = tdescTy.getDimSize(1); - auto tileH = tdescTy.getDimSize(0); - int32_t vblocks = tdescTy.getArrayLength(); - if constexpr (std::is_same_v) { - VectorType srcVecTy = cast(op.getValue().getType()); - auto storeCacheControl = - translateStoreXeGPUCacheHint(op.getL1Hint(), op.getL3Hint()); - VectorType srcFlatVecTy = - VectorType::get(srcVecTy.getNumElements(), srcVecTy.getElementType()); - Value srcFlatVec = op.getValue(); - srcFlatVecTy = encodeVectorTypeTo(srcFlatVecTy, - rewriter.getIntegerType(elemBitSize)); - srcFlatVec = - rewriter.create(loc, srcFlatVecTy, srcFlatVec); - rewriter.create( - loc, basePtrLLVM, surfaceW, baseShapeH, surfaceW, offsetW, offsetH, - elemBitSize, tileW, tileH, srcFlatVec, - xevm::StoreCacheControlAttr::get(ctxt, storeCacheControl)); - rewriter.eraseOp(op); - } else { - auto loadCacheControl = - translateLoadXeGPUCacheHint(op.getL1Hint(), op.getL3Hint()); - if constexpr (std::is_same_v) { - rewriter.create( - loc, basePtrLLVM, surfaceW, baseShapeH, surfaceW, offsetW, offsetH, - elemBitSize, tileW, tileH, vblocks, - xevm::LoadCacheControlAttr::get(ctxt, loadCacheControl)); - rewriter.eraseOp(op); - } else { - VectorType dstVecTy = cast(op.getValue().getType()); - const bool vnni = op.getPacked().value_or(false); - auto transposeValue = op.getTranspose(); - bool transpose = - transposeValue.has_value() && transposeValue.value()[0] == 1; - VectorType loadedTy = encodeVectorTypeTo( - dstVecTy, vnni ? rewriter.getI32Type() - : rewriter.getIntegerType(elemBitSize)); - - Value resultFlatVec = rewriter.create( - loc, loadedTy, basePtrLLVM, surfaceW, baseShapeH, surfaceW, offsetW, - offsetH, elemBitSize, tileW, tileH, vblocks, transpose, vnni, - xevm::LoadCacheControlAttr::get(ctxt, loadCacheControl)); - resultFlatVec = rewriter.create( - loc, encodeVectorTypeTo(loadedTy, dstVecTy.getElementType()), - resultFlatVec); - rewriter.replaceOp(op, resultFlatVec); - } - } - return success(); - } -}; - -class CreateDescToXeVMPattern - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::CreateDescOp op, xegpu::CreateDescOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto offsets = op.getOffsets(); - bool allLinear{false}; - int32_t slope{0}; - int32_t intercept{0}; - if (auto cstOp = dyn_cast(offsets.getDefiningOp())) { - if (auto denseAttr = cstOp->getAttrOfType( - cstOp.getValueAttrName())) { - SmallVector intValues; - for (APInt val : denseAttr.getValues()) - intValues.push_back(static_cast(val.getSExtValue())); - std::tie(allLinear, slope, intercept) = checkAllLinear(intValues); - } else { - op.emitError() << "Unknown offsets source, expected a dense array."; - return failure(); - } - } else { - op.emitError() - << "Unknown offsets source, must be a compile-time constant array."; - return failure(); - } - if (!allLinear) { - op.emitError() << "Expected linear offsets pattern."; - return failure(); - } - - auto memrefTy = cast(op.getSource().getType()); - Value subGroupAddr = - rewriter.create(loc, - op.getSource()); - Value elemByteWidth = rewriter.create( - loc, memrefTy.getElementTypeBitWidth() / 8); - Value offsetIntercept = - rewriter.create(loc, intercept); - offsetIntercept = - rewriter.create(loc, elemByteWidth, offsetIntercept); - Value offsetSlope = rewriter.create(loc, slope); - offsetSlope = - rewriter.create(loc, elemByteWidth, offsetSlope); - Value laneId = rewriter.create(loc, /*upperBound=*/nullptr); - Value laneOffset = rewriter.create(loc, laneId, offsetSlope); - laneOffset = - rewriter.create(loc, laneOffset, offsetIntercept); - auto laneAddr = - rewriter.create(loc, subGroupAddr, laneOffset); - rewriter.replaceOp(op, laneAddr); - return success(); - } -}; - -class UpdateOffsetToXeVMPattern - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::UpdateOffsetOp op, - xegpu::UpdateOffsetOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto elemByteSize = - op.getTensorDesc().getType().getElementType().getIntOrFloatBitWidth() / - 8; - Value laneId = rewriter.create(loc, /*upperBound=*/nullptr); - Value offsetForLane = - rewriter.create(loc, adaptor.getOffsets(), laneId); - Value factor = rewriter.create(loc, elemByteSize); - offsetForLane = rewriter.create( - loc, rewriter.getIndexType(), offsetForLane); - offsetForLane = rewriter.create(loc, factor, offsetForLane); - Value newOffsetForLane = rewriter.create( - loc, adaptor.getTensorDesc(), offsetForLane); - rewriter.replaceOp(op, newOffsetForLane); - return success(); - } -}; - -template ::value>> -class LoadStoreToXeVMPattern : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto ctxt = rewriter.getContext(); - auto tdesc = op.getTensorDescType(); - auto ptrTypeLLVM = LLVM::LLVMPointerType::get( - ctxt, getNumericXeVMAddrSpace(tdesc.getMemorySpace())); - VectorType srcOrDstVecTy = cast(op.getValue().getType()); - VectorType srcOrDstFlatVecTy = VectorType::get( - srcOrDstVecTy.getNumElements(), srcOrDstVecTy.getElementType()); - if constexpr (std::is_same_v) { - Value basePtrI64 = rewriter.create( - loc, rewriter.getI64Type(), adaptor.getSource()); - Value basePtrLLVM = - rewriter.create(loc, ptrTypeLLVM, basePtrI64); - Value loaded = - rewriter.create(loc, srcOrDstFlatVecTy, basePtrLLVM); - auto newOp = - rewriter.create(loc, srcOrDstVecTy, loaded); - rewriter.replaceOp(op, newOp); - } else { - Value basePtrI64 = rewriter.create( - loc, rewriter.getI64Type(), adaptor.getDest()); - Value basePtrLLVM = - rewriter.create(loc, ptrTypeLLVM, basePtrI64); - Value srcFlatVec = rewriter.create( - loc, srcOrDstFlatVecTy, op.getValue()); - rewriter.create(loc, srcFlatVec, basePtrLLVM); - rewriter.eraseOp(op); - } - return success(); - } -}; - -class PrefetchToXeVMPattern : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::PrefetchOp op, xegpu::PrefetchOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto ctxt = rewriter.getContext(); - auto tdescTy = op.getTensorDescType(); - auto ptrTypeLLVM = LLVM::LLVMPointerType::get( - ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace())); - Value basePtrI64 = rewriter.create( - loc, rewriter.getI64Type(), adaptor.getSource()); - Value ptrLLVM = - rewriter.create(loc, ptrTypeLLVM, basePtrI64); - rewriter.create( - loc, ptrLLVM, - xevm::LoadCacheControlAttr::get( - ctxt, translateLoadXeGPUCacheHint(op.getL1Hint(), op.getL3Hint()))); - return success(); - } -}; -class FenceToXeVMPattern : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::FenceOp op, xegpu::FenceOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - xevm::MemScope memScope{xevm::MemScope::WORKGROUP}; - switch (op.getFenceScope()) { - case xegpu::FenceScope::Workgroup: - memScope = xevm::MemScope::WORKGROUP; - break; - case xegpu::FenceScope::Local: - memScope = xevm::MemScope::LANE; - break; - case xegpu::FenceScope::Tile: - memScope = xevm::MemScope::SUBGROUP; - break; - case xegpu::FenceScope::GPU: - memScope = xevm::MemScope::DEVICE; - break; - case xegpu::FenceScope::System: - memScope = xevm::MemScope::SYSTEM; - break; - llvm_unreachable("Unknown XeGPU fence scope."); - } - xevm::AddrSpace addrSpace{xevm::AddrSpace::GLOBAL}; - switch (op.getMemoryKind()) { - case xegpu::MemorySpace::Global: - addrSpace = xevm::AddrSpace::GLOBAL; - break; - case xegpu::MemorySpace::SLM: - addrSpace = xevm::AddrSpace::SHARED; - break; - llvm_unreachable("Unknown XeGPU fence scope."); - } - rewriter.create(loc, memScope, addrSpace); - rewriter.eraseOp(op); - return success(); - } -}; - -class DpasToXeVMPattern : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::DpasOp op, xegpu::DpasOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto ctxt = rewriter.getContext(); - auto aTy = mlir::cast(op.getLhs().getType()); - auto bTy = mlir::cast(op.getRhs().getType()); - auto resultType = mlir::cast(op.getResultType()); - - auto encodePrecision = [&](Type type) -> xevm::ElemType { - if (type == rewriter.getBF16Type()) - return xevm::ElemType::BF16; - else if (type == rewriter.getF16Type()) - return xevm::ElemType::F16; - else if (type == rewriter.getTF32Type()) - return xevm::ElemType::TF32; - else if (type.isInteger(8)) { - if (type.isUnsignedInteger()) - return xevm::ElemType::U8; - return xevm::ElemType::S8; - } else if (type == rewriter.getF32Type()) - return xevm::ElemType::F32; - else if (type.isInteger(32)) - return xevm::ElemType::S32; - llvm_unreachable("add more support for ElemType"); - }; - xevm::ElemType precATy = encodePrecision(aTy.getElementType()); - xevm::ElemType precBTy = encodePrecision(bTy.getElementType()); - Value c = op.getAcc(); - if (!c) { - auto elementTy = resultType.getElementType(); - Attribute initValueAttr; - if (isa(elementTy)) - initValueAttr = FloatAttr::get(elementTy, 0.0); - else - initValueAttr = IntegerAttr::get(elementTy, 0); - c = rewriter.create( - loc, DenseElementsAttr::get(resultType, initValueAttr)); - } - - Value aVec = op.getLhs(); - Value bVec = op.getRhs(); - auto cvecty = cast(c.getType()); - xevm::ElemType precCTy = encodePrecision(cvecty.getElementType()); - xevm::ElemType precDTy = encodePrecision(resultType.getElementType()); - VectorType cNty = - VectorType::get(cvecty.getNumElements(), cvecty.getElementType()); - if (cvecty != cNty) - c = rewriter.create(loc, cNty, c); - // TODO: below are uArch dependent values, should move away from hardcoding - constexpr int32_t systolicDepth{8}; - constexpr int32_t executionSize{16}; - Value dpasRes = rewriter.create( - loc, cNty, aVec, bVec, c, - xevm::MMAShapeAttr::get(ctxt, cvecty.getNumElements(), executionSize, - systolicDepth), - xevm::MMATypesAttr::get(ctxt, precDTy, precATy, precBTy, precCTy)); - if (cvecty != cNty) - dpasRes = rewriter.create(loc, resultType, dpasRes); - rewriter.replaceOp(op, dpasRes); - return success(); - } -}; - -static std::optional -matchSimpleAtomicOp(arith::AtomicRMWKind arithKind) { - switch (arithKind) { - case arith::AtomicRMWKind::addf: - return LLVM::AtomicBinOp::fadd; - case arith::AtomicRMWKind::addi: - return LLVM::AtomicBinOp::add; - case arith::AtomicRMWKind::assign: - return LLVM::AtomicBinOp::xchg; - case arith::AtomicRMWKind::maximumf: - return LLVM::AtomicBinOp::fmax; - case arith::AtomicRMWKind::maxs: - return LLVM::AtomicBinOp::max; - case arith::AtomicRMWKind::maxu: - return LLVM::AtomicBinOp::umax; - case arith::AtomicRMWKind::minimumf: - return LLVM::AtomicBinOp::fmin; - case arith::AtomicRMWKind::mins: - return LLVM::AtomicBinOp::min; - case arith::AtomicRMWKind::minu: - return LLVM::AtomicBinOp::umin; - case arith::AtomicRMWKind::ori: - return LLVM::AtomicBinOp::_or; - case arith::AtomicRMWKind::andi: - return LLVM::AtomicBinOp::_and; - default: - return std::nullopt; - } - llvm_unreachable("Invalid AtomicRMWKind"); -} - -class AtomicRMWToXeVMPattern : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult - matchAndRewrite(xegpu::AtomicRMWOp op, xegpu::AtomicRMWOp::Adaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto loc = op.getLoc(); - auto ctxt = rewriter.getContext(); - auto tdesc = op.getTensorDesc().getType(); - auto ptrTypeLLVM = LLVM::LLVMPointerType::get( - ctxt, getNumericXeVMAddrSpace(tdesc.getMemorySpace())); - Value basePtrI64 = rewriter.create( - loc, rewriter.getI64Type(), adaptor.getTensorDesc()); - Value basePtrLLVM = - rewriter.create(loc, ptrTypeLLVM, basePtrI64); - VectorType srcOrDstVecTy = cast(op.getValue().getType()); - VectorType srcOrDstFlatVecTy = VectorType::get( - srcOrDstVecTy.getNumElements(), srcOrDstVecTy.getElementType()); - Value srcFlatVec = rewriter.create( - loc, srcOrDstFlatVecTy, op.getValue()); - auto atomicKind = matchSimpleAtomicOp(op.getKind()); - assert(atomicKind.has_value()); - Value resVec = srcFlatVec; - for (int i = 0; i < srcOrDstVecTy.getNumElements(); i++) { - auto val = rewriter.create(loc, resVec, i); - Value idx = rewriter.create(loc, rewriter.getI64Type(), - rewriter.getIndexAttr(i)); - Value currPtr = rewriter.create( - loc, ptrTypeLLVM, srcOrDstVecTy.getElementType(), basePtrLLVM, idx); - Value newVal = rewriter.create( - loc, atomicKind.value(), currPtr, val, LLVM::AtomicOrdering::seq_cst); - resVec = rewriter.create(loc, newVal, resVec, i); - } - rewriter.replaceOp(op, resVec); - return success(); - } -}; - -//===----------------------------------------------------------------------===// -// Pass Definition -//===----------------------------------------------------------------------===// - -struct ConvertXeGPUToXeVMPass - : public imex::impl::ConvertXeGPUToXeVMPassBase { - using Base::Base; - - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - void runOnOperation() override { - LLVMTypeConverter typeConverter(&getContext()); - typeConverter.addConversion([&](IndexType type) -> Type { return type; }); - typeConverter.addConversion([&](VectorType type) -> Type { - unsigned rank = type.getRank(); - auto elemType = type.getElementType(); - if (llvm::isa(elemType)) - elemType = mlir::IntegerType::get(&getContext(), 64); - if (rank < 1 || type.getNumElements() == 1) - return elemType; - unsigned sum = 1; - for (unsigned i = 0; i < rank; i++) { - sum *= type.getShape()[i]; - } - return VectorType::get(sum, elemType); - }); - typeConverter.addConversion([&](xegpu::TensorDescType type) -> Type { - if (type.isScattered()) { - return IndexType::get(&getContext()); - } - auto i32Type = IntegerType::get(&getContext(), 32); - return VectorType::get(8, i32Type); - }); - - ConversionTarget target(getContext()); - target.addLegalDialect(); - target.addIllegalDialect(); - - RewritePatternSet patterns(&getContext()); - imex::populateXeGPUToXeVMConversionPatterns(patterns, typeConverter); - mlir::scf::populateSCFStructuralTypeConversionsAndLegality( - typeConverter, patterns, target); - if (failed(applyPartialConversion(getOperation(), target, - std::move(patterns)))) - signalPassFailure(); - } -}; -} // namespace - -//===----------------------------------------------------------------------===// -// Pattern Population -//===----------------------------------------------------------------------===// -void imex::populateXeGPUToXeVMConversionPatterns( - RewritePatternSet &patterns, LLVMTypeConverter &typeConverter) { - patterns.add, - LoadStorePrefetchNdToXeVMPattern, - LoadStorePrefetchNdToXeVMPattern>( - typeConverter, patterns.getContext()); - patterns.add, - LoadStoreToXeVMPattern>( - typeConverter, patterns.getContext()); - patterns.add(typeConverter, - patterns.getContext()); -} diff --git a/test/Conversion/XeGPUToXeVM/dpas.mlir b/test/Conversion/XeGPUToXeVM/dpas.mlir deleted file mode 100644 index 40a25eb71..000000000 --- a/test/Conversion/XeGPUToXeVM/dpas.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: imex-opt -convert-xegpu-to-xevm %s | FileCheck %s - -#sg_map_a_f16 = #xegpu.layout -#sg_map_b_f16 = #xegpu.layout -#sg_map_c_f32 = #xegpu.layout - -gpu.module @load_store_check { - //CHECK: func.func @dpas(%[[ARG0:.*]]: vector<8xf16>, %[[ARG1:.*]]: vector<16xf16>, %[[ARG2:.*]]: vector<8xf32>) -> vector<8xf32> - func.func @dpas(%a_loaded: vector<8xf16>, %b_loaded: vector<16xf16>, %c_loaded: vector<8xf32>) -> vector<8xf32> { - // Loads are checked in a separate test. - // CHECK: %[[D:.*]] = xevm.mma %[[ARG0]], %[[ARG1]], %[[ARG2]] {shape = , types = } : (vector<8xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32> - %d = xegpu.dpas %a_loaded, %b_loaded, %c_loaded {a_layout = #sg_map_a_f16, b_layout = #sg_map_b_f16, c_layout = #sg_map_c_f32} : vector<8xf16>, vector<16xf16>, vector<8xf32> -> vector<8xf32> - return %d : vector<8xf32> - } -} diff --git a/test/Conversion/XeGPUToXeVM/fence.mlir b/test/Conversion/XeGPUToXeVM/fence.mlir deleted file mode 100644 index 32c21a940..000000000 --- a/test/Conversion/XeGPUToXeVM/fence.mlir +++ /dev/null @@ -1,15 +0,0 @@ -// RUN: imex-opt -convert-xegpu-to-xevm %s | FileCheck %s - -gpu.module @fence_check { - gpu.func @fence(%dst: memref<8x16xf32, 1>) kernel { - %tid_x = gpu.thread_id x - %tid_x_i32 = arith.index_cast %tid_x : index to i32 - %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 - - // CHECK: xevm.memfence <{addrspace = #xevm.addr_space, scope = #xevm.mem_scope}> - xegpu.fence memory_kind = global, fence_scope = workgroup - %c0 = arith.constant 0 : index - memref.store %tid_x_f32, %dst[%c0, %c0] : memref<8x16xf32, 1> - gpu.return - } -} diff --git a/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir b/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir deleted file mode 100644 index dd7531b38..000000000 --- a/test/Conversion/XeGPUToXeVM/loadstore_nd.mlir +++ /dev/null @@ -1,96 +0,0 @@ -// RUN: imex-opt -convert-xegpu-to-xevm -allow-unregistered-dialect %s | FileCheck %s - -gpu.module @load_store_check { - gpu.func @load_store(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel { - %srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32> - %dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32> - - // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64 - // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64> - // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64> - // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32> - // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32> - // CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32> - // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32> - // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32> - %src_tdesc = xegpu.create_nd_tdesc %srcce[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - - - //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64> - //CHECK: %[[LD_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64> - //CHECK: %[[LD_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32> - //CHECK: %[[LD_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32> - //CHECK: %[[LD_TILE_W:.*]] = vector.extract %[[LD_DESC]][4] : i32 from vector<8xi32> - //CHECK: %[[LD_TILE_H:.*]] = vector.extract %[[LD_DESC]][5] : i32 from vector<8xi32> - //CHECK: %[[LD_LLVMPTR:.*]] = llvm.inttoptr %[[LD_INTPTR]] : i64 to !llvm.ptr<1> - //CHECK: %[[LD_SIZEOF_F32:.*]] = arith.constant 4 : i32 - //CHECK: %[[LD_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[LD_BASE_W]], %[[LD_SIZEOF_F32]] : i32 - //CHECK: %[[LD_LOADED_I32:.*]] = xevm.blockload2d %[[LD_LLVMPTR]], %[[LD_BASE_ROW_IN_BYTES]], - //CHECK-SAME: %[[LD_BASE_H]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_TILE_W]], %[[LD_TILE_H]] - //CHECK-SAME: <{cache_control = #xevm.load_cache_control, elem_size_in_bits = 32 : i32, - //CHECK-SAME: pack_register = false, tile_height = 8 : i32, tile_width = 16 : i32, transpose = false, - //CHECK-SAME: v_blocks = 1 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> - %loaded = xegpu.load_nd %src_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> - //CHECK: %[[LD_LOADED_F32:.*]] = vector.bitcast %[[LD_LOADED_I32]] : vector<8xi32> to vector<8xf32> - - %tid_x = gpu.thread_id x - %tid_x_i32 = arith.index_cast %tid_x : index to i32 - %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 - //CHECK: %[[LOADED_F32_MODIFIED:.*]] = vector.insert %{{.*}}, %[[LD_LOADED_F32]] [0] : f32 into vector<8xf32> - %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<8xf32> - - // CHECK: %[[PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64 - // CHECK: %[[CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64> - // CHECK: %[[DESC_0:.*]] = vector.insert %[[PTR_AS_I64]], %[[CREATE_DESC_I64]] [0] : i64 into vector<4xi64> - // CHECK: %[[DESC_1:.*]] = vector.bitcast %[[DESC_0]] : vector<4xi64> to vector<8xi32> - // CHECK: %[[DESC_2:.*]] = vector.insert {{.*}}, %[[DESC_1]] [2] : i32 into vector<8xi32> - // CHECK: %[[DESC_3:.*]] = vector.insert {{.*}}, %[[DESC_2]] [3] : i32 into vector<8xi32> - // CHECK: %[[DESC_4:.*]] = vector.insert {{.*}}, %[[DESC_3]] [4] : i32 into vector<8xi32> - // CHECK: %[[DESC:.*]] = vector.insert {{.*}}, %[[DESC_4]] [5] : i32 into vector<8xi32> - %dst_tdesc = xegpu.create_nd_tdesc %dstte[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - - //CHECK: %[[DESC_I64:.*]] = vector.bitcast %[[DESC]] : vector<8xi32> to vector<4xi64> - //CHECK: %[[INTPTR:.*]] = vector.extract %[[DESC_I64]][0] : i64 from vector<4xi64> - //CHECK: %[[BASE_W:.*]] = vector.extract %[[DESC]][2] : i32 from vector<8xi32> - //CHECK: %[[BASE_H:.*]] = vector.extract %[[DESC]][3] : i32 from vector<8xi32> - //CHECK: %[[TILE_W:.*]] = vector.extract %[[DESC]][4] : i32 from vector<8xi32> - //CHECK: %[[TILE_H:.*]] = vector.extract %[[DESC]][5] : i32 from vector<8xi32> - //CHECK: %[[LLVMPTR:.*]] = llvm.inttoptr %[[INTPTR]] : i64 to !llvm.ptr<1> - //CHECK: %[[SIZEOF_F32:.*]] = arith.constant 4 : i32 - //CHECK: %[[BASE_ROW_IN_BYTES:.*]] = arith.muli %[[BASE_W]], %[[SIZEOF_F32]] : i32 - //CHECK: %[[FLAT_VALUE_I32:.*]] = vector.bitcast %[[LOADED_F32_MODIFIED]] : vector<8xf32> to vector<8xi32> - //CHECK: xevm.blockstore2d %[[LLVMPTR]], %[[BASE_ROW_IN_BYTES]], %[[BASE_H]], %[[BASE_ROW_IN_BYTES]], - //CHECK-SAME: %[[TILE_W]], %[[TILE_H]], %[[FLAT_VALUE_I32]] - //CHECK-SAME: <{cache_control = #xevm.store_cache_control, elem_size_in_bits = 32 : i32, - //CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>) - xegpu.store_nd %loaded_modified, %dst_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - gpu.return - } - - gpu.func @create_nd_tdesc_integer_source(%src: i64, %src_h : index, %src_w : index) kernel { - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %c8 = arith.constant 8 : index - %c0 = arith.constant 0 : index - // CHECK: %[[PAYLOAD:.*]] = arith.constant dense<0> : vector<8xi32> - // CHECK: %[[T0:.*]] = arith.index_cast %{{.*}} : index to i64 - // CHECK: %[[T1:.*]] = arith.trunci %[[T0]] : i64 to i32 - // CHECK: %[[T2:.*]] = arith.index_cast %{{.*}} : index to i64 - // CHECK: %[[T3:.*]] = arith.trunci %[[T2]] : i64 to i32 - // CHECK: %[[T4:.*]] = arith.index_cast %{{.*}} : index to i64 - // CHECK: %[[T5:.*]] = arith.trunci %[[T4]] : i64 to i32 - // CHECK: %[[T6:.*]] = arith.index_cast %{{.*}} : index to i64 - // CHECK: %[[T7:.*]] = arith.trunci %[[T6]] : i64 to i32 - // CHECK: %[[T8:.*]] = vector.bitcast %[[PAYLOAD]] : vector<8xi32> to vector<4xi64> - // CHECK: %[[T9:.*]] = vector.insert %{{.*}}, %[[T8]] [0] : i64 into vector<4xi64> - // CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] : vector<4xi64> to vector<8xi32> - // CHECK: %[[T11:.*]] = vector.insert %[[T5]], %[[T10]] [2] : i32 into vector<8xi32> - // CHECK: %[[T12:.*]] = vector.insert %[[T7]], %[[T11]] [3] : i32 into vector<8xi32> - // CHECK: %[[T13:.*]] = vector.insert %[[T1]], %[[T12]] [4] : i32 into vector<8xi32> - // CHECK: %[[T14:.*]] = vector.insert %[[T3]], %[[T13]] [5] : i32 into vector<8xi32> - %src_tdesc = xegpu.create_nd_tdesc %src [%c4, %c8], shape: [%src_h, %src_w], strides: [%src_w, %c1] : i64 - -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - "some_op"(%src_tdesc) : (!xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>) -> () - gpu.return - } -} diff --git a/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir b/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir deleted file mode 100644 index 2a67edfa8..000000000 --- a/test/Conversion/XeGPUToXeVM/prefetch_nd.mlir +++ /dev/null @@ -1,36 +0,0 @@ -// RUN: imex-opt -convert-xegpu-to-xevm -split-input-file %s | FileCheck %s - -gpu.module @fence_check { - gpu.func @fence(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel { - %srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32> - %dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32> - - // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64 - // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64> - // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64> - // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32> - // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32> - // CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32> - // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32> - // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32> - %src_tdesc = xegpu.create_nd_tdesc %srcce[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - - //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64> - //CHECK: %[[PREF_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64> - //CHECK: %[[PREF_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32> - //CHECK: %[[PREF_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32> - //CHECK: %[[PREF_TILE_W:.*]] = vector.extract %[[LD_DESC]][4] : i32 from vector<8xi32> - //CHECK: %[[PREF_TILE_H:.*]] = vector.extract %[[LD_DESC]][5] : i32 from vector<8xi32> - //CHECK: %[[PREF_LLVMPTR:.*]] = llvm.inttoptr %[[PREF_INTPTR]] : i64 to !llvm.ptr<1> - //CHECK: %[[PREF_SIZEOF_F32:.*]] = arith.constant 4 : i32 - //CHECK: %[[PREF_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[PREF_BASE_W]], %[[PREF_SIZEOF_F32]] : i32 - //CHECK: xevm.blockprefetch2d %[[PREF_LLVMPTR]], %[[PREF_BASE_ROW_IN_BYTES]], %[[PREF_BASE_H]], - //CHECK-SAME: %[[PREF_BASE_ROW_IN_BYTES]], %[[PREF_TILE_W]], %[[PREF_TILE_H]] - //CHECK-SAME: <{cache_control = #xevm.load_cache_control, elem_size_in_bits = 32 : i32, - //CHECK-SAME: tile_height = 8 : i32, tile_width = 16 : i32, v_blocks = 1 : i32}> - //CHECK-SAME: : (!llvm.ptr<1>, i32, i32, i32, i32, i32) - xegpu.prefetch_nd %src_tdesc<{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.layout> - - gpu.return - } -} diff --git a/test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg b/test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg index 08a5c9747..bfd921fe2 100644 --- a/test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg +++ b/test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg @@ -1,2 +1,2 @@ if(not config.mlir_enable_levelzero_runtime): - config.unsupported = True + config.unsupported = True \ No newline at end of file diff --git a/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1.mlir b/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1.mlir index db54e23db..cf6ae57cb 100644 --- a/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1.mlir +++ b/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1.mlir @@ -9,19 +9,16 @@ module @gemm attributes {gpu.container_module} { %srcce = memref.memory_space_cast %src : memref<128xf32, 1> to memref<128xf32> %dstte = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32> - %offsets = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> - %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> - %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %tid_x = gpu.thread_id x - %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> - %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<1xf32> + %mask = arith.constant 1 : i1 + %loaded = xegpu.load %srcce[%tid_x], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, chunk_size = 1}>: memref<128xf32>, index, i1 -> vector<1xf32> - %tid_x = gpu.thread_id x %tid_x_i32 = arith.index_cast %tid_x : index to i32 %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<1xf32> - xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> + xegpu.store %loaded_modified, %dstte[%tid_x], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, chunk_size = 1}> : vector<1xf32>, memref<128xf32>, index, i1 gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1_non_contig_offsets.mlir b/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1_non_contig_offsets.mlir index 20c1decf7..34d5da091 100644 --- a/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1_non_contig_offsets.mlir +++ b/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1_non_contig_offsets.mlir @@ -9,21 +9,20 @@ module @gemm attributes {gpu.container_module} { %srcce = memref.memory_space_cast %src : memref<128xf32, 1> to memref<128xf32> %dstte = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32> - %offsets = arith.constant dense<[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]> : vector<16xindex> - %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> - %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %c2 = arith.constant 2 : index + %tid_x = gpu.thread_id x + %offsets = arith.muli %tid_x, %c2 : index - %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> - %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<1xf32> + %mask = arith.constant 1 : i1 + %loaded = xegpu.load %srcce[%offsets], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, chunk_size = 1}> : memref<128xf32>, index, i1 -> vector<1xf32> - %tid_x = gpu.thread_id x %tid_x_i32 = arith.index_cast %tid_x : index to i32 %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 %c0 = arith.constant 0 : i32 %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<1xf32> - xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> + xegpu.store %loaded_modified, %dstte[%offsets], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, chunk_size = 1}> : vector<1xf32>, memref<128xf32>, index, i1 gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1_update_offset.mlir b/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1_update_offset.mlir index c6f2f5990..99c783cab 100644 --- a/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1_update_offset.mlir +++ b/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_1_update_offset.mlir @@ -9,23 +9,25 @@ module @gemm attributes {gpu.container_module} { %srcce = memref.memory_space_cast %src : memref<128xf32, 1> to memref<128xf32> %dstte = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32> - %offsets = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> - %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> - %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr> + %tid_x = gpu.thread_id x + %1 = arith.constant dense<[0]> : vector<1xindex> + %offsets = vector.insert %tid_x, %1[0] : index into vector<1xindex> - %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> - %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<1xf32> + %mask = arith.constant 1 : i1 + %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<1xindex> -> !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr> + %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<1xindex> -> !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr> + + %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr>, i1 -> vector<1xf32> - %tid_x = gpu.thread_id x %tid_x_i32 = arith.index_cast %tid_x : index to i32 %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<1xf32> - xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> + xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr>, i1 - %update_offset = arith.constant dense<[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]> : vector<16xindex> - %dst_tdesc_new = xegpu.update_offset %dst_tdesc, %update_offset : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xindex> - xegpu.store %loaded_modified, %dst_tdesc_new, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> + %update_offset = arith.constant dense<[16]> : vector<1xindex> + %dst_tdesc_new = xegpu.update_offset %dst_tdesc, %update_offset : !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr>, vector<1xindex> + xegpu.store %loaded_modified, %dst_tdesc_new, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<1xf32>, !xegpu.tensor_desc<1xf32, #xegpu.scatter_tdesc_attr>, i1 gpu.return } } diff --git a/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_2.mlir b/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_2.mlir index 93fcfa231..cd8540d3d 100644 --- a/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_2.mlir +++ b/test/Integration/Dialect/XeGPU/SIMT/loadstore_scatter_chunk_size_2.mlir @@ -8,19 +8,18 @@ module @gemm attributes {gpu.container_module} { %srcce = memref.memory_space_cast %src : memref<128xf32, 1> to memref<128xf32> %dstte = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32> - %offsets = arith.constant dense<[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]> : vector<16xindex> - %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %c2 = arith.constant 2 : index + %tid_x = gpu.thread_id x + %offsets = arith.muli %tid_x, %c2 : index - %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> - %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<2xf32> + %mask = arith.constant 1 : i1 + %loaded = xegpu.load %srcce[%offsets], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, chunk_size = 2}> : memref<128xf32>, index, i1 -> vector<2xf32> - %tid_x = gpu.thread_id x %tid_x_i32 = arith.index_cast %tid_x : index to i32 %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<2xf32> - xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : vector<2xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> + xegpu.store %loaded_modified, %dstte[%offsets], %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, chunk_size = 2}> : vector<2xf32>, memref<128xf32>, index, i1 gpu.return } } From 341bdae7ae2081e95aefa8d8d4e72cf36ebfcfd3 Mon Sep 17 00:00:00 2001 From: Garra1980 Date: Thu, 4 Sep 2025 00:29:48 +0200 Subject: [PATCH 2/2] fix pre-commit --- build_tools/llvm_version.txt | 2 +- lib/Conversion/CMakeLists.txt | 2 +- test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt index 70268252b..a49955bf2 100644 --- a/build_tools/llvm_version.txt +++ b/build_tools/llvm_version.txt @@ -1 +1 @@ -cdf30f0bc2362f8ac0b0a946372d7114229c34fd \ No newline at end of file +cdf30f0bc2362f8ac0b0a946372d7114229c34fd diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index eee3f57ab..e02db3694 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -7,4 +7,4 @@ add_subdirectory(GPUToGPUX) add_subdirectory(GPUXToLLVM) add_subdirectory(MathToVC) add_subdirectory(XeTileToXeGPU) -add_subdirectory(XeGPUToVC) \ No newline at end of file +add_subdirectory(XeGPUToVC) diff --git a/test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg b/test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg index bfd921fe2..08a5c9747 100644 --- a/test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg +++ b/test/Integration/Dialect/XeGPU/SIMT/lit.local.cfg @@ -1,2 +1,2 @@ if(not config.mlir_enable_levelzero_runtime): - config.unsupported = True \ No newline at end of file + config.unsupported = True