From 7f6d6efa5a9b3fab973bfc915037577425949c68 Mon Sep 17 00:00:00 2001 From: jerryyin Date: Tue, 18 Mar 2025 13:02:00 +0000 Subject: [PATCH 1/5] Adding Vector to AMDGPU conversion lowering --- mlir/include/mlir/Conversion/Passes.h | 1 + mlir/include/mlir/Conversion/Passes.td | 10 ++ .../VectorToAMDGPU/VectorToAMDGPU.h | 24 +++ mlir/lib/Conversion/CMakeLists.txt | 1 + .../Conversion/VectorToAMDGPU/CMakeLists.txt | 18 +++ .../VectorToAMDGPU/VectorToAMDGPU.cpp | 147 ++++++++++++++++++ .../vector-transfer-read-to-vector-load.mlir | 68 ++++++++ 7 files changed, 269 insertions(+) create mode 100644 mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h create mode 100644 mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt create mode 100644 mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp create mode 100644 mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index ccd862f67c068..ed5e8de8787f7 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -73,6 +73,7 @@ #include "mlir/Conversion/TosaToTensor/TosaToTensor.h" #include "mlir/Conversion/UBToLLVM/UBToLLVM.h" #include "mlir/Conversion/UBToSPIRV/UBToSPIRV.h" +#include "mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h" #include "mlir/Conversion/VectorToArmSME/VectorToArmSME.h" #include "mlir/Conversion/VectorToGPU/VectorToGPU.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index bbba495e613b2..1845d0235183e 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -1333,6 +1333,16 @@ def ConvertVectorToArmSMEPass : Pass<"convert-vector-to-arm-sme"> { let dependentDialects = ["arm_sme::ArmSMEDialect", "arm_sve::ArmSVEDialect"]; } +//===----------------------------------------------------------------------===// +// VectorToAMDGPU +//===----------------------------------------------------------------------===// + +def ConvertVectorToAMDGPUPass : Pass<"convert-vector-to-amdgpu"> { + let summary = "Lower the operations from the vector dialect into the AMDGPU " + "dialect"; + let dependentDialects = ["amdgpu::AMDGPUDialect", "vector::VectorDialect"]; +} + //===----------------------------------------------------------------------===// // ArmSMEToSCF //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h b/mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h new file mode 100644 index 0000000000000..be96061a23b08 --- /dev/null +++ b/mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h @@ -0,0 +1,24 @@ +//===- VectorToAMDGPU.h - Vector to AMDGPU dialect conversion ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H +#define MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H + +#include "mlir/IR/PatternMatch.h" + +namespace mlir { +class RewritePatternSet; +class Pass; + +#define GEN_PASS_DECL_CONVERTVECTORTOAMDGPUPASS +#include "mlir/Conversion/Passes.h.inc" + +void populateVectorToAMDGPUConversionPatterns(RewritePatternSet &patterns); +} // namespace mlir + +#endif // MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index b6c21440c571c..1e4cbd2be4c96 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -66,6 +66,7 @@ add_subdirectory(TosaToSCF) add_subdirectory(TosaToTensor) add_subdirectory(UBToLLVM) add_subdirectory(UBToSPIRV) +add_subdirectory(VectorToAMDGPU) add_subdirectory(VectorToArmSME) add_subdirectory(VectorToGPU) add_subdirectory(VectorToLLVM) diff --git a/mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt new file mode 100644 index 0000000000000..2ad46c26d0a57 --- /dev/null +++ b/mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_conversion_library(MLIRVectorToAMDGPU + VectorToAMDGPU.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/VectorToAMDGPU + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRAMDGPUDialect + MLIRVectorDialect + MLIRPass + MLIRTransforms + ) diff --git a/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp b/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp new file mode 100644 index 0000000000000..248b84a7fdc98 --- /dev/null +++ b/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp @@ -0,0 +1,147 @@ +//===- VectorToAMDGPU.cpp - Vector to AMDGPU dialect conversion ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h" + +#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +#define GEN_PASS_DEF_CONVERTVECTORTOAMDGPUPASS +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +using namespace mlir; + +/// This pattern supports lowering of: +/// `vector.transfer_read` to a combination of `vector.load`, `arith.select` and +/// `vector.broadcast` if all of the following hold: +/// - The transfer op is masked. +/// - The memref is in buffer address space. +/// - Stride of most minor memref dimension must be 1. +/// - Out-of-bounds masking is not required. +/// - If the memref's element type is a vector type then it coincides with the +/// result type. +/// - The permutation map doesn't perform permutation (broadcasting is allowed). +/// Note: those conditions mostly come from TransferReadToVectorLoadLowering +/// pass. 
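+///
+/// As an illustration (hypothetical IR, not taken from the test suite below),
+/// a qualifying read such as
+///   %r = vector.transfer_read %mem[%i, %j], %pad, %mask {in_bounds = [true]}
+///     : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
+/// is rewritten into
+///   %fill = vector.splat %pad : vector<4xf32>
+///   %load = vector.load %mem[%i, %j]
+///     : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
+///   %r = arith.select %mask, %load, %fill : vector<4xi1>, vector<4xf32>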
+static LogicalResult
+transferPreconditions(PatternRewriter &rewriter,
+                      VectorTransferOpInterface xferOp,
+                      SmallVector<unsigned> &broadcastedDims,
+                      VectorType &unbroadcastedVectorType) {
+  if (!xferOp.getMask())
+    return rewriter.notifyMatchFailure(xferOp, "Only support masked transfer");
+
+  // Permutations are handled by VectorToSCF or
+  // populateVectorTransferPermutationMapLoweringPatterns.
+  // We let the 0-d corner case pass-through as it is supported.
+  if (!xferOp.getPermutationMap().isMinorIdentityWithBroadcasting(
+          &broadcastedDims))
+    return rewriter.notifyMatchFailure(xferOp, "not minor identity + bcast");
+
+  auto memRefType = dyn_cast<MemRefType>(xferOp.getShapedType());
+  if (!memRefType)
+    return rewriter.notifyMatchFailure(xferOp, "not a memref source");
+
+  Attribute addrSpace = memRefType.getMemorySpace();
+  if (!addrSpace ||
+      llvm::dyn_cast<amdgpu::AddressSpaceAttr>(addrSpace).getValue() !=
+          amdgpu::AddressSpace::FatRawBuffer)
+    return rewriter.notifyMatchFailure(xferOp, "not in buffer address space");
+
+  // Non-unit strides are handled by VectorToSCF.
+  if (!memRefType.isLastDimUnitStride())
+    return rewriter.notifyMatchFailure(xferOp, "!= 1 stride needs VectorToSCF");
+
+  // If there is broadcasting involved then we first load the unbroadcasted
+  // vector, and then broadcast it with `vector.broadcast`.
+  ArrayRef<int64_t> vectorShape = xferOp.getVectorType().getShape();
+  SmallVector<int64_t> unbroadcastedVectorShape(vectorShape);
+  for (unsigned i : broadcastedDims)
+    unbroadcastedVectorShape[i] = 1;
+  unbroadcastedVectorType = xferOp.getVectorType().cloneWith(
+      unbroadcastedVectorShape, xferOp.getVectorType().getElementType());
+
+  // `vector.load` supports vector types as memref's elements only when the
+  // resulting vector type is the same as the element type.
+  auto memrefElTy = memRefType.getElementType();
+  if (isa<VectorType>(memrefElTy) && memrefElTy != unbroadcastedVectorType)
+    return rewriter.notifyMatchFailure(xferOp, "incompatible element type");
+
+  // Otherwise, element types of the memref and the vector must match.
+  if (!isa<VectorType>(memrefElTy) &&
+      memrefElTy != xferOp.getVectorType().getElementType())
+    return rewriter.notifyMatchFailure(xferOp, "non-matching element type");
+
+  // Out-of-bounds dims are handled by MaterializeTransferMask.
+  if (xferOp.hasOutOfBoundsDim())
+    return rewriter.notifyMatchFailure(xferOp, "out-of-bounds needs mask");
+
+  if (xferOp.getVectorType().getRank() != 1)
+    // vector.maskedload operates on 1-D vectors.
+    return rewriter.notifyMatchFailure(
+        xferOp, "vector type is not rank 1, can't create masked load, needs "
+                "VectorToSCF");
+
+  return success();
+}
+
+struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
+  using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
+                                PatternRewriter &rewriter) const override {
+
+    SmallVector<unsigned> broadcastedDims;
+    VectorType unbroadcastedVectorType;
+    if (failed(transferPreconditions(rewriter, readOp, broadcastedDims,
+                                     unbroadcastedVectorType))) {
+      return failure();
+    }
+
+    Value fill = rewriter.create<vector::SplatOp>(
+        readOp.getLoc(), unbroadcastedVectorType, readOp.getPadding());
+    Value load = rewriter.create<vector::LoadOp>(
+        readOp.getLoc(), unbroadcastedVectorType, readOp.getSource(),
+        readOp.getIndices());
+    Value res = rewriter.create<arith::SelectOp>(
+        readOp.getLoc(), unbroadcastedVectorType, readOp.getMask(), load, fill);
+
+    // Insert a broadcasting op if required.
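+    // (For example, a read with permutation_map (d0, d1) -> (0) loads a
+    // vector<1xf32> and broadcasts it back to the requested vector<4xf32>.)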
+    if (!broadcastedDims.empty()) {
+      res = rewriter.create<vector::BroadcastOp>(readOp.getLoc(),
+                                                 readOp.getVectorType(), res);
+    }
+
+    rewriter.replaceOp(readOp, res);
+
+    return success();
+  }
+};
+
+void mlir::populateVectorToAMDGPUConversionPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<TransferReadLowering>(patterns.getContext());
+}
+
+struct ConvertVectorToAMDGPUPass
+    : public impl::ConvertVectorToAMDGPUPassBase<ConvertVectorToAMDGPUPass> {
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateVectorToAMDGPUConversionPatterns(patterns);
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
+      return signalPassFailure();
+  }
+};
diff --git a/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir b/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir
new file mode 100644
index 0000000000000..30d9814cc0621
--- /dev/null
+++ b/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir
@@ -0,0 +1,68 @@
+// RUN: mlir-opt %s -convert-vector-to-amdgpu --split-input-file | FileCheck %s
+
+// CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer(
+// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
+// CHECK-SAME: %[[ARG1:.*]]: index
+// CHECK-SAME: %[[ARG2:.*]]: vector<4xi1>
+func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %mask : vector<4xi1>) -> vector<4xf32> {
+  %cf0 = arith.constant 0.0 : f32
+  %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
+  return %res : vector<4xf32>
+}
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
+// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
+// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
+// CHECK: return %[[SELECT]] : vector<4xf32>
+
+// -----
+
+// CHECK-LABEL: func @transfer_to_maskedload_regular(
+// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32>
+// CHECK-SAME: %[[ARG1:.*]]: index
+// CHECK-SAME: %[[ARG2:.*]]: vector<4xi1>
+func.func @transfer_to_maskedload_regular(%mem : memref<8x8xf32>, %idx : index, %mask : vector<4xi1>) -> vector<4xf32> {
+  %cf0 = arith.constant 0.0 : f32
+  %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32>
+  return %res : vector<4xf32>
+}
+// CHECK: %[[CST:.*]] = arith.constant 0.0
+// CHECK: %[[RES:.*]] = vector.transfer_read %arg0[%arg1, %arg1], %[[CST]], %arg2 {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32>
+// CHECK: return %[[RES]] : vector<4xf32>
+
+// -----
+
+// CHECK-LABEL: func @transfer_broadcasting(
+// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
+// CHECK-SAME: %[[ARG1:.*]]: index
+// CHECK-SAME: %[[ARG2:.*]]: vector<1xi1>
+#broadcast_1d = affine_map<(d0, d1) -> (0)>
+func.func @transfer_broadcasting(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %mask : vector<1xi1>) -> vector<4xf32> {
+  %cf0 = arith.constant 0.0 : f32
+  %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask
+    {in_bounds = [true], permutation_map = #broadcast_1d}
+    : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
+  return %res : vector<4xf32>
+}
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
+// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
+// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
+// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[SELECT]] : vector<1xf32> to vector<4xf32>
+// CHECK: return %[[BROADCAST]] : vector<4xf32>
+
+// -----
+
+// CHECK-LABEL: func @transfer_scalar(
+// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
+// CHECK-SAME: %[[ARG1:.*]]: index
+// CHECK-SAME: %[[ARG2:.*]]: vector<1xi1>
+func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %mask : vector<1xi1>) -> vector<1xf32> {
+  %cf0 = arith.constant 0.0 : f32
+  %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask
+    {in_bounds = [true]}
+    : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
+  return %res : vector<1xf32>
+}
+// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
+// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
+// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
+// CHECK: return %[[SELECT]] : vector<1xf32>
From 4610b01665b363c00c14d8a3674e201dd214c435 Mon Sep 17 00:00:00 2001
From: jerryyin
Date: Tue, 18 Mar 2025 21:00:53 +0000
Subject: [PATCH 2/5] Addressing review feedbacks

---
 .../VectorToAMDGPU/VectorToAMDGPU.cpp         | 48 +++++++++----------
 .../vector-transfer-read-to-vector-load.mlir  | 17 ++++---
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp b/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp
index 248b84a7fdc98..569e8f8bb4e3a 100644
--- a/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp
+++ b/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp
@@ -15,7 +15,7 @@
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LogicalResult.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/WalkPatternRewriteDriver.h"
 
 namespace mlir {
 #define GEN_PASS_DEF_CONVERTVECTORTOAMDGPUPASS
@@ -36,17 +36,16 @@ using namespace mlir;
 /// - The permutation map doesn't perform permutation (broadcasting is allowed).
 /// Note: those conditions mostly come from TransferReadToVectorLoadLowering
 /// pass.
-static LogicalResult
-transferPreconditions(PatternRewriter &rewriter,
-                      VectorTransferOpInterface xferOp,
-                      SmallVector<unsigned> &broadcastedDims,
-                      VectorType &unbroadcastedVectorType) {
+static LogicalResult transferPreconditions(
+    PatternRewriter &rewriter, VectorTransferOpInterface xferOp,
+    bool &requiresBroadcasting, VectorType &unbroadcastedVectorType) {
   if (!xferOp.getMask())
     return rewriter.notifyMatchFailure(xferOp, "Only support masked transfer");
 
   // Permutations are handled by VectorToSCF or
   // populateVectorTransferPermutationMapLoweringPatterns.
   // We let the 0-d corner case pass-through as it is supported.
+  SmallVector<unsigned> broadcastedDims;
   if (!xferOp.getPermutationMap().isMinorIdentityWithBroadcasting(
           &broadcastedDims))
     return rewriter.notifyMatchFailure(xferOp, "not minor identity + bcast");
@@ -56,9 +55,8 @@ transferPreconditions(PatternRewriter &rewriter,
     return rewriter.notifyMatchFailure(xferOp, "not a memref source");
 
   Attribute addrSpace = memRefType.getMemorySpace();
-  if (!addrSpace ||
-      llvm::dyn_cast<amdgpu::AddressSpaceAttr>(addrSpace).getValue() !=
-          amdgpu::AddressSpace::FatRawBuffer)
+  if (!addrSpace || dyn_cast<amdgpu::AddressSpaceAttr>(addrSpace).getValue() !=
+                        amdgpu::AddressSpace::FatRawBuffer)
     return rewriter.notifyMatchFailure(xferOp, "not in buffer address space");
 
   // Non-unit strides are handled by VectorToSCF.
@@ -73,6 +71,7 @@ transferPreconditions(PatternRewriter &rewriter,
     unbroadcastedVectorShape[i] = 1;
   unbroadcastedVectorType = xferOp.getVectorType().cloneWith(
       unbroadcastedVectorShape, xferOp.getVectorType().getElementType());
+  requiresBroadcasting = !broadcastedDims.empty();
 
   // `vector.load` supports vector types as memref's elements only when the
   // resulting vector type is the same as the element type.
@@ -98,31 +97,31 @@ transferPreconditions(PatternRewriter &rewriter,
   return success();
 }
 
-struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
-  using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
+struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
+  using OpRewritePattern::OpRewritePattern;
 
   LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
                                 PatternRewriter &rewriter) const override {
 
-    SmallVector<unsigned> broadcastedDims;
+    bool requiresBroadcasting = false;
     VectorType unbroadcastedVectorType;
-    if (failed(transferPreconditions(rewriter, readOp, broadcastedDims,
+    if (failed(transferPreconditions(rewriter, readOp, requiresBroadcasting,
                                      unbroadcastedVectorType))) {
       return failure();
     }
 
-    Value fill = rewriter.create<vector::SplatOp>(
-        readOp.getLoc(), unbroadcastedVectorType, readOp.getPadding());
+    Location loc = readOp.getLoc();
+    Value fill = rewriter.create<vector::SplatOp>(loc, unbroadcastedVectorType,
+                                                  readOp.getPadding());
     Value load = rewriter.create<vector::LoadOp>(
-        readOp.getLoc(), unbroadcastedVectorType, readOp.getSource(),
-        readOp.getIndices());
-    Value res = rewriter.create<arith::SelectOp>(
-        readOp.getLoc(), unbroadcastedVectorType, readOp.getMask(), load, fill);
+        loc, unbroadcastedVectorType, readOp.getSource(), readOp.getIndices());
+    Value res = rewriter.create<arith::SelectOp>(loc, unbroadcastedVectorType,
+                                                 readOp.getMask(), load, fill);
 
     // Insert a broadcasting op if required.
-    if (!broadcastedDims.empty()) {
-      res = rewriter.create<vector::BroadcastOp>(readOp.getLoc(),
-                                                 readOp.getVectorType(), res);
+    if (requiresBroadcasting) {
+      res = rewriter.create<vector::BroadcastOp>(loc, readOp.getVectorType(),
+                                                 res);
     }
 
     rewriter.replaceOp(readOp, res);
@@ -136,12 +135,11 @@ void mlir::populateVectorToAMDGPUConversionPatterns(
   patterns.add<TransferReadLowering>(patterns.getContext());
 }
 
-struct ConvertVectorToAMDGPUPass
+struct ConvertVectorToAMDGPUPass final
     : public impl::ConvertVectorToAMDGPUPassBase<ConvertVectorToAMDGPUPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populateVectorToAMDGPUConversionPatterns(patterns);
-    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
-      return signalPassFailure();
+    walkAndApplyPatterns(getOperation(), std::move(patterns));
   }
 };
diff --git a/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir b/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir
index 30d9814cc0621..d0a79045c86da 100644
--- a/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir
+++ b/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -convert-vector-to-amdgpu --split-input-file | FileCheck %s
+// RUN: mlir-opt %s --convert-vector-to-amdgpu --split-input-file | FileCheck %s
 
 // CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer(
 // CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
@@ -9,9 +9,10 @@ func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.ad
   %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
   return %res : vector<4xf32>
 }
-// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
+// CHECK: %[[CST:.*]] = arith.constant 0.0
+// CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
 // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
-// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
+// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
 // CHECK: return %[[SELECT]] : vector<4xf32>
 
 // -----
 
@@ -43,9 +44,10 @@ func.func @transfer_broadcasting(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>,
     : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
   return %res : vector<4xf32>
 }
-// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
+// CHECK: %[[CST:.*]] = arith.constant 0.0
+// CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
 // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
-// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
+// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
 // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[SELECT]] : vector<1xf32> to vector<4xf32>
 // CHECK: return %[[BROADCAST]] : vector<4xf32>
 
@@ -62,7 +64,8 @@ func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>,
     : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
   return %res : vector<1xf32>
 }
-// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
+// CHECK: %[[CST:.*]] = arith.constant 0.0
+// CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
 // CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
-// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
+// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
 // CHECK: return %[[SELECT]] : vector<1xf32>
From f7ca23b6d3dd559bac386dcb65cde1dfe7ee1d71 Mon Sep 17 00:00:00 2001
From: jerryyin
Date: Wed, 19 Mar 2025 13:29:31 +0000
Subject: [PATCH 3/5] Addressing review feedbacks

---
 mlir/include/mlir/Conversion/Passes.td                | 2 +-
 mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 1845d0235183e..d8f00c7109a3d 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -1340,7 +1340,7 @@ def ConvertVectorToArmSMEPass : Pass<"convert-vector-to-arm-sme"> {
 def ConvertVectorToAMDGPUPass : Pass<"convert-vector-to-amdgpu"> {
   let summary = "Lower the operations from the vector dialect into the AMDGPU "
                 "dialect";
-  let dependentDialects = ["amdgpu::AMDGPUDialect", "vector::VectorDialect"];
+  let dependentDialects = ["vector::VectorDialect"];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp b/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp
index 569e8f8bb4e3a..b923546d7aaa5 100644
--- a/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp
+++ b/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp
@@ -97,6 +97,8 @@ static LogicalResult transferPreconditions(
   return success();
 }
 
+namespace {
+
 struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
   using OpRewritePattern::OpRewritePattern;
 
@@ -130,13 +132,15 @@ struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
   }
 };
 
+} // namespace
+
 void mlir::populateVectorToAMDGPUConversionPatterns(
     RewritePatternSet &patterns) {
   patterns.add<TransferReadLowering>(patterns.getContext());
 }
 
 struct ConvertVectorToAMDGPUPass final
-    : public impl::ConvertVectorToAMDGPUPassBase<ConvertVectorToAMDGPUPass> {
+    : impl::ConvertVectorToAMDGPUPassBase<ConvertVectorToAMDGPUPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
     populateVectorToAMDGPUConversionPatterns(patterns);
From 2c292f8958649170c00709211f017800453318f2 Mon Sep 17 00:00:00 2001
From: jerryyin
Date:
Wed, 19 Mar 2025 19:30:07 +0000 Subject: [PATCH 4/5] Adding bazel target for VectorToAMDGPU --- .../llvm-project-overlay/mlir/BUILD.bazel | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 95e2788906525..978779ea2bfb8 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4466,6 +4466,7 @@ cc_library( ":TosaToTensor", ":UBToLLVM", ":UBToSPIRV", + ":VectorToAMDGPU", ":VectorToArmSME", ":VectorToGPU", ":VectorToLLVM", @@ -12189,6 +12190,27 @@ cc_library( ], ) +cc_library( + name = "VectorToAMDGPU", + srcs = glob([ + "lib/Conversion/VectorToAMDGPU/*.cpp", + ]), + hdrs = glob([ + "include/mlir/Conversion/VectorToAMDGPU/*.h", + ]), + includes = ["include"], + deps = [ + ":AMDGPUDialect", + ":VectorDialect", + ":ConversionPassIncGen", + ":IR", + ":MemRefDialect", + ":Pass", + ":TransformUtils", + "//llvm:Support", + ], +) + cc_library( name = "VectorToArmSME", srcs = glob([ From 01beca8b3e7fa6cba849ec12b232050c05809585 Mon Sep 17 00:00:00 2001 From: jerryyin Date: Thu, 20 Mar 2025 21:30:10 +0000 Subject: [PATCH 5/5] Amend from conversion to dialect rewrite pattern --- mlir/include/mlir/Conversion/Passes.h | 1 - mlir/include/mlir/Conversion/Passes.td | 10 ------- .../VectorToAMDGPU/VectorToAMDGPU.h | 24 --------------- .../mlir/Dialect/AMDGPU/Transforms/Passes.h | 4 +++ .../mlir/Dialect/AMDGPU/Transforms/Passes.td | 14 +++++++++ mlir/lib/Conversion/CMakeLists.txt | 1 - .../Conversion/VectorToAMDGPU/CMakeLists.txt | 18 ------------ .../Dialect/AMDGPU/Transforms/CMakeLists.txt | 1 + .../AMDGPU/Transforms/TransferReadToLoad.cpp} | 29 +++++++++++-------- .../AMDGPU/transfer-read-to-load.mlir} | 17 ++++++++++- .../llvm-project-overlay/mlir/BUILD.bazel | 22 -------------- 11 files changed, 52 insertions(+), 89 deletions(-) delete mode 100644 mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h delete mode 100644 mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt rename mlir/lib/{Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp => Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp} (87%) rename mlir/test/{Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir => Dialect/AMDGPU/transfer-read-to-load.mlir} (78%) diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index ed5e8de8787f7..ccd862f67c068 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -73,7 +73,6 @@ #include "mlir/Conversion/TosaToTensor/TosaToTensor.h" #include "mlir/Conversion/UBToLLVM/UBToLLVM.h" #include "mlir/Conversion/UBToSPIRV/UBToSPIRV.h" -#include "mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h" #include "mlir/Conversion/VectorToArmSME/VectorToArmSME.h" #include "mlir/Conversion/VectorToGPU/VectorToGPU.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index d8f00c7109a3d..bbba495e613b2 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -1333,16 +1333,6 @@ def ConvertVectorToArmSMEPass : Pass<"convert-vector-to-arm-sme"> { let dependentDialects = ["arm_sme::ArmSMEDialect", "arm_sve::ArmSVEDialect"]; } -//===----------------------------------------------------------------------===// -// VectorToAMDGPU -//===----------------------------------------------------------------------===// - 
-def ConvertVectorToAMDGPUPass : Pass<"convert-vector-to-amdgpu"> {
-  let summary = "Lower the operations from the vector dialect into the AMDGPU "
-                "dialect";
-  let dependentDialects = ["vector::VectorDialect"];
-}
-
 //===----------------------------------------------------------------------===//
 // ArmSMEToSCF
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h b/mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h
deleted file mode 100644
index be96061a23b08..0000000000000
--- a/mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- VectorToAMDGPU.h - Vector to AMDGPU dialect conversion ---*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H
-#define MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H
-
-#include "mlir/IR/PatternMatch.h"
-
-namespace mlir {
-class RewritePatternSet;
-class Pass;
-
-#define GEN_PASS_DECL_CONVERTVECTORTOAMDGPUPASS
-#include "mlir/Conversion/Passes.h.inc"
-
-void populateVectorToAMDGPUConversionPatterns(RewritePatternSet &patterns);
-} // namespace mlir
-
-#endif // MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index c3ae7930e8ec8..94dd9e3a29331 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -22,6 +22,7 @@ namespace amdgpu {
 
 #define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
 #define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
+#define GEN_PASS_DECL_AMDGPUTRANSFERREADTOLOADPASS
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
 
@@ -30,6 +31,9 @@ void populateAmdgpuEmulateAtomicsPatterns(ConversionTarget &target,
                                           Chipset chipset);
 
 void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns);
+
+void populateAmdgpuTransferReadToLoadPatterns(RewritePatternSet &patterns);
+
 } // namespace amdgpu
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 6d0bcd6e1066e..761caa448a57c 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -51,4 +51,18 @@ def AmdgpuResolveStridedMetadataPass : Pass<"amdgpu-resolve-strided-metadata"> {
   ];
 }
 
+def AmdgpuTransferReadToLoadPass : Pass<"amdgpu-transfer-read-to-load"> {
+  let summary = "Lower the operations from the vector transfer_read to vector load";
+  let description = [{
+    This pass creates a transfer read op lowering. A vector transfer read op
+    will be lowered to a combination of vector.load, arith.select and
+    vector.broadcast.
+
+    This pattern will make it possible for masked transfer_read to be lowered
+    towards buffer load with bounds check, allowing a more optimized global
+    load access pattern compared with the existing implementation of
+    llvm.intr.masked.load on vectors.
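+
+    For example (an illustrative sketch, not one of the regression tests), the
+    masked read
+
+      %r = vector.transfer_read %mem[%i], %pad, %mask {in_bounds = [true]}
+        : memref<64xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
+
+    becomes a vector.load of %mem plus an arith.select between the loaded
+    value and a vector.splat of the padding value %pad.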
+ }]; + let dependentDialects = []; +} #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 1e4cbd2be4c96..b6c21440c571c 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -66,7 +66,6 @@ add_subdirectory(TosaToSCF) add_subdirectory(TosaToTensor) add_subdirectory(UBToLLVM) add_subdirectory(UBToSPIRV) -add_subdirectory(VectorToAMDGPU) add_subdirectory(VectorToArmSME) add_subdirectory(VectorToGPU) add_subdirectory(VectorToLLVM) diff --git a/mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt deleted file mode 100644 index 2ad46c26d0a57..0000000000000 --- a/mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -add_mlir_conversion_library(MLIRVectorToAMDGPU - VectorToAMDGPU.cpp - - ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/VectorToAMDGPU - - DEPENDS - MLIRConversionPassIncGen - - LINK_COMPONENTS - Core - - LINK_LIBS PUBLIC - MLIRAMDGPUDialect - MLIRVectorDialect - MLIRPass - MLIRTransforms - ) diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt index 3d4567bff1e32..bc5b6e9186449 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -1,6 +1,7 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms EmulateAtomics.cpp ResolveStridedMetadata.cpp + TransferReadToLoad.cpp ADDITIONAL_HEADER_DIRS {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms diff --git a/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp similarity index 87% rename from mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp rename to mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp index b923546d7aaa5..3c1a2eb962037 100644 --- a/mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/TransferReadToLoad.cpp @@ -1,4 +1,4 @@ -//===- VectorToAMDGPU.cpp - Vector to AMDGPU dialect conversion ---------===// +//===- TransferReadToLoad.cpp - Lowers masked transfer read to load -------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
 
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -17,12 +17,13 @@
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/WalkPatternRewriteDriver.h"
 
-namespace mlir {
-#define GEN_PASS_DEF_CONVERTVECTORTOAMDGPUPASS
-#include "mlir/Conversion/Passes.h.inc"
-} // namespace mlir
+namespace mlir::amdgpu {
+#define GEN_PASS_DEF_AMDGPUTRANSFERREADTOLOADPASS
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+} // namespace mlir::amdgpu
 
 using namespace mlir;
+using namespace mlir::amdgpu;
 
 /// This pattern supports lowering of:
 /// `vector.transfer_read` to a combination of `vector.load`, `arith.select` and
@@ -55,8 +56,11 @@ static LogicalResult transferPreconditions(
     return rewriter.notifyMatchFailure(xferOp, "not a memref source");
 
   Attribute addrSpace = memRefType.getMemorySpace();
-  if (!addrSpace || dyn_cast<amdgpu::AddressSpaceAttr>(addrSpace).getValue() !=
-                        amdgpu::AddressSpace::FatRawBuffer)
+  if (!addrSpace || !dyn_cast<amdgpu::AddressSpaceAttr>(addrSpace))
+    return rewriter.notifyMatchFailure(xferOp, "no address space");
+
+  if (dyn_cast<amdgpu::AddressSpaceAttr>(addrSpace).getValue() !=
+      amdgpu::AddressSpace::FatRawBuffer)
     return rewriter.notifyMatchFailure(xferOp, "not in buffer address space");
 
   // Non-unit strides are handled by VectorToSCF.
@@ -134,16 +138,17 @@ struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
 
 } // namespace
 
-void mlir::populateVectorToAMDGPUConversionPatterns(
+void mlir::amdgpu::populateAmdgpuTransferReadToLoadPatterns(
     RewritePatternSet &patterns) {
   patterns.add<TransferReadLowering>(patterns.getContext());
 }
 
-struct ConvertVectorToAMDGPUPass final
-    : impl::ConvertVectorToAMDGPUPassBase<ConvertVectorToAMDGPUPass> {
+struct AmdgpuTransferReadToLoadPass final
+    : amdgpu::impl::AmdgpuTransferReadToLoadPassBase<
+          AmdgpuTransferReadToLoadPass> {
   void runOnOperation() override {
     RewritePatternSet patterns(&getContext());
-    populateVectorToAMDGPUConversionPatterns(patterns);
+    populateAmdgpuTransferReadToLoadPatterns(patterns);
     walkAndApplyPatterns(getOperation(), std::move(patterns));
   }
 };
diff --git a/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir b/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
similarity index 78%
rename from mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir
rename to mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
index d0a79045c86da..3e1283579f2b1 100644
--- a/mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir
+++ b/mlir/test/Dialect/AMDGPU/transfer-read-to-load.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --convert-vector-to-amdgpu --split-input-file | FileCheck %s
+// RUN: mlir-opt %s --amdgpu-transfer-read-to-load --split-input-file | FileCheck %s
 
 // CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer(
 // CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
@@ -32,6 +32,21 @@ func.func @transfer_to_maskedload_regular(%mem : memref<8x8xf32>, %idx : index,
 
 // -----
 
+// CHECK-LABEL: func @transfer_to_maskedload_addrspace(
+// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #gpu.address_space<workgroup>>
+// CHECK-SAME: %[[ARG1:.*]]: index
+// CHECK-SAME: %[[ARG2:.*]]: vector<4xi1>
+func.func @transfer_to_maskedload_addrspace(%mem : memref<8x8xf32, #gpu.address_space<workgroup>>, %idx : index, %mask : vector<4xi1>) -> vector<4xf32> {
+  %cf0 = arith.constant 0.0 : f32
+  %res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #gpu.address_space<workgroup>>, vector<4xf32>
+  return %res : vector<4xf32>
+}
+// CHECK: %[[CST:.*]] = arith.constant 0.0
+// CHECK: %[[RES:.*]] = vector.transfer_read %arg0[%arg1, %arg1], %[[CST]], %arg2 {in_bounds = [true]} : memref<8x8xf32, #gpu.address_space<workgroup>>, vector<4xf32>
+// CHECK: return %[[RES]] : vector<4xf32>
+
+// -----
+
 // CHECK-LABEL: func @transfer_broadcasting(
 // CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
 // CHECK-SAME: %[[ARG1:.*]]: index
 // CHECK-SAME: %[[ARG2:.*]]: vector<1xi1>
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 978779ea2bfb8..95e2788906525 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4466,7 +4466,6 @@ cc_library(
         ":TosaToTensor",
         ":UBToLLVM",
         ":UBToSPIRV",
-        ":VectorToAMDGPU",
         ":VectorToArmSME",
         ":VectorToGPU",
         ":VectorToLLVM",
@@ -12190,27 +12189,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "VectorToAMDGPU",
-    srcs = glob([
-        "lib/Conversion/VectorToAMDGPU/*.cpp",
-    ]),
-    hdrs = glob([
-        "include/mlir/Conversion/VectorToAMDGPU/*.h",
-    ]),
-    includes = ["include"],
-    deps = [
-        ":AMDGPUDialect",
-        ":VectorDialect",
-        ":ConversionPassIncGen",
-        ":IR",
-        ":MemRefDialect",
-        ":Pass",
-        ":TransformUtils",
-        "//llvm:Support",
-    ],
-)
-
 cc_library(
     name = "VectorToArmSME",
     srcs = glob([