1 change: 1 addition & 0 deletions mlir/include/mlir/Conversion/Passes.h
@@ -73,6 +73,7 @@
#include "mlir/Conversion/TosaToTensor/TosaToTensor.h"
#include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
#include "mlir/Conversion/UBToSPIRV/UBToSPIRV.h"
#include "mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h"
#include "mlir/Conversion/VectorToArmSME/VectorToArmSME.h"
#include "mlir/Conversion/VectorToGPU/VectorToGPU.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
10 changes: 10 additions & 0 deletions mlir/include/mlir/Conversion/Passes.td
@@ -1333,6 +1333,16 @@ def ConvertVectorToArmSMEPass : Pass<"convert-vector-to-arm-sme"> {
let dependentDialects = ["arm_sme::ArmSMEDialect", "arm_sve::ArmSVEDialect"];
}

//===----------------------------------------------------------------------===//
// VectorToAMDGPU
//===----------------------------------------------------------------------===//

def ConvertVectorToAMDGPUPass : Pass<"convert-vector-to-amdgpu"> {
let summary = "Lower the operations from the vector dialect into the AMDGPU "
"dialect";
let dependentDialects = ["amdgpu::AMDGPUDialect", "vector::VectorDialect"];
}

//===----------------------------------------------------------------------===//
// ArmSMEToSCF
//===----------------------------------------------------------------------===//
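With this registration in place, the pass is exposed to mlir-opt as -convert-vector-to-amdgpu; the RUN line of the new test file at the end of this patch drives it exactly that way (mlir-opt -convert-vector-to-amdgpu --split-input-file).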
24 changes: 24 additions & 0 deletions mlir/include/mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h
@@ -0,0 +1,24 @@
//===- VectorToAMDGPU.h - Vector to AMDGPU dialect conversion ---*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H
#define MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H

#include "mlir/IR/PatternMatch.h"

namespace mlir {
class RewritePatternSet;
class Pass;

#define GEN_PASS_DECL_CONVERTVECTORTOAMDGPUPASS
#include "mlir/Conversion/Passes.h.inc"

/// Collect the set of patterns that rewrite vector-dialect transfers into
/// forms that map well to AMDGPU buffer accesses (currently, masked
/// vector.transfer_read over fat_raw_buffer memrefs to vector.load).
void populateVectorToAMDGPUConversionPatterns(RewritePatternSet &patterns);
} // namespace mlir

#endif // MLIR_CONVERSION_VECTORTOAMDGPU_VECTORTOAMDGPU_H
1 change: 1 addition & 0 deletions mlir/lib/Conversion/CMakeLists.txt
@@ -66,6 +66,7 @@ add_subdirectory(TosaToSCF)
add_subdirectory(TosaToTensor)
add_subdirectory(UBToLLVM)
add_subdirectory(UBToSPIRV)
add_subdirectory(VectorToAMDGPU)
add_subdirectory(VectorToArmSME)
add_subdirectory(VectorToGPU)
add_subdirectory(VectorToLLVM)
18 changes: 18 additions & 0 deletions mlir/lib/Conversion/VectorToAMDGPU/CMakeLists.txt
@@ -0,0 +1,18 @@
add_mlir_conversion_library(MLIRVectorToAMDGPU
VectorToAMDGPU.cpp

ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/VectorToAMDGPU

DEPENDS
MLIRConversionPassIncGen

LINK_COMPONENTS
Core

LINK_LIBS PUBLIC
MLIRAMDGPUDialect
MLIRVectorDialect
MLIRPass
MLIRTransforms
)
147 changes: 147 additions & 0 deletions mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp
@@ -0,0 +1,147 @@
//===- VectorToAMDGPU.cpp - Vector to AMDGPU dialect conversion ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "mlir/Conversion/VectorToAMDGPU/VectorToAMDGPU.h"

#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

namespace mlir {
#define GEN_PASS_DEF_CONVERTVECTORTOAMDGPUPASS
#include "mlir/Conversion/Passes.h.inc"
} // namespace mlir

using namespace mlir;

/// This pattern lowers `vector.transfer_read` to a combination of
/// `vector.load`, `arith.select` and `vector.broadcast` if all of the
/// following hold:
/// - The transfer op is masked.
/// - The memref is in the buffer (fat_raw_buffer) address space.
/// - The stride of the most minor memref dimension is 1.
/// - Out-of-bounds masking is not required.
/// - If the memref's element type is a vector type, it coincides with the
/// result type.
/// - The permutation map performs no permutation (broadcasting is allowed).
/// Note: these conditions mostly come from the
/// TransferReadToVectorLoadLowering pattern. (A before/after IR sketch
/// follows this file.)
static LogicalResult
transferPreconditions(PatternRewriter &rewriter,
VectorTransferOpInterface xferOp,
SmallVector<unsigned> &broadcastedDims,
VectorType &unbroadcastedVectorType) {
if (!xferOp.getMask())
return rewriter.notifyMatchFailure(xferOp, "Only support masked transfer");

// Permutations are handled by VectorToSCF or
// populateVectorTransferPermutationMapLoweringPatterns.
// We let the 0-d corner case pass-through as it is supported.
if (!xferOp.getPermutationMap().isMinorIdentityWithBroadcasting(
&broadcastedDims))
return rewriter.notifyMatchFailure(xferOp, "not minor identity + bcast");

auto memRefType = dyn_cast<MemRefType>(xferOp.getShapedType());
if (!memRefType)
return rewriter.notifyMatchFailure(xferOp, "not a memref source");

// dyn_cast_if_present covers both a missing memory space and a memory space
// attribute of a different kind without asserting in getValue().
auto addrSpace = llvm::dyn_cast_if_present<amdgpu::AddressSpaceAttr>(
memRefType.getMemorySpace());
if (!addrSpace || addrSpace.getValue() != amdgpu::AddressSpace::FatRawBuffer)
return rewriter.notifyMatchFailure(xferOp, "not in buffer address space");

// Non-unit strides are handled by VectorToSCF.
if (!memRefType.isLastDimUnitStride())
return rewriter.notifyMatchFailure(xferOp, "!= 1 stride needs VectorToSCF");

// If there is broadcasting involved then we first load the unbroadcasted
// vector, and then broadcast it with `vector.broadcast`.
ArrayRef<int64_t> vectorShape = xferOp.getVectorType().getShape();
SmallVector<int64_t> unbroadcastedVectorShape(vectorShape);
for (unsigned i : broadcastedDims)
unbroadcastedVectorShape[i] = 1;
unbroadcastedVectorType = xferOp.getVectorType().cloneWith(
unbroadcastedVectorShape, xferOp.getVectorType().getElementType());

// `vector.load` supports vector types as memref's elements only when the
// resulting vector type is the same as the element type.
auto memrefElTy = memRefType.getElementType();
if (isa<VectorType>(memrefElTy) && memrefElTy != unbroadcastedVectorType)
return rewriter.notifyMatchFailure(xferOp, "incompatible element type");

// Otherwise, element types of the memref and the vector must match.
if (!isa<VectorType>(memrefElTy) &&
memrefElTy != xferOp.getVectorType().getElementType())
return rewriter.notifyMatchFailure(xferOp, "non-matching element type");

// Out-of-bounds dims are handled by MaterializeTransferMask.
if (xferOp.hasOutOfBoundsDim())
return rewriter.notifyMatchFailure(xferOp, "out-of-bounds needs mask");

// Only rank-1 vectors are supported: the rewrite emits a single vector.load
// whose result is masked via arith.select over a 1-D mask.
if (xferOp.getVectorType().getRank() != 1)
return rewriter.notifyMatchFailure(
xferOp, "vector type is not rank 1, can't create masked load, needs "
"VectorToSCF");

return success();
}

namespace {
struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;

LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
PatternRewriter &rewriter) const override {
SmallVector<unsigned> broadcastedDims;
VectorType unbroadcastedVectorType;
if (failed(transferPreconditions(rewriter, readOp, broadcastedDims,
unbroadcastedVectorType))) {
return failure();
}

Value fill = rewriter.create<vector::SplatOp>(
readOp.getLoc(), unbroadcastedVectorType, readOp.getPadding());
Value load = rewriter.create<vector::LoadOp>(
readOp.getLoc(), unbroadcastedVectorType, readOp.getSource(),
readOp.getIndices());
Value res = rewriter.create<arith::SelectOp>(
readOp.getLoc(), unbroadcastedVectorType, readOp.getMask(), load, fill);

// Insert a broadcasting op if required.
if (!broadcastedDims.empty()) {
res = rewriter.create<vector::BroadcastOp>(readOp.getLoc(),
readOp.getVectorType(), res);
}

rewriter.replaceOp(readOp, res);

return success();
}
};
} // namespace

void mlir::populateVectorToAMDGPUConversionPatterns(
RewritePatternSet &patterns) {
patterns.add<TransferReadLowering>(patterns.getContext());
}

namespace {
struct ConvertVectorToAMDGPUPass
: public impl::ConvertVectorToAMDGPUPassBase<ConvertVectorToAMDGPUPass> {
void runOnOperation() override {
RewritePatternSet patterns(&getContext());
populateVectorToAMDGPUConversionPatterns(patterns);
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
return signalPassFailure();
}
};
} // namespace
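Net effect of the pattern: a qualifying masked vector.transfer_read from a fat_raw_buffer memref becomes an unmasked vector.load whose result is selected against a splat of the padding value. Below is a minimal before/after sketch, adapted from the first test case; the value names are illustrative, and the greedy driver subsequently folds the vector.splat of the constant padding into the arith.constant dense<0.000000e+00> that the CHECK lines match.

// Before the rewrite:
%res = vector.transfer_read %mem[%i, %i], %cf0, %mask {in_bounds = [true]}
: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>

// After the rewrite (raw pattern output, before folding):
%fill = vector.splat %cf0 : vector<4xf32>
%load = vector.load %mem[%i, %i]
: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
%res = arith.select %mask, %load, %fill : vector<4xi1>, vector<4xf32>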
@@ -0,0 +1,68 @@
// RUN: mlir-opt %s -convert-vector-to-amdgpu --split-input-file | FileCheck %s

// CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer(
// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-SAME: %[[ARG1:.*]]: index
// CHECK-SAME: %[[ARG2:.*]]: vector<4xi1>
func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %mask : vector<4xi1>) -> vector<4xf32> {
%cf0 = arith.constant 0.0 : f32
%res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
return %res : vector<4xf32>
}
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
// CHECK: return %[[SELECT]] : vector<4xf32>

// -----

// CHECK-LABEL: func @transfer_to_maskedload_regular(
// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32>
// CHECK-SAME: %[[ARG1:.*]]: index
// CHECK-SAME: %[[ARG2:.*]]: vector<4xi1>
func.func @transfer_to_maskedload_regular(%mem : memref<8x8xf32>, %idx : index, %mask : vector<4xi1>) -> vector<4xf32> {
%cf0 = arith.constant 0.0 : f32
%res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32>
return %res : vector<4xf32>
}
// CHECK: %[[CST:.*]] = arith.constant 0.0
// CHECK: %[[RES:.*]] = vector.transfer_read %arg0[%arg1, %arg1], %[[CST]], %arg2 {in_bounds = [true]} : memref<8x8xf32>, vector<4xf32>
// CHECK: return %[[RES]] : vector<4xf32>

// -----

// CHECK-LABEL: func @transfer_broadcasting(
// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-SAME: %[[ARG1:.*]]: index
// CHECK-SAME: %[[ARG2:.*]]: vector<1xi1>
#broadcast_1d = affine_map<(d0, d1) -> (0)>
func.func @transfer_broadcasting(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %mask : vector<1xi1>) -> vector<4xf32> {
%cf0 = arith.constant 0.0 : f32
%res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask
{in_bounds = [true], permutation_map = #broadcast_1d}
: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
return %res : vector<4xf32>
}
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[SELECT]] : vector<1xf32> to vector<4xf32>
// CHECK: return %[[BROADCAST]] : vector<4xf32>

// -----

// CHECK-LABEL: func @transfer_scalar(
// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
// CHECK-SAME: %[[ARG1:.*]]: index
// CHECK-SAME: %[[ARG2:.*]]: vector<1xi1>
func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, %idx : index, %mask : vector<1xi1>) -> vector<1xf32> {
%cf0 = arith.constant 0.0 : f32
%res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask
{in_bounds = [true]}
: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
return %res : vector<1xf32>
}
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
// CHECK: return %[[SELECT]] : vector<1xf32>