Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 232 additions & 10 deletions llvm/lib/CodeGen/ExpandReductions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,184 @@
//===----------------------------------------------------------------------===//

#include "llvm/CodeGen/ExpandReductions.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <optional>

using namespace llvm;

namespace {

bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
bool Changed = false;
/// Expand a reduction on a scalable vector into a loop
/// that iterates over one element after the other.
Value *expandScalableReduction(IRBuilderBase &Builder, IntrinsicInst *II,
Value *Acc, Value *Vec,
Instruction::BinaryOps BinOp,
DomTreeUpdater &DTU) {
ScalableVectorType *VecTy = cast<ScalableVectorType>(Vec->getType());

// Split the original BB in two and create a new BB between them,
// which will be a loop.
BasicBlock *BeforeBB = II->getParent();
BasicBlock *AfterBB = SplitBlock(BeforeBB, II, &DTU);
BasicBlock *LoopBB = BasicBlock::Create(Builder.getContext(), "rdx.loop",
BeforeBB->getParent(), AfterBB);
BeforeBB->getTerminator()->setSuccessor(0, LoopBB);

// Calculate the number of elements in the vector:
Builder.SetInsertPoint(BeforeBB->getTerminator());
Value *NumElts =
Builder.CreateVScale(Builder.getInt64(VecTy->getMinNumElements()));

// Create two PHIs, one for the index of the current lane and one for
// the reduction.
Builder.SetInsertPoint(LoopBB);
PHINode *IV = Builder.CreatePHI(Builder.getInt64Ty(), 2, "index");
IV->addIncoming(Builder.getInt64(0), BeforeBB);
PHINode *RdxPhi = Builder.CreatePHI(VecTy->getScalarType(), 2, "rdx.phi");
RdxPhi->addIncoming(Acc, BeforeBB);

Value *IVInc =
Builder.CreateAdd(IV, Builder.getInt64(1), "index.next", true, true);
IV->addIncoming(IVInc, LoopBB);

// Extract the value at the current lane from the vector and perform
// the scalar reduction binop:
Value *Lane = Builder.CreateExtractElement(Vec, IV, "elm");
Value *Rdx = Builder.CreateBinOp(BinOp, RdxPhi, Lane, "rdx");
RdxPhi->addIncoming(Rdx, LoopBB);

// Exit when all lanes have been treated (assuming there will be at least
// one element in the vector):
Value *Done = Builder.CreateCmp(CmpInst::ICMP_EQ, IVInc, NumElts, "exitcond");
Builder.CreateCondBr(Done, AfterBB, LoopBB);

DTU.applyUpdates({{DominatorTree::Insert, BeforeBB, LoopBB},
{DominatorTree::Insert, LoopBB, AfterBB},
{DominatorTree::Delete, BeforeBB, AfterBB}});
return Rdx;
}

/// Expand a reduction on a scalable vector in a parallel-tree like
/// manner, meaning halving the number of elements to treat in every
/// iteration. Only valid when the reduction may be reassociated.
///
/// \param II               the reduction intrinsic being expanded.
/// \param Acc              optional initial accumulator; folded in once at
///                         the end unless it is the operation's neutral
///                         element.
/// \param BinOp            the (vector) binary opcode combining the halves.
/// \param IsNeutralElement predicate telling whether a constant accumulator
///                         can be skipped entirely.
/// \param DTU              dominator-tree updater for the loop-based path.
/// \param FixedVScale      if vscale is a compile-time constant, its value;
///                         selects the straight-line (loop-free) expansion.
Value *expandScalableTreeReduction(
    IRBuilderBase &Builder, IntrinsicInst *II, std::optional<Value *> Acc,
    Value *Vec, Instruction::BinaryOps BinOp,
    function_ref<bool(Constant *)> IsNeutralElement, DomTreeUpdater &DTU,
    std::optional<unsigned> FixedVScale) {
  ScalableVectorType *VecTy = cast<ScalableVectorType>(Vec->getType());
  // Double-width type used as the operand of vector_deinterleave2 below.
  ScalableVectorType *VecTyX2 = ScalableVectorType::get(
      VecTy->getScalarType(), VecTy->getMinNumElements() * 2);

  // If the VScale is fixed, do not generate a loop, and instead do
  // something similar to llvm::getShuffleReduction(). That function
  // cannot be used directly because it uses shuffle masks, which
  // are not available for scalable vectors (even if vscale is fixed).
  // The approach is effectively the same.
  if (FixedVScale.has_value()) {
    unsigned VF = VecTy->getMinNumElements() * FixedVScale.value();
    assert(isPowerOf2_64(VF));
    // Each step splits the vector into its even and odd lanes and combines
    // them, halving the number of live lanes; log2(VF) steps in total.
    for (unsigned I = VF; I != 1; I >>= 1) {
      Value *Extended = Builder.CreateInsertVector(
          VecTyX2, PoisonValue::get(VecTyX2), Vec, Builder.getInt64(0));
      Value *Pair = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2,
                                            {VecTyX2}, {Extended});

      Value *Vec1 = Builder.CreateExtractValue(Pair, {0});
      Value *Vec2 = Builder.CreateExtractValue(Pair, {1});
      Vec = Builder.CreateBinOp(BinOp, Vec1, Vec2, "rdx");
    }
    // The result has accumulated in lane 0.
    Value *FinalVal = Builder.CreateExtractElement(Vec, uint64_t(0));
    // Fold in the accumulator unless it is provably the neutral element.
    if (Acc)
      if (auto *C = dyn_cast<Constant>(*Acc); !C || !IsNeutralElement(C))
        FinalVal = Builder.CreateBinOp(BinOp, *Acc, FinalVal, "rdx.final");
    return FinalVal;
  }

  // Split the original BB in two and create a new BB between them,
  // which will be a loop.
  BasicBlock *BeforeBB = II->getParent();
  BasicBlock *AfterBB = SplitBlock(BeforeBB, II, &DTU);
  BasicBlock *LoopBB = BasicBlock::Create(Builder.getContext(), "rdx.loop",
                                          BeforeBB->getParent(), AfterBB);
  BeforeBB->getTerminator()->setSuccessor(0, LoopBB);

  // This tree reduction only needs to do log2(N) iterations.
  // Note: Calculating log2(N) using count-trailing-zeros (cttz) only works
  // if `vscale` (and therefore the runtime vector length) is a power of two.
  Builder.SetInsertPoint(BeforeBB->getTerminator());
  Value *NumElts =
      Builder.CreateVScale(Builder.getInt64(VecTy->getMinNumElements()));
  Value *NumIters = Builder.CreateIntrinsic(NumElts->getType(), Intrinsic::cttz,
                                            {NumElts, Builder.getTrue()});

  // Create two PHIs, one for the IV and one for the reduction.
  Builder.SetInsertPoint(LoopBB);
  PHINode *IV = Builder.CreatePHI(Builder.getInt64Ty(), 2, "iter");
  IV->addIncoming(Builder.getInt64(0), BeforeBB);
  PHINode *VecPhi = Builder.CreatePHI(VecTy, 2, "rdx.phi");
  VecPhi->addIncoming(Vec, BeforeBB);

  Value *IVInc =
      Builder.CreateAdd(IV, Builder.getInt64(1), "iter.next", true, true);
  IV->addIncoming(IVInc, LoopBB);

  // The deinterleave intrinsic takes a vector of, for example, type
  // <vscale x 8 x float> and produces a pair of vectors with half the size,
  // so 2 x <vscale x 4 x float>. An insert vector operation is used to
  // create a double-sized vector where the upper half is poison, because
  // we never care about that upper half anyways!
  Value *Extended = Builder.CreateInsertVector(
      VecTyX2, PoisonValue::get(VecTyX2), VecPhi, Builder.getInt64(0));
  Value *Pair = Builder.CreateIntrinsic(Intrinsic::vector_deinterleave2,
                                        {VecTyX2}, {Extended});
  Value *Vec1 = Builder.CreateExtractValue(Pair, {0});
  Value *Vec2 = Builder.CreateExtractValue(Pair, {1});
  Value *Rdx = Builder.CreateBinOp(BinOp, Vec1, Vec2, "rdx");
  VecPhi->addIncoming(Rdx, LoopBB);

  // Reduction-loop exit condition:
  Value *Done =
      Builder.CreateCmp(CmpInst::ICMP_EQ, IVInc, NumIters, "exitcond");
  Builder.CreateCondBr(Done, AfterBB, LoopBB);
  // The scalar result lives after the loop; extract lane 0 there.
  Builder.SetInsertPoint(AfterBB, AfterBB->getFirstInsertionPt());
  Value *FinalVal = Builder.CreateExtractElement(Rdx, uint64_t(0));

  // If the Acc value is not the neutral element of the reduction operation,
  // then we need to do the binop one last time with the end result of the
  // tree reduction.
  if (Acc)
    if (auto *C = dyn_cast<Constant>(*Acc); !C || !IsNeutralElement(C))
      FinalVal = Builder.CreateBinOp(BinOp, *Acc, FinalVal, "rdx.final");

  DTU.applyUpdates({{DominatorTree::Insert, BeforeBB, LoopBB},
                    {DominatorTree::Insert, LoopBB, AfterBB},
                    {DominatorTree::Delete, BeforeBB, AfterBB}});

  return FinalVal;
}

std::pair<bool, bool> expandReductions(Function &F,
const TargetTransformInfo *TTI,
DomTreeUpdater &DTU) {
bool Changed = false, CFGChanged = false;
SmallVector<IntrinsicInst *, 4> Worklist;
for (auto &I : instructions(F)) {
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
Expand All @@ -54,6 +216,12 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
}
}

const auto &Attrs = F.getAttributes().getFnAttrs();
unsigned MinVScale = Attrs.getVScaleRangeMin();
std::optional<unsigned> FixedVScale = Attrs.getVScaleRangeMax();
if (FixedVScale != MinVScale)
FixedVScale = std::nullopt;

for (auto *II : Worklist) {
FastMathFlags FMF =
isa<FPMathOperator>(II) ? II->getFastMathFlags() : FastMathFlags{};
Expand All @@ -74,7 +242,34 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
// and it can't be handled by generating a shuffle sequence.
Value *Acc = II->getArgOperand(0);
Value *Vec = II->getArgOperand(1);
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
auto RdxOpcode =
Instruction::BinaryOps(getArithmeticReductionInstruction(ID));

bool ScalableTy = Vec->getType()->isScalableTy();
if (ScalableTy && (!FixedVScale || FMF.allowReassoc())) {
CFGChanged |= !FixedVScale;
assert(TTI->isVScaleKnownToBeAPowerOfTwo() &&
"Scalable tree reduction unimplemented for targets with a "
"VScale not known to be a power of 2.");
if (FMF.allowReassoc())
Rdx = expandScalableTreeReduction(
Builder, II, Acc, Vec, RdxOpcode,
[&](Constant *C) {
switch (ID) {
case Intrinsic::vector_reduce_fadd:
return C->isZeroValue();
case Intrinsic::vector_reduce_fmul:
return C->isOneValue();
default:
llvm_unreachable("Binop not handled");
}
},
DTU, FixedVScale);
else
Rdx = expandScalableReduction(Builder, II, Acc, Vec, RdxOpcode, DTU);
break;
}

if (!FMF.allowReassoc())
Rdx = getOrderedReduction(Builder, Acc, Vec, RdxOpcode, RK);
else {
Expand Down Expand Up @@ -125,10 +320,22 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
case Intrinsic::vector_reduce_umax:
case Intrinsic::vector_reduce_umin: {
Value *Vec = II->getArgOperand(0);
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
if (Vec->getType()->isScalableTy()) {
CFGChanged |= !FixedVScale;
assert(TTI->isVScaleKnownToBeAPowerOfTwo() &&
"Scalable tree reduction unimplemented for targets with a "
"VScale not known to be a power of 2.");
Rdx = expandScalableTreeReduction(
Builder, II, std::nullopt, Vec, Instruction::BinaryOps(RdxOpcode),
[](Constant *C) -> bool { llvm_unreachable("No accumulator!"); },
DTU, FixedVScale);
break;
}

if (!isPowerOf2_32(
cast<FixedVectorType>(Vec->getType())->getNumElements()))
continue;
unsigned RdxOpcode = getArithmeticReductionInstruction(ID);
Rdx = getShuffleReduction(Builder, Vec, RdxOpcode, RS, RK);
break;
}
Expand All @@ -150,7 +357,12 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
II->eraseFromParent();
Changed = true;
}
return Changed;

if (DTU.hasDomTree() && DTU.hasPendingUpdates()) {
DTU.flush();
assert(DTU.getDomTree().verify(DominatorTree::VerificationLevel::Fast));
}
return {CFGChanged, Changed};
}

class ExpandReductions : public FunctionPass {
Expand All @@ -161,13 +373,17 @@ class ExpandReductions : public FunctionPass {
}

bool runOnFunction(Function &F) override {
const auto *TTI =&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return expandReductions(F, TTI);
const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *DTA = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DomTreeUpdater DTU(DTA ? &DTA->getDomTree() : nullptr,
DomTreeUpdater::UpdateStrategy::Lazy);
return expandReductions(F, TTI, DTU).second;
}

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.setPreservesCFG();
AU.addUsedIfAvailable<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
}
};
}
Expand All @@ -186,9 +402,15 @@ FunctionPass *llvm::createExpandReductionsPass() {
/// New-PM entry point: expand reduction intrinsics the target asked to
/// have expanded, keeping a cached dominator tree (if any) up to date.
PreservedAnalyses ExpandReductionsPass::run(Function &F,
                                            FunctionAnalysisManager &AM) {
  const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  // Only update the dominator tree if it has already been computed;
  // do not force its construction just for this pass.
  auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  auto [CFGChanged, Changed] = expandReductions(F, &TTI, DTU);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  if (!CFGChanged)
    PA.preserveSet<CFGAnalyses>();
  else
    // The CFG changed, but the DomTreeUpdater kept the tree valid.
    PA.preserve<DominatorTreeAnalysis>();
  return PA;
}
11 changes: 10 additions & 1 deletion llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,16 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader);

bool shouldExpandReduction(const IntrinsicInst *II) const { return false; }
bool shouldExpandReduction(const IntrinsicInst *II) const {
switch (II->getIntrinsicID()) {
case Intrinsic::vector_reduce_mul:
return II->getOperand(0)->getType()->isScalableTy();
case Intrinsic::vector_reduce_fmul:
return II->getOperand(1)->getType()->isScalableTy();
default:
return false;
}
}

unsigned getGISelRematGlobalCost() const {
return 2;
Expand Down
15 changes: 14 additions & 1 deletion llvm/lib/Transforms/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1114,10 +1114,23 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
return Select;
}

/// Return the compile-time-known number of elements of vector type \p Ty.
/// Fixed-width vectors report their element count directly. For scalable
/// vectors, the vscale_range attribute on \p F must pin vscale to a single
/// value, which scales the type's minimum element count.
static unsigned getFixedVF(Function *F, Type *Ty) {
  if (auto *FVTy = dyn_cast<FixedVectorType>(Ty))
    return FVTy->getNumElements();

  // Scalable case: vscale must be a known constant (min == max).
  const AttributeSet FnAttrs = F->getAttributes().getFnAttrs();
  const unsigned MinVScale = FnAttrs.getVScaleRangeMin();
  assert(FnAttrs.getVScaleRangeMax() == MinVScale &&
         "Expected a compile-time known VScale");

  return cast<ScalableVectorType>(Ty)->getMinNumElements() * MinVScale;
}

// Helper to generate an ordered reduction.
Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
unsigned Op, RecurKind RdxKind) {
unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
unsigned VF =
getFixedVF(Builder.GetInsertBlock()->getParent(), Src->getType());

// Extract and apply reduction ops in ascending order:
// e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[VF-1]
Expand Down
Loading
Loading