Skip to content

Commit d90804d

Browse files
author
Sjoerd Meijer
committed
[ARM][MVE] canTailPredicateLoop
This implements TTI hook 'preferPredicateOverEpilogue' for MVE. This is a first version and it operates on single block loops only. With this change, the vectoriser will now determine if tail-folding scalar remainder loops is possible/desired, which is the first step to generate MVE tail-predicated vector loops. This is disabled by default for now, i.e., it depends on option -disable-mve-tail-predication, which defaults to true (so tail-predication is off by default). I will follow up on this soon with a patch for the vectoriser to respect loop hint 'vectorize.predicate.enable'. I.e., with this loop hint set to Disabled, we don't want to tail-fold and we shouldn't query this TTI hook, which is done in D70125. Differential Revision: https://reviews.llvm.org/D69845
1 parent a5ce8bd commit d90804d

File tree

3 files changed

+692
-36
lines changed

3 files changed

+692
-36
lines changed

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 99 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ static cl::opt<bool> DisableLowOverheadLoops(
4444
"disable-arm-loloops", cl::Hidden, cl::init(false),
4545
cl::desc("Disable the generation of low-overhead loops"));
4646

47+
extern cl::opt<bool> DisableTailPredication;
48+
4749
bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
4850
const Function *Callee) const {
4951
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -1000,18 +1002,114 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
10001002
return true;
10011003
}
10021004

1005+
// Returns true if instruction I is acceptable inside a loop that we want to
// tail-predicate. ICmpCount is a running tally, shared across calls, of the
// integer compares seen so far; it is incremented here for every icmp.
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
  // Only single block loops are considered, so the one and only icmp we
  // expect is the backedge compare. A second icmp disqualifies the loop.
  if (isa<ICmpInst>(&I)) {
    ++ICmpCount;
    if (ICmpCount > 1)
      return false;
  }

  // Extending/narrowing FP loads/stores could be supported, but the
  // resulting codegen is too inefficient, so they are rejected for now.
  const bool IsFPResize = isa<FPExtInst>(&I) || isa<FPTruncInst>(&I);
  if (IsFPResize)
    return false;

  // A sign/zero extend is only accepted as part of an extending load: its
  // source must be a load that has no other users.
  if (isa<SExtInst>(&I) || isa<ZExtInst>(&I)) {
    auto *Source = I.getOperand(0);
    const bool IsExtendingLoad = Source->hasOneUse() && isa<LoadInst>(Source);
    if (!IsExtendingLoad)
      return false;
  }

  // A truncate is only accepted as part of a narrowing store: its single
  // user must be a store. The user is inspected only after hasOneUse() has
  // confirmed that there is one.
  if (isa<TruncInst>(&I)) {
    const bool IsNarrowingStore =
        I.hasOneUse() && isa<StoreInst>(*I.user_begin());
    if (!IsNarrowingStore)
      return false;
  }

  return true;
}
1028+
1029+
// To set up a tail-predicated loop, we need to know the total number of
1030+
// elements processed by that loop. Thus, we need to determine the element
1031+
// size and:
1032+
// 1) it should be uniform for all operations in the vector loop, so we
1033+
// e.g. don't want any widening/narrowing operations.
1034+
// 2) it should be smaller than i64s because we don't have vector operations
1035+
// that work on i64s.
1036+
// 3) we don't want elements to be reversed or shuffled, to make sure the
1037+
// tail-predication masks/predicates the right lanes.
1038+
//
1039+
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1040+
const DataLayout &DL,
1041+
const LoopAccessInfo *LAI) {
1042+
PredicatedScalarEvolution PSE = LAI->getPSE();
1043+
int ICmpCount = 0;
1044+
int Stride = 0;
1045+
1046+
LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
1047+
SmallVector<Instruction *, 16> LoadStores;
1048+
for (BasicBlock *BB : L->blocks()) {
1049+
for (Instruction &I : BB->instructionsWithoutDebug()) {
1050+
if (isa<PHINode>(&I))
1051+
continue;
1052+
if (!canTailPredicateInstruction(I, ICmpCount)) {
1053+
LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
1054+
return false;
1055+
}
1056+
1057+
Type *T = I.getType();
1058+
if (T->isPointerTy())
1059+
T = T->getPointerElementType();
1060+
1061+
if (T->getScalarSizeInBits() > 32) {
1062+
LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
1063+
return false;
1064+
}
1065+
1066+
if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
1067+
Value *Ptr = isa<LoadInst>(I) ? I.getOperand(0) : I.getOperand(1);
1068+
int64_t NextStride = getPtrStride(PSE, Ptr, L);
1069+
// TODO: for now only allow consecutive strides of 1. We could support
1070+
// other strides as long as it is uniform, but let's keep it simple for
1071+
// now.
1072+
if (Stride == 0 && NextStride == 1) {
1073+
Stride = NextStride;
1074+
continue;
1075+
}
1076+
if (Stride != NextStride) {
1077+
LLVM_DEBUG(dbgs() << "Different strides found, can't "
1078+
"tail-predicate\n.");
1079+
return false;
1080+
}
1081+
}
1082+
}
1083+
}
1084+
1085+
LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
1086+
return true;
1087+
}
1088+
10031089
bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
10041090
ScalarEvolution &SE,
10051091
AssumptionCache &AC,
10061092
TargetLibraryInfo *TLI,
10071093
DominatorTree *DT,
10081094
const LoopAccessInfo *LAI) {
1095+
if (DisableTailPredication)
1096+
return false;
1097+
10091098
// Creating a predicated vector loop is the first step for generating a
10101099
// tail-predicated hardware loop, for which we need the MVE masked
10111100
// load/stores instructions:
10121101
if (!ST->hasMVEIntegerOps())
10131102
return false;
10141103

1104+
// For now, restrict this to single block loops.
1105+
if (L->getNumBlocks() > 1) {
1106+
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: not a single block "
1107+
"loop.\n");
1108+
return false;
1109+
}
1110+
1111+
assert(L->empty() && "preferPredicateOverEpilogue: inner-loop expected");
1112+
10151113
HardwareLoopInfo HWLoopInfo(L);
10161114
if (!HWLoopInfo.canAnalyze(*LI)) {
10171115
LLVM_DEBUG(dbgs() << "preferPredicateOverEpilogue: hardware-loop is not "
@@ -1033,14 +1131,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
10331131
return false;
10341132
}
10351133

1036-
// TODO: to set up a tail-predicated loop, which works by setting up
1037-
// the total number of elements processed by the loop, we need to
1038-
// determine the element size here, and if it is uniform for all operations
1039-
// in the vector loop. This means we will reject narrowing/widening
1040-
// operations, and don't want to predicate the vector loop, which is
1041-
// the main prep step for tail-predicated loops.
1042-
1043-
return false;
1134+
return canTailPredicateLoop(L, LI, SE, DL, LAI);
10441135
}
10451136

10461137

llvm/lib/Target/ARM/MVETailPredication.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ using namespace llvm;
4141
#define DEBUG_TYPE "mve-tail-predication"
4242
#define DESC "Transform predicated vector loops to use MVE tail predication"
4343

44-
static cl::opt<bool>
44+
cl::opt<bool>
4545
DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
4646
cl::init(true),
4747
cl::desc("Disable MVE Tail Predication"));

0 commit comments

Comments
 (0)