@@ -44,6 +44,8 @@ static cl::opt<bool> DisableLowOverheadLoops(
44
44
" disable-arm-loloops" , cl::Hidden, cl::init(false ),
45
45
cl::desc(" Disable the generation of low-overhead loops" ));
46
46
47
+ extern cl::opt<bool > DisableTailPredication;
48
+
47
49
bool ARMTTIImpl::areInlineCompatible (const Function *Caller,
48
50
const Function *Callee) const {
49
51
const TargetMachine &TM = getTLI ()->getTargetMachine ();
@@ -1000,18 +1002,114 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1000
1002
return true ;
1001
1003
}
1002
1004
1005
+ static bool canTailPredicateInstruction (Instruction &I, int &ICmpCount) {
1006
+ // We don't allow icmp's, and because we only look at single block loops,
1007
+ // we simply count the icmps, i.e. there should only be 1 for the backedge.
1008
+ if (isa<ICmpInst>(&I) && ++ICmpCount > 1 )
1009
+ return false ;
1010
+
1011
+ // We could allow extending/narrowing FP loads/stores, but codegen is
1012
+ // too inefficient so reject this for now.
1013
+ if (isa<FPExtInst>(&I) || isa<FPTruncInst>(&I))
1014
+ return false ;
1015
+
1016
+ // Extends have to be extending-loads
1017
+ if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
1018
+ if (!I.getOperand (0 )->hasOneUse () || !isa<LoadInst>(I.getOperand (0 )))
1019
+ return false ;
1020
+
1021
+ // Truncs have to be narrowing-stores
1022
+ if (isa<TruncInst>(&I) )
1023
+ if (!I.hasOneUse () || !isa<StoreInst>(*I.user_begin ()))
1024
+ return false ;
1025
+
1026
+ return true ;
1027
+ }
1028
+
1029
+ // To set up a tail-predicated loop, we need to know the total number of
1030
+ // elements processed by that loop. Thus, we need to determine the element
1031
+ // size and:
1032
+ // 1) it should be uniform for all operations in the vector loop, so we
1033
+ // e.g. don't want any widening/narrowing operations.
1034
+ // 2) it should be smaller than i64s because we don't have vector operations
1035
+ // that work on i64s.
1036
+ // 3) we don't want elements to be reversed or shuffled, to make sure the
1037
+ // tail-predication masks/predicates the right lanes.
1038
+ //
1039
+ static bool canTailPredicateLoop (Loop *L, LoopInfo *LI, ScalarEvolution &SE,
1040
+ const DataLayout &DL,
1041
+ const LoopAccessInfo *LAI) {
1042
+ PredicatedScalarEvolution PSE = LAI->getPSE ();
1043
+ int ICmpCount = 0 ;
1044
+ int Stride = 0 ;
1045
+
1046
+ LLVM_DEBUG (dbgs () << " tail-predication: checking allowed instructions\n " );
1047
+ SmallVector<Instruction *, 16 > LoadStores;
1048
+ for (BasicBlock *BB : L->blocks ()) {
1049
+ for (Instruction &I : BB->instructionsWithoutDebug ()) {
1050
+ if (isa<PHINode>(&I))
1051
+ continue ;
1052
+ if (!canTailPredicateInstruction (I, ICmpCount)) {
1053
+ LLVM_DEBUG (dbgs () << " Instruction not allowed: " ; I.dump ());
1054
+ return false ;
1055
+ }
1056
+
1057
+ Type *T = I.getType ();
1058
+ if (T->isPointerTy ())
1059
+ T = T->getPointerElementType ();
1060
+
1061
+ if (T->getScalarSizeInBits () > 32 ) {
1062
+ LLVM_DEBUG (dbgs () << " Unsupported Type: " ; T->dump ());
1063
+ return false ;
1064
+ }
1065
+
1066
+ if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
1067
+ Value *Ptr = isa<LoadInst>(I) ? I.getOperand (0 ) : I.getOperand (1 );
1068
+ int64_t NextStride = getPtrStride (PSE, Ptr, L);
1069
+ // TODO: for now only allow consecutive strides of 1. We could support
1070
+ // other strides as long as it is uniform, but let's keep it simple for
1071
+ // now.
1072
+ if (Stride == 0 && NextStride == 1 ) {
1073
+ Stride = NextStride;
1074
+ continue ;
1075
+ }
1076
+ if (Stride != NextStride) {
1077
+ LLVM_DEBUG (dbgs () << " Different strides found, can't "
1078
+ " tail-predicate\n ." );
1079
+ return false ;
1080
+ }
1081
+ }
1082
+ }
1083
+ }
1084
+
1085
+ LLVM_DEBUG (dbgs () << " tail-predication: all instructions allowed!\n " );
1086
+ return true ;
1087
+ }
1088
+
1003
1089
bool ARMTTIImpl::preferPredicateOverEpilogue (Loop *L, LoopInfo *LI,
1004
1090
ScalarEvolution &SE,
1005
1091
AssumptionCache &AC,
1006
1092
TargetLibraryInfo *TLI,
1007
1093
DominatorTree *DT,
1008
1094
const LoopAccessInfo *LAI) {
1095
+ if (DisableTailPredication)
1096
+ return false ;
1097
+
1009
1098
// Creating a predicated vector loop is the first step for generating a
1010
1099
// tail-predicated hardware loop, for which we need the MVE masked
1011
1100
// load/stores instructions:
1012
1101
if (!ST->hasMVEIntegerOps ())
1013
1102
return false ;
1014
1103
1104
+ // For now, restrict this to single block loops.
1105
+ if (L->getNumBlocks () > 1 ) {
1106
+ LLVM_DEBUG (dbgs () << " preferPredicateOverEpilogue: not a single block "
1107
+ " loop.\n " );
1108
+ return false ;
1109
+ }
1110
+
1111
+ assert (L->empty () && " preferPredicateOverEpilogue: inner-loop expected" );
1112
+
1015
1113
HardwareLoopInfo HWLoopInfo (L);
1016
1114
if (!HWLoopInfo.canAnalyze (*LI)) {
1017
1115
LLVM_DEBUG (dbgs () << " preferPredicateOverEpilogue: hardware-loop is not "
@@ -1033,14 +1131,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
1033
1131
return false ;
1034
1132
}
1035
1133
1036
- // TODO: to set up a tail-predicated loop, which works by setting up
1037
- // the total number of elements processed by the loop, we need to
1038
- // determine the element size here, and if it is uniform for all operations
1039
- // in the vector loop. This means we will reject narrowing/widening
1040
- // operations, and don't want to predicate the vector loop, which is
1041
- // the main prep step for tail-predicated loops.
1042
-
1043
- return false ;
1134
+ return canTailPredicateLoop (L, LI, SE, DL, LAI);
1044
1135
}
1045
1136
1046
1137
0 commit comments