Skip to content

Commit 302beb5

Browse files
committed
[LV] Skip select cost for invariant divisors in legacy cost model.
For UDiv/SDiv with loop-invariant divisors, the selects created to guard the divisor will be hoisted out of the vector loop. Don't compute their cost for each iteration, to match the more accurate VPlan-based cost modeling. Fixes llvm#159402. (cherry picked from commit addfdb5)
1 parent d0f7329 commit 302beb5

File tree

2 files changed

+163
-8
lines changed

2 files changed

+163
-8
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3050,16 +3050,18 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
30503050
// likely.
30513051
ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
30523052
}
3053-
InstructionCost SafeDivisorCost = 0;
30543053

3054+
InstructionCost SafeDivisorCost = 0;
30553055
auto *VecTy = toVectorTy(I->getType(), VF);
3056-
3057-
// The cost of the select guard to ensure all lanes are well defined
3058-
// after we speculate above any internal control flow.
3059-
SafeDivisorCost +=
3060-
TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3061-
toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3062-
CmpInst::BAD_ICMP_PREDICATE, CostKind);
3056+
auto *DivisorI = dyn_cast<Instruction>(I->getOperand(1));
3057+
if (DivisorI && !Legal->isInvariant(DivisorI)) {
3058+
// The cost of the select guard to ensure all lanes are well defined
3059+
// after we speculate above any internal control flow.
3060+
SafeDivisorCost +=
3061+
TTI.getCmpSelInstrCost(Instruction::Select, VecTy,
3062+
toVectorTy(Type::getInt1Ty(I->getContext()), VF),
3063+
CmpInst::BAD_ICMP_PREDICATE, CostKind);
3064+
}
30633065

30643066
SmallVector<const Value *, 4> Operands(I->operand_values());
30653067
SafeDivisorCost += TTI.getArithmeticInstrCost(

llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,3 +1019,156 @@ latch:
10191019
for.end:
10201020
ret void
10211021
}
1022+
1023+
; Test for https://github.com/llvm/llvm-project/issues/159402. For invariant divisors,
1024+
; selects can be introduced outside the vector loop and their cost should not be
1025+
; considered for each loop iteration.
1026+
define i32 @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c) {
1027+
; CHECK-LABEL: @udiv_sdiv_with_invariant_divisors(
1028+
; CHECK-NEXT: entry:
1029+
; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
1030+
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 1
1031+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 12, [[TMP1]]
1032+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1033+
; CHECK: vector.ph:
1034+
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
1035+
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i32 [[TMP2]], 2
1036+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 12, [[TMP3]]
1037+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 12, [[N_MOD_VF]]
1038+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[Y:%.*]], i64 0
1039+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
1040+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[X:%.*]], i64 0
1041+
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
1042+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i1> poison, i1 [[C:%.*]], i64 0
1043+
; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 2 x i1> [[BROADCAST_SPLATINSERT3]], <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
1044+
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i32 [[N_VEC]] to i16
1045+
; CHECK-NEXT: [[TMP4:%.*]] = add i16 -12, [[DOTCAST]]
1046+
; CHECK-NEXT: [[DOTCAST5:%.*]] = trunc i32 [[N_VEC]] to i8
1047+
; CHECK-NEXT: [[TMP5:%.*]] = add i8 -12, [[DOTCAST5]]
1048+
; CHECK-NEXT: [[TMP6:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i8> splat (i8 1), <vscale x 2 x i8> [[BROADCAST_SPLAT2]]
1049+
; CHECK-NEXT: [[TMP7:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i16> splat (i16 1), <vscale x 2 x i16> [[BROADCAST_SPLAT]]
1050+
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i8> @llvm.stepvector.nxv2i8()
1051+
; CHECK-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i8> [[TMP8]], splat (i8 1)
1052+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i8> splat (i8 -12), [[TMP9]]
1053+
; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[TMP3]] to i8
1054+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i8> poison, i8 [[TMP10]], i64 0
1055+
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i8> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
1056+
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
1057+
; CHECK: vector.body:
1058+
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1059+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i8> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1060+
; CHECK-NEXT: [[TMP11:%.*]] = udiv <vscale x 2 x i8> [[VEC_IND]], [[TMP6]]
1061+
; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i8> [[TMP11]] to <vscale x 2 x i16>
1062+
; CHECK-NEXT: [[TMP13:%.*]] = sdiv <vscale x 2 x i16> [[TMP12]], [[TMP7]]
1063+
; CHECK-NEXT: [[TMP14:%.*]] = sext <vscale x 2 x i16> [[TMP13]] to <vscale x 2 x i32>
1064+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[BROADCAST_SPLAT4]], <vscale x 2 x i32> zeroinitializer, <vscale x 2 x i32> [[TMP14]]
1065+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
1066+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i8> [[VEC_IND]], [[BROADCAST_SPLAT7]]
1067+
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
1068+
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
1069+
; CHECK: middle.block:
1070+
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
1071+
; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i32 [[TMP16]], 2
1072+
; CHECK-NEXT: [[TMP18:%.*]] = sub i32 [[TMP17]], 1
1073+
; CHECK-NEXT: [[TMP19:%.*]] = extractelement <vscale x 2 x i32> [[PREDPHI]], i32 [[TMP18]]
1074+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 12, [[N_VEC]]
1075+
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
1076+
; CHECK: scalar.ph:
1077+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ [[TMP4]], [[MIDDLE_BLOCK]] ], [ -12, [[ENTRY:%.*]] ]
1078+
; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[TMP5]], [[MIDDLE_BLOCK]] ], [ -12, [[ENTRY]] ]
1079+
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
1080+
; CHECK: loop.header:
1081+
; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
1082+
; CHECK-NEXT: [[NARROW_IV:%.*]] = phi i8 [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ]
1083+
; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]]
1084+
; CHECK: then:
1085+
; CHECK-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X]]
1086+
; CHECK-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16
1087+
; CHECK-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y]]
1088+
; CHECK-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32
1089+
; CHECK-NEXT: br label [[LOOP_LATCH]]
1090+
; CHECK: loop.latch:
1091+
; CHECK-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ]
1092+
; CHECK-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1
1093+
; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0
1094+
; CHECK-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8
1095+
; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]]
1096+
; CHECK: exit:
1097+
; CHECK-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
1098+
; CHECK-NEXT: ret i32 [[MERGE_LCSSA]]
1099+
;
1100+
; FIXED-LABEL: @udiv_sdiv_with_invariant_divisors(
1101+
; FIXED-NEXT: entry:
1102+
; FIXED-NEXT: br label [[VECTOR_PH:%.*]]
1103+
; FIXED: vector.ph:
1104+
; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[Y:%.*]], i64 0
1105+
; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer
1106+
; FIXED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i64 0
1107+
; FIXED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
1108+
; FIXED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
1109+
; FIXED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
1110+
; FIXED-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i8> splat (i8 1), <4 x i8> [[BROADCAST_SPLAT2]]
1111+
; FIXED-NEXT: [[TMP1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i16> splat (i16 1), <4 x i16> [[BROADCAST_SPLAT]]
1112+
; FIXED-NEXT: br label [[VECTOR_BODY:%.*]]
1113+
; FIXED: vector.body:
1114+
; FIXED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1115+
; FIXED-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 -12, i8 -11, i8 -10, i8 -9>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
1116+
; FIXED-NEXT: [[TMP2:%.*]] = udiv <4 x i8> [[VEC_IND]], [[TMP0]]
1117+
; FIXED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i16>
1118+
; FIXED-NEXT: [[TMP4:%.*]] = sdiv <4 x i16> [[TMP3]], [[TMP1]]
1119+
; FIXED-NEXT: [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
1120+
; FIXED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT4]], <4 x i32> zeroinitializer, <4 x i32> [[TMP5]]
1121+
; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
1122+
; FIXED-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4)
1123+
; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
1124+
; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
1125+
; FIXED: middle.block:
1126+
; FIXED-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3
1127+
; FIXED-NEXT: br label [[EXIT:%.*]]
1128+
; FIXED: scalar.ph:
1129+
; FIXED-NEXT: br label [[LOOP_HEADER:%.*]]
1130+
; FIXED: loop.header:
1131+
; FIXED-NEXT: [[IV:%.*]] = phi i16 [ -12, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
1132+
; FIXED-NEXT: [[NARROW_IV:%.*]] = phi i8 [ -12, [[SCALAR_PH]] ], [ [[IV_NEXT_TRUNC:%.*]], [[LOOP_LATCH]] ]
1133+
; FIXED-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]]
1134+
; FIXED: then:
1135+
; FIXED-NEXT: [[UD:%.*]] = udiv i8 [[NARROW_IV]], [[X]]
1136+
; FIXED-NEXT: [[UD_EXT:%.*]] = zext i8 [[UD]] to i16
1137+
; FIXED-NEXT: [[SD:%.*]] = sdiv i16 [[UD_EXT]], [[Y]]
1138+
; FIXED-NEXT: [[SD_EXT:%.*]] = sext i16 [[SD]] to i32
1139+
; FIXED-NEXT: br label [[LOOP_LATCH]]
1140+
; FIXED: loop.latch:
1141+
; FIXED-NEXT: [[MERGE:%.*]] = phi i32 [ 0, [[LOOP_HEADER]] ], [ [[SD_EXT]], [[THEN]] ]
1142+
; FIXED-NEXT: [[IV_NEXT]] = add nsw i16 [[IV]], 1
1143+
; FIXED-NEXT: [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 0
1144+
; FIXED-NEXT: [[IV_NEXT_TRUNC]] = trunc i16 [[IV_NEXT]] to i8
1145+
; FIXED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP_HEADER]]
1146+
; FIXED: exit:
1147+
; FIXED-NEXT: [[MERGE_LCSSA:%.*]] = phi i32 [ [[MERGE]], [[LOOP_LATCH]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
1148+
; FIXED-NEXT: ret i32 [[MERGE_LCSSA]]
1149+
;
1150+
entry:
1151+
br label %loop.header
1152+
1153+
loop.header:
1154+
%iv = phi i16 [ -12, %entry ], [ %iv.next, %loop.latch ]
1155+
%narrow.iv = phi i8 [ -12, %entry ], [ %iv.next.trunc, %loop.latch ]
1156+
br i1 %c, label %loop.latch, label %then
1157+
1158+
then:
1159+
%ud = udiv i8 %narrow.iv, %x
1160+
%ud.ext = zext i8 %ud to i16
1161+
%sd = sdiv i16 %ud.ext, %y
1162+
%sd.ext = sext i16 %sd to i32
1163+
br label %loop.latch
1164+
1165+
loop.latch:
1166+
%merge = phi i32 [ 0, %loop.header ], [ %sd.ext, %then ]
1167+
%iv.next = add nsw i16 %iv, 1
1168+
%ec = icmp eq i16 %iv.next, 0
1169+
%iv.next.trunc = trunc i16 %iv.next to i8
1170+
br i1 %ec, label %exit, label %loop.header
1171+
1172+
exit:
1173+
ret i32 %merge
1174+
}

0 commit comments

Comments
 (0)