Skip to content

Commit b0c15cd

Browse files
committed
[AArch64] Improve urem by constant costs
A urem by a constant, much like a udiv by a constant, can be expanded into a series of mul/add/shift instructions. The exact sequence of instructions depends on the constants and the types. If the constant is a power of 2 then a shift (for udiv) or an and (for urem) will be used, so the cost will be 1. This canonicalization happens relatively early so this likely has very little effect in practice (it does help the cost of funnel shifts). For a non-power-of-2 constant the code for div will expand to a series of UMULH + Add + Shift + Add, depending on the constant. urem is generally udiv + mul + sub, so involves a few extra instructions. The UMULH is not always available: i32 will use umull+shift, and vector types will use umull+shift or umull+umull2+uzp depending on the vector size. v2i64 will be scalarized because there is no mul available. SVE does have a UMULH instruction. The end result is that the costs should be closer to reality, with scalable types a little lower cost than the fixed-width versions. (In the future we might be able to use umulh for fixed-width when the SVE instruction is available, but for the moment this should favour scalable vectorization a little.) I've tried to make this patch only apply to constant UREM/UDIV instructions. SDIV and SREM are left until a later patch to prevent this becoming too complex. The funnel shift costs are changing because the cost model believes it will need a urem to clamp the shift amount, where the divisor should be a power of 2 for most common types.
1 parent fb512c9 commit b0c15cd

File tree

9 files changed

+567
-529
lines changed

9 files changed

+567
-529
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3519,20 +3519,58 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
35193519
return Cost;
35203520
}
35213521
[[fallthrough]];
3522-
case ISD::UDIV: {
3522+
case ISD::UDIV:
3523+
case ISD::UREM: {
35233524
auto VT = TLI->getValueType(DL, Ty);
3524-
if (Op2Info.isConstant() && Op2Info.isUniform()) {
3525+
if (Op2Info.isConstant()) {
3526+
// If the operand is a power of 2 we can use the shift or and cost.
3527+
if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
3528+
return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
3529+
Op1Info.getNoProps(),
3530+
Op2Info.getNoProps());
3531+
if (ISD == ISD::UREM && Op2Info.isPowerOf2())
3532+
return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
3533+
Op1Info.getNoProps(),
3534+
Op2Info.getNoProps());
3535+
3536+
if (ISD == ISD::UDIV || ISD == ISD::UREM) {
3537+
// Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
3538+
// The MULHU will be expanded to UMULL for the types not listed below,
3539+
// and will become a pair of UMULL+UMULL2 for 128bit vectors.
3540+
bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
3541+
LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
3542+
LT.second == MVT::nxv16i8;
3543+
bool Is128bit = LT.second.is128BitVector();
3544+
3545+
InstructionCost MulCost =
3546+
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3547+
Op1Info.getNoProps(), Op2Info.getNoProps());
3548+
InstructionCost AddCost =
3549+
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3550+
Op1Info.getNoProps(), Op2Info.getNoProps());
3551+
InstructionCost ShrCost =
3552+
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3553+
Op1Info.getNoProps(), Op2Info.getNoProps());
3554+
InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
3555+
(HasMULH ? 0 : ShrCost) + // UMULL shift
3556+
AddCost * 2 + ShrCost;
3557+
return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
3558+
}
3559+
3560+
// TODO: Fix SDIV and SREM costs, similar to the above.
35253561
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT) &&
3526-
!VT.isScalableVector()) {
3562+
Op2Info.isUniform()) {
35273563
// Vector signed division by constant are expanded to the
3528-
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3529-
// to MULHS + SUB + SRL + ADD + SRL.
3530-
InstructionCost MulCost = getArithmeticInstrCost(
3531-
Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3532-
InstructionCost AddCost = getArithmeticInstrCost(
3533-
Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3534-
InstructionCost ShrCost = getArithmeticInstrCost(
3535-
Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3564+
// sequence MULHS + ADD/SUB + SRA + SRL + ADD.
3565+
InstructionCost MulCost =
3566+
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3567+
Op1Info.getNoProps(), Op2Info.getNoProps());
3568+
InstructionCost AddCost =
3569+
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3570+
Op1Info.getNoProps(), Op2Info.getNoProps());
3571+
InstructionCost ShrCost =
3572+
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3573+
Op1Info.getNoProps(), Op2Info.getNoProps());
35363574
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
35373575
}
35383576
}
@@ -3545,7 +3583,7 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
35453583

35463584
InstructionCost Cost = BaseT::getArithmeticInstrCost(
35473585
Opcode, Ty, CostKind, Op1Info, Op2Info);
3548-
if (Ty->isVectorTy()) {
3586+
if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
35493587
if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
35503588
// SDIV/UDIV operations are lowered using SVE, then we can have less
35513589
// costs.

llvm/test/Analysis/CostModel/AArch64/div.ll

Lines changed: 128 additions & 128 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/AArch64/div_cte.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ define <4 x i32> @sdiv32xi4(<4 x i32> %x) {
3434

3535
define <16 x i8> @udiv8xi16(<16 x i8> %x) {
3636
; CHECK-LABEL: 'udiv8xi16'
37-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = udiv <16 x i8> %x, splat (i8 9)
37+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = udiv <16 x i8> %x, splat (i8 9)
3838
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %div
3939
;
4040
%div = udiv <16 x i8> %x, <i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9>
@@ -43,7 +43,7 @@ define <16 x i8> @udiv8xi16(<16 x i8> %x) {
4343

4444
define <8 x i16> @udiv16xi8(<8 x i16> %x) {
4545
; CHECK-LABEL: 'udiv16xi8'
46-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = udiv <8 x i16> %x, splat (i16 9)
46+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = udiv <8 x i16> %x, splat (i16 9)
4747
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %div
4848
;
4949
%div = udiv <8 x i16> %x, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
@@ -52,7 +52,7 @@ define <8 x i16> @udiv16xi8(<8 x i16> %x) {
5252

5353
define <4 x i32> @udiv32xi4(<4 x i32> %x) {
5454
; CHECK-LABEL: 'udiv32xi4'
55-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = udiv <4 x i32> %x, splat (i32 9)
55+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = udiv <4 x i32> %x, splat (i32 9)
5656
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div
5757
;
5858
%div = udiv <4 x i32> %x, <i32 9, i32 9, i32 9, i32 9>

llvm/test/Analysis/CostModel/AArch64/fshl.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ entry:
1515

1616
define i8 @fshl_i8_3rd_arg_var(i8 %a, i8 %b, i8 %c) {
1717
; CHECK-LABEL: 'fshl_i8_3rd_arg_var'
18-
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshl = tail call i8 @llvm.fshl.i8(i8 %a, i8 %b, i8 %c)
1919
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %fshl
2020
;
2121
entry:
@@ -49,7 +49,7 @@ entry:
4949

5050
define i32 @fshl_i32_3rd_arg_var(i32 %a, i32 %b, i32 %c) {
5151
; CHECK-LABEL: 'fshl_i32_3rd_arg_var'
52-
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
52+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshl = tail call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c)
5353
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %fshl
5454
;
5555
entry:
@@ -71,7 +71,7 @@ entry:
7171

7272
define i64 @fshl_i64_3rd_arg_var(i64 %a, i64 %b, i64 %c) {
7373
; CHECK-LABEL: 'fshl_i64_3rd_arg_var'
74-
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fshl = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c)
74+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshl = tail call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %c)
7575
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %fshl
7676
;
7777
entry:
@@ -116,7 +116,7 @@ entry:
116116

117117
define <16 x i8> @fshl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
118118
; CHECK-LABEL: 'fshl_v16i8_3rd_arg_var'
119-
; CHECK-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
119+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
120120
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl
121121
;
122122
entry:
@@ -148,7 +148,7 @@ entry:
148148

149149
define <8 x i16> @fshl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
150150
; CHECK-LABEL: 'fshl_v8i16_3rd_arg_var'
151-
; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
151+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
152152
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl
153153
;
154154
entry:
@@ -180,7 +180,7 @@ entry:
180180

181181
define <4 x i32> @fshl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
182182
; CHECK-LABEL: 'fshl_v4i32_3rd_arg_var'
183-
; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
183+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
184184
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl
185185
;
186186
entry:
@@ -212,7 +212,7 @@ entry:
212212

213213
define <2 x i64> @fshl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
214214
; CHECK-LABEL: 'fshl_v2i64_3rd_arg_var'
215-
; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
215+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
216216
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl
217217
;
218218
entry:
@@ -224,7 +224,7 @@ declare <2 x i64> @llvm.fshl.v4i64(<2 x i64>, <2 x i64>, <2 x i64>)
224224

225225
define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
226226
; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var'
227-
; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
227+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
228228
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshl
229229
;
230230
entry:

llvm/test/Analysis/CostModel/AArch64/fshr.ll

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ entry:
1515

1616
define i8 @fshr_i8_3rd_arg_var(i8 %a, i8 %b, i8 %c) {
1717
; CHECK-LABEL: 'fshr_i8_3rd_arg_var'
18-
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshr = tail call i8 @llvm.fshr.i8(i8 %a, i8 %b, i8 %c)
1919
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %fshr
2020
;
2121
entry:
@@ -49,7 +49,7 @@ entry:
4949

5050
define i32 @fshr_i32_3rd_arg_var(i32 %a, i32 %b, i32 %c) {
5151
; CHECK-LABEL: 'fshr_i32_3rd_arg_var'
52-
; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
52+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshr = tail call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %c)
5353
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %fshr
5454
;
5555
entry:
@@ -71,7 +71,7 @@ entry:
7171

7272
define i64 @fshr_i64_3rd_arg_var(i64 %a, i64 %b, i64 %c) {
7373
; CHECK-LABEL: 'fshr_i64_3rd_arg_var'
74-
; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
74+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshr = tail call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c)
7575
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %fshr
7676
;
7777
entry:
@@ -116,7 +116,7 @@ entry:
116116

117117
define <16 x i8> @fshr_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
118118
; CHECK-LABEL: 'fshr_v16i8_3rd_arg_var'
119-
; CHECK-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
119+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c)
120120
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshr
121121
;
122122
entry:
@@ -148,7 +148,7 @@ entry:
148148

149149
define <8 x i16> @fshr_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) {
150150
; CHECK-LABEL: 'fshr_v8i16_3rd_arg_var'
151-
; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
151+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c)
152152
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshr
153153
;
154154
entry:
@@ -180,7 +180,7 @@ entry:
180180

181181
define <4 x i32> @fshr_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
182182
; CHECK-LABEL: 'fshr_v4i32_3rd_arg_var'
183-
; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
183+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
184184
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshr
185185
;
186186
entry:
@@ -212,7 +212,7 @@ entry:
212212

213213
define <2 x i64> @fshr_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) {
214214
; CHECK-LABEL: 'fshr_v2i64_3rd_arg_var'
215-
; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
215+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c)
216216
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshr
217217
;
218218
entry:
@@ -224,7 +224,7 @@ declare <2 x i64> @llvm.fshr.v4i64(<2 x i64>, <2 x i64>, <2 x i64>)
224224

225225
define <4 x i30> @fshr_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
226226
; CHECK-LABEL: 'fshr_v4i30_3rd_arg_var'
227-
; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
227+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
228228
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshr
229229
;
230230
entry:

0 commit comments

Comments
 (0)