Skip to content

Commit 4e2c6bd

Browse files
committed
[AArch64][CostModel] Alter sdiv/srem cost where the divisor is constant
This patch draws its inspiration from the udiv/urem patch #122236. For sdiv, the typical sequence of instructions as per the type and divisor property is as follows: Scalar power-of-2: cmp + csel + asr Neon power-of-2: usra + sshr Scalar non-power-2: smulh/smull + asr/lsr + add/sub + asr + add Vector non-power-2: a) <2 x i64>: 2 * (smulh + asr + add). This yields a scalarized form. b) <4 x i32>: smull2 + smull + uzp2 + add + sshr + usra SVE versions should have more or less the same cost because sometimes they yield native sdiv instructions, which should have less cost or the same sequence of Neon instructions. For srem, the typical sequence of instructions as per the type and divisor property is as follows: Scalar version: <set of sdiv instructions> + msub Vector version: <set of sdiv instructions> + 2-msub/1-mls
1 parent 226a9d7 commit 4e2c6bd

File tree

9 files changed

+783
-587
lines changed

9 files changed

+783
-587
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 46 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3491,23 +3491,53 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
34913491
default:
34923492
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
34933493
Op2Info);
3494+
case ISD::SREM:
34943495
case ISD::SDIV:
3495-
if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3496-
// On AArch64, scalar signed division by constants power-of-two are
3497-
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
3498-
// The OperandValue properties many not be same as that of previous
3499-
// operation; conservatively assume OP_None.
3500-
InstructionCost Cost = getArithmeticInstrCost(
3501-
Instruction::Add, Ty, CostKind,
3502-
Op1Info.getNoProps(), Op2Info.getNoProps());
3503-
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3504-
Op1Info.getNoProps(), Op2Info.getNoProps());
3505-
Cost += getArithmeticInstrCost(
3506-
Instruction::Select, Ty, CostKind,
3507-
Op1Info.getNoProps(), Op2Info.getNoProps());
3508-
Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3509-
Op1Info.getNoProps(), Op2Info.getNoProps());
3510-
return Cost;
3496+
/*
3497+
For sdiv, typical sequence of instructions as per the type and divisor
3498+
property is as follows:
3499+
Scalar power-of-2: cmp + csel + asr
3500+
Vector power-of-2: usra + sshr
3501+
3502+
Scalar non-power-2: smulh/smull + asr/lsr + add/sub + asr + add
3503+
Vector non-power-2:
3504+
a) <2 x i64>: 2 * (smulh + asr + add) --> This yields scalarized form.
3505+
b) <4 x i32>: smull2 + smull + uzp2 + add + sshr + usra
3506+
3507+
SVE versions should have more or less the same cost because sometimes they
3508+
yield native sdiv instructions, which should have less cost or the same
3509+
sequence of neon instructions.
3510+
3511+
For srem, typical sequence of instructions as per the type and divisor
3512+
property is as follows:
3513+
Scalar version: <set of sdiv instructions> + msub
3514+
Vector version: <set of sdiv instructions> + 2-msub/mls
3515+
*/
3516+
if (Op2Info.isConstant()) {
3517+
InstructionCost AsrCost =
3518+
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3519+
Op1Info.getNoProps(), Op2Info.getNoProps());
3520+
InstructionCost AddCost =
3521+
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3522+
Op1Info.getNoProps(), Op2Info.getNoProps());
3523+
InstructionCost MulCost =
3524+
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3525+
Op1Info.getNoProps(), Op2Info.getNoProps());
3526+
3527+
bool HasSMUL = !Op2Info.isPowerOf2();
3528+
unsigned NumOfSMUL = HasSMUL ? (LT.second.isVector() ? 2 : 1) : 0;
3529+
bool HasExtraAsr =
3530+
(LT.second.isVector() || LT.second == MVT::i32) && HasSMUL;
3531+
3532+
InstructionCost CommonCost = AsrCost + AddCost;
3533+
// We typically get 1 msub for scalar and 2-msub/1-mls for the vector form.
3534+
// Typically, the cost of msub is same and mls is twice as costly as
3535+
// add/sub/mul.
3536+
InstructionCost MlsOrMSubCost = (LT.second.isVector() ? 2 : 1) * MulCost;
3537+
InstructionCost DivCost =
3538+
CommonCost + (MulCost * NumOfSMUL) /* SMULH/SMULH */ +
3539+
(AsrCost * HasExtraAsr); // Coming with second SMULH
3540+
return DivCost + (ISD == ISD::SREM ? MlsOrMSubCost : 0);
35113541
}
35123542
[[fallthrough]];
35133543
case ISD::UDIV: {

llvm/test/Analysis/CostModel/AArch64/div.ll

Lines changed: 138 additions & 138 deletions
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/AArch64/div_cte.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
77

88
define <16 x i8> @sdiv8xi16(<16 x i8> %x) {
99
; CHECK-LABEL: 'sdiv8xi16'
10-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = sdiv <16 x i8> %x, splat (i8 9)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = sdiv <16 x i8> %x, splat (i8 9)
1111
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %div
1212
;
1313
%div = sdiv <16 x i8> %x, <i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9>
@@ -16,7 +16,7 @@ define <16 x i8> @sdiv8xi16(<16 x i8> %x) {
1616

1717
define <8 x i16> @sdiv16xi8(<8 x i16> %x) {
1818
; CHECK-LABEL: 'sdiv16xi8'
19-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = sdiv <8 x i16> %x, splat (i16 9)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = sdiv <8 x i16> %x, splat (i16 9)
2020
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %div
2121
;
2222
%div = sdiv <8 x i16> %x, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
@@ -25,7 +25,7 @@ define <8 x i16> @sdiv16xi8(<8 x i16> %x) {
2525

2626
define <4 x i32> @sdiv32xi4(<4 x i32> %x) {
2727
; CHECK-LABEL: 'sdiv32xi4'
28-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = sdiv <4 x i32> %x, splat (i32 9)
28+
; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = sdiv <4 x i32> %x, splat (i32 9)
2929
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div
3030
;
3131
%div = sdiv <4 x i32> %x, <i32 9, i32 9, i32 9, i32 9>

0 commit comments

Comments
 (0)