Skip to content

Commit 3be10c0

Browse files
committed
Rebase and improve the previous patch
1 parent 4d2e78d commit 3be10c0

File tree

8 files changed

+643
-610
lines changed

8 files changed

+643
-610
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 87 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -3529,50 +3529,100 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
35293529
case ISD::SREM:
35303530
case ISD::SDIV:
35313531
/*
3532-
For sdiv, typical sequence of instructions as per the type and divisor
3533-
property is as follows:
3534-
Scalar power-of-2: cmp + csel + asr
3535-
Vector power-of-2: usra + sshr
3536-
3537-
Scalar non-power-2: smulh/smull + asr/lsr + add/sub + asr + add
3538-
Vector non-power-2:
3539-
a) <2 x i64>: 2 * (smulh + asr + add) --> This yeilds scalarized form.
3540-
b) <4 x i32>: smull2 + smull + uzp2 + add + sshr + usra
3541-
3542-
SVE versions should have more or less the same cost because sometimes they
3543-
yeild native sdiv instructions, which should have less cost or the same
3544-
sequence of neon instructions.
3545-
3546-
For srem, typical sequence of instructions as per the type and divisor
3547-
property is as follows:
3548-
Scalar version: <set of sdiv instructions> + msub
3549-
Vector version: <set of sdiv instructions> + 2-msub/mls
3532+
Notes for sdiv/srem specific costs:
3533+
1. This only considers the cases where the divisor is constant, uniform and
3534+
(pow-of-2/non-pow-of-2). Other cases are not important since they either
3535+
result in some form of (ldr + adrp), corresponding to constant vectors, or
3536+
scalarization of the division operation.
3537+
2. Constant divisors, either negative in whole or partially, don't result in
3538+
significantly different codegen as compared to positive constant divisors.
3539+
So, we don't consider negative divisors separately.
3540+
3. If the codegen is significantly different with SVE, it has been indicated
3541+
using comments at appropriate places.
3542+
3543+
sdiv specific cases:
3544+
-----------------------------------------------------------------------
3545+
codegen | pow-of-2 | Type
3546+
-----------------------------------------------------------------------
3547+
add + cmp + csel + asr | Y | i64
3548+
add + cmp + csel + asr | Y | i32
3549+
-----------------------------------------------------------------------
3550+
3551+
srem specific cases:
3552+
-----------------------------------------------------------------------
3553+
codegen | pow-of-2 | Type
3554+
-----------------------------------------------------------------------
3555+
negs + and + and + csneg | Y | i64
3556+
negs + and + and + csneg | Y | i32
3557+
-----------------------------------------------------------------------
3558+
3559+
other sdiv/srem cases:
3560+
-------------------------------------------------------------------------
3561+
common codegen | + srem | + sdiv | pow-of-2 | Type
3562+
-------------------------------------------------------------------------
3563+
smulh + asr + add + add | - | - | N | i64
3564+
smull + lsr + add + add | - | - | N | i32
3565+
usra | and + sub | sshr | Y | <2 x i64>
3566+
2 * (scalar code) | - | - | N | <2 x i64>
3567+
usra | bic + sub | sshr + neg | Y | <4 x i32>
3568+
smull2 + smull + uzp2 | mls | - | N | <4 x i32>
3569+
+ sshr + usra | | | |
3570+
-------------------------------------------------------------------------
35503571
*/
3551-
if (Op2Info.isConstant()) {
3552-
InstructionCost AsrCost =
3553-
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3554-
Op1Info.getNoProps(), Op2Info.getNoProps());
3572+
if (Op2Info.isConstant() && Op2Info.isUniform()) {
35553573
InstructionCost AddCost =
35563574
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
35573575
Op1Info.getNoProps(), Op2Info.getNoProps());
3576+
InstructionCost AsrCost =
3577+
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3578+
Op1Info.getNoProps(), Op2Info.getNoProps());
35583579
InstructionCost MulCost =
35593580
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
35603581
Op1Info.getNoProps(), Op2Info.getNoProps());
3561-
3562-
bool HasSMUL = !Op2Info.isPowerOf2();
3563-
unsigned NumOfSMUL = HasSMUL ? (LT.second.isVector() ? 2 : 1) : 0;
3564-
bool HasExtraAsr =
3565-
(LT.second.isVector() || LT.second == MVT::i32) && HasSMUL;
3566-
3567-
InstructionCost CommonCost = AsrCost + AddCost;
3568-
// We typicall get 1 msub for scalar and 2-msub/1-mls for the vector form.
3569-
// Typically, the cost of msub is same and mls is twice as costly as
3570-
// add/sub/mul.
3571-
InstructionCost MlsOrMSubCost = (LT.second.isVector() ? 2 : 1) * MulCost;
3572-
InstructionCost DivCost =
3573-
CommonCost + (MulCost * NumOfSMUL) /* SMULH/SMULH */ +
3574-
(AsrCost * HasExtraAsr); // Coming with second SMULH
3575-
return DivCost + (ISD == ISD::SREM ? MlsOrMSubCost : 0);
3582+
// add/cmp/csel/csneg should have similar cost while asr/negs/and should
3583+
// have similar cost.
3584+
if (LT.second.isScalarInteger()) {
3585+
if (Op2Info.isPowerOf2()) {
3586+
return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
3587+
: (3 * AsrCost + AddCost);
3588+
} else {
3589+
return MulCost + AsrCost + 2 * AddCost;
3590+
}
3591+
} else {
3592+
InstructionCost UsraCost = 2 * AsrCost;
3593+
if (Op2Info.isPowerOf2()) {
3594+
// Division with scalable types corresponds to native 'asrd'
3595+
// instruction when SVE is available.
3596+
// e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
3597+
if (Ty->isScalableTy() && ST->hasSVE())
3598+
return 2 * AsrCost;
3599+
return UsraCost +
3600+
(ISD == ISD::SDIV
3601+
? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) *
3602+
AsrCost
3603+
: 2 * AddCost);
3604+
} else if (LT.second.is128BitVector() &&
3605+
LT.second.getScalarType() == MVT::i64) {
3606+
auto VT = TLI->getValueType(DL, Ty);
3607+
return VT.getVectorNumElements() *
3608+
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
3609+
Op1Info.getNoProps(),
3610+
Op2Info.getNoProps());
3611+
} else {
3612+
// When SVE is available, we get:
3613+
// smulh + lsr + add/sub + asr + add/sub.
3614+
if (Ty->isScalableTy() && ST->hasSVE())
3615+
return 2 * MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
3616+
return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
3617+
}
3618+
}
3619+
}
3620+
if (Op2Info.isConstant() && !Op2Info.isUniform() &&
3621+
LT.second.isFixedLengthVector()) {
3622+
auto VT = TLI->getValueType(DL, Ty);
3623+
return VT.getVectorNumElements() *
3624+
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
3625+
Op1Info.getNoProps(), Op2Info.getNoProps());
35763626
}
35773627
[[fallthrough]];
35783628
case ISD::UDIV:
@@ -3612,23 +3662,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
36123662
AddCost * 2 + ShrCost;
36133663
return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
36143664
}
3615-
3616-
// TODO: Fix SDIV and SREM costs, similar to the above.
3617-
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT) &&
3618-
Op2Info.isUniform() && !VT.isScalableVector()) {
3619-
// Vector signed division by constant are expanded to the
3620-
// sequence MULHS + ADD/SUB + SRA + SRL + ADD.
3621-
InstructionCost MulCost =
3622-
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3623-
Op1Info.getNoProps(), Op2Info.getNoProps());
3624-
InstructionCost AddCost =
3625-
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3626-
Op1Info.getNoProps(), Op2Info.getNoProps());
3627-
InstructionCost ShrCost =
3628-
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3629-
Op1Info.getNoProps(), Op2Info.getNoProps());
3630-
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3631-
}
36323665
}
36333666

36343667
// div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are

0 commit comments

Comments
 (0)