138 changes: 105 additions & 33 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
@@ -3531,23 +3532,111 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
default:
return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
Op2Info);
case ISD::SREM:
case ISD::SDIV:
if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
// On AArch64, scalar signed division by constants power-of-two are
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties may not be the same as those of the
// previous operation; conservatively assume OP_None.
InstructionCost Cost = getArithmeticInstrCost(
Instruction::Add, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
Cost += getArithmeticInstrCost(
Instruction::Select, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
return Cost;
/*
Notes for sdiv/srem specific costs:
1. This only considers the cases where the divisor is constant, uniform and
(pow-of-2/non-pow-of-2). Other cases are not important since they either
result in some form of (ldr + adrp), corresponding to constant vectors, or
scalarization of the division operation.
2. Constant divisors, whether wholly or partially negative, don't result in
significantly different codegen compared to positive constant divisors.
So, we don't consider negative divisors separately.
3. If the codegen is significantly different with SVE, it has been indicated
using comments at appropriate places.

sdiv specific cases:
-----------------------------------------------------------------------
codegen | pow-of-2 | Type
-----------------------------------------------------------------------
add + cmp + csel + asr | Y | i64
add + cmp + csel + asr | Y | i32
-----------------------------------------------------------------------

srem specific cases:
-----------------------------------------------------------------------
codegen | pow-of-2 | Type
-----------------------------------------------------------------------
negs + and + and + csneg | Y | i64
negs + and + and + csneg | Y | i32
-----------------------------------------------------------------------

other sdiv/srem cases:
-------------------------------------------------------------------------
common codegen | + srem | + sdiv | pow-of-2 | Type
-------------------------------------------------------------------------
smulh + asr + add + add | - | - | N | i64
smull + lsr + add + add | - | - | N | i32
usra | and + sub | sshr | Y | <2 x i64>
2 * (scalar code) | - | - | N | <2 x i64>
usra | bic + sub | sshr + neg | Y | <4 x i32>
smull2 + smull + uzp2 | mls | - | N | <4 x i32>
+ sshr + usra | | | |
-------------------------------------------------------------------------
*/
if (Op2Info.isConstant() && Op2Info.isUniform()) {
InstructionCost AddCost =
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
InstructionCost AsrCost =
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
InstructionCost MulCost =
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
// add/cmp/csel/csneg are assumed to have similar costs, as are
// asr/negs/and.
auto VT = TLI->getValueType(DL, Ty);
if (LT.second.isScalarInteger() && VT.getSizeInBits() <= 64) {
if (Op2Info.isPowerOf2()) {
return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
: (3 * AsrCost + AddCost);
} else {
return MulCost + AsrCost + 2 * AddCost;
}
} else if (VT.isVector()) {
InstructionCost UsraCost = 2 * AsrCost;
if (Op2Info.isPowerOf2()) {
// Division with scalable types corresponds to native 'asrd'
// instruction when SVE is available.
// e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
if (Ty->isScalableTy() && ST->hasSVE())
return 2 * AsrCost;
Collaborator:
What is the reason to use 2x for asrd?

Contributor Author:
asrd is almost twice as costly as asr (by costly, I mean latency here).

Collaborator:
The default CostKind is TCK_RecipThroughput, not TCK_Latency. (This function currently only handles TCK_RecipThroughput; I was hoping to add at least code size soon, and it would be good to cover the others.) Unless we have a strong reason to discourage SVE generation here (which I don't think we do?), we should favour the throughput costs.

Contributor Author:
I think just having a 1x cost wouldn't be prudent, because we are not only comparing the cost against similar instructions but against other instructions as well, i.e. there is no grouping of instructions where an instruction's cost is compared only against costs from the same group.
But if you think having 1x helps, I will do that.

Collaborator:
The A725 has a throughput of 1 for these, as opposed to 2 for most vector operations, so there is precedent for it.

I'm not sure I understood what you meant, though. What do you mean by the groups?

Contributor Author:
The A725 has a throughput of 1 for these, as opposed to 2 for most vector operations, so there is precedent for it.

I assume fdiv is one of the examples here. That is fine here, but for a CPU like Neoverse V2, where the throughput is >= 1 for most instructions, the reciprocal throughput becomes approximately 1 across the board, and there is no way to differentiate how costly one instruction is relative to another.

Ideally, we would always like to know the number of cycles consumed, which is what we refer to when using tools like llvm-mca; we never go on to calculate reciprocal throughput. Also, in articles like this, the unit of reciprocal throughput is cycles/instruction, which is nothing but latency (under certain conditions, at least).

Having a cost of 1 (with reciprocal throughput as the cost metric) for most instructions is problematic for the same reason, e.g. a load from the constant pool would be costed the same as a normal mul/add.

What do you mean by the groups?

I mean some sort of equivalence groups, e.g. a group of memory ops consisting of load/store, where an instruction in this group is compared only within the group and then assigned a cost relative to the others in it. If there is a comparison between two different groups, the groups can be coalesced to get a revised costing. This is just my thinking, though; there may be flaws in it.

Collaborator:
I'm not sure I am understanding what you are saying. I think this should still be 1 considering the instruction that is produced, but it seems OK either way, as the cost of the SVE instruction will still be lower than the scalar/vector version. Let's go with this for the moment and we can adjust it in the future if we need to.

If you mean that you can't take two disparate recip-throughput costs, add them together, and expect to come up with a sensible "reciprocal throughput", then yes, I agree that doesn't always work very well. It would be better to have a cost model that understood that some throughput costs are separate (loads/stores vs vector ops vs integer ops vs m-ops, etc.) and was able to measure throughput bottlenecks better.
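For reference, the SVE case being discussed can be exercised with LLVM's cost-model printer. A minimal illustrative sketch (the function name and the exact invocation are assumptions, not taken from this PR):

; Print per-instruction costs for an SVE-enabled target, e.g.:
;   opt -passes="print<cost-model>" -disable-output -mtriple=aarch64-linux-gnu -mattr=+sve example.ll
define <vscale x 4 x i32> @sdiv_pow2_splat(<vscale x 4 x i32> %a) {
  %d = sdiv <vscale x 4 x i32> %a, splat (i32 8)
  ret <vscale x 4 x i32> %d
}

With the change above, the sdiv here is costed as 2 * AsrCost on SVE targets (the asrd path) instead of falling through to the generic vector estimate.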
return UsraCost +
(ISD == ISD::SDIV
? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) *
AsrCost
: 2 * AddCost);
} else if (LT.second == MVT::v2i64) {
return VT.getVectorNumElements() *
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
Op1Info.getNoProps(),
Op2Info.getNoProps());
} else {
// When SVE is available, we get:
// smulh + lsr + add/sub + asr + add/sub.
if (Ty->isScalableTy() && ST->hasSVE())
return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
}
}
}
if (Op2Info.isConstant() && !Op2Info.isUniform() &&
LT.second.isFixedLengthVector()) {
// FIXME: When the constant vector is non-uniform, this may result in
// loading the vector from constant pool or in some cases, may also result
// in scalarization. For now, we are approximating this with the
// scalarization cost.
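// For example (hypothetical IR, not from this patch):
//   %1 = sdiv <4 x i32> %a, <i32 3, i32 5, i32 7, i32 9>
// is approximated below as 2 extract-element costs, 1 insert-element cost
// and 4 scalar sdiv costs.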
auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
CostKind, -1, nullptr, nullptr);
auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
CostKind, -1, nullptr, nullptr);
unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
return ExtractCost + InsertCost +
NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
CostKind, Op1Info.getNoProps(),
Op2Info.getNoProps());
}
[[fallthrough]];
case ISD::UDIV:
@@ -3587,23 +3676,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
AddCost * 2 + ShrCost;
return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
}

// TODO: Fix SDIV and SREM costs, similar to the above.
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT) &&
Op2Info.isUniform() && !VT.isScalableVector()) {
// Vector signed division by constant are expanded to the
// sequence MULHS + ADD/SUB + SRA + SRL + ADD.
InstructionCost MulCost =
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
InstructionCost AddCost =
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
InstructionCost ShrCost =
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
Op1Info.getNoProps(), Op2Info.getNoProps());
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
}
}

// div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
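As a rough way to sanity-check the scalar and fixed-width rows tabulated in the new comment block, the cost-model printer can be run over IR such as the following. This is an illustrative sketch, not part of the PR; the function names are made up and no exact cost numbers are asserted.

define i32 @sdiv_pow2_i32(i32 %a) {
  %d = sdiv i32 %a, 8
  ret i32 %d
}

define i32 @srem_pow2_i32(i32 %a) {
  %r = srem i32 %a, 8
  ret i32 %r
}

define <4 x i32> @sdiv_uniform_nonpow2_v4i32(<4 x i32> %a) {
  %d = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %d
}

Running opt -passes="print<cost-model>" -disable-output -mtriple=aarch64-linux-gnu over such a file prints the estimated cost of each instruction, which with this patch should follow the add/asr/mul-based formulas above rather than the previous expansion-based estimates.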