-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AArch64][CostModel] Alter sdiv/srem cost where the divisor is constant #123552
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| #include "llvm/CodeGen/BasicTTIImpl.h" | ||
| #include "llvm/CodeGen/CostTable.h" | ||
| #include "llvm/CodeGen/TargetLowering.h" | ||
| #include "llvm/IR/DerivedTypes.h" | ||
| #include "llvm/IR/IntrinsicInst.h" | ||
| #include "llvm/IR/Intrinsics.h" | ||
| #include "llvm/IR/IntrinsicsAArch64.h" | ||
|
|
@@ -3531,23 +3532,111 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( | |
| default: | ||
| return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, | ||
| Op2Info); | ||
| case ISD::SREM: | ||
| case ISD::SDIV: | ||
| if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) { | ||
| // On AArch64, scalar signed division by constants power-of-two are | ||
| // normally expanded to the sequence ADD + CMP + SELECT + SRA. | ||
| // The OperandValue properties many not be same as that of previous | ||
| // operation; conservatively assume OP_None. | ||
| InstructionCost Cost = getArithmeticInstrCost( | ||
| Instruction::Add, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| Cost += getArithmeticInstrCost( | ||
| Instruction::Select, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| return Cost; | ||
| /* | ||
| Notes for sdiv/srem specific costs: | ||
| 1. This only considers the cases where the divisor is constant, uniform and | ||
| (pow-of-2/non-pow-of-2). Other cases are not important since they either | ||
| result in some form of (ldr + adrp), corresponding to constant vectors, or | ||
| scalarization of the division operation. | ||
| 2. Constant divisors, either negative in whole or partially, don't result in | ||
| significantly different codegen as compared to positive constant divisors. | ||
| So, we don't consider negative divisors separately. | ||
| 3. If the codegen is significantly different with SVE, it has been indicated | ||
| using comments at appropriate places. | ||
|
|
||
| sdiv specific cases: | ||
| ----------------------------------------------------------------------- | ||
| codegen | pow-of-2 | Type | ||
| ----------------------------------------------------------------------- | ||
| add + cmp + csel + asr | Y | i64 | ||
| add + cmp + csel + asr | Y | i32 | ||
| ----------------------------------------------------------------------- | ||
|
|
||
| srem specific cases: | ||
| ----------------------------------------------------------------------- | ||
| codegen | pow-of-2 | Type | ||
| ----------------------------------------------------------------------- | ||
| negs + and + and + csneg | Y | i64 | ||
| negs + and + and + csneg | Y | i32 | ||
| ----------------------------------------------------------------------- | ||
|
|
||
| other sdiv/srem cases: | ||
| ------------------------------------------------------------------------- | ||
| common codegen | + srem | + sdiv | pow-of-2 | Type | ||
| ------------------------------------------------------------------------- | ||
| smulh + asr + add + add | - | - | N | i64 | ||
| smull + lsr + add + add | - | - | N | i32 | ||
| usra | and + sub | sshr | Y | <2 x i64> | ||
| 2 * (scalar code) | - | - | N | <2 x i64> | ||
| usra | bic + sub | sshr + neg | Y | <4 x i32> | ||
| smull2 + smull + uzp2 | mls | - | N | <4 x i32> | ||
| + sshr + usra | | | | | ||
| ------------------------------------------------------------------------- | ||
| */ | ||
| if (Op2Info.isConstant() && Op2Info.isUniform()) { | ||
| InstructionCost AddCost = | ||
| getArithmeticInstrCost(Instruction::Add, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| InstructionCost AsrCost = | ||
| getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| InstructionCost MulCost = | ||
| getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| // add/cmp/csel/csneg should have similar cost while asr/negs/and should | ||
| // have similar cost. | ||
| auto VT = TLI->getValueType(DL, Ty); | ||
| if (LT.second.isScalarInteger() && VT.getSizeInBits() <= 64) { | ||
| if (Op2Info.isPowerOf2()) { | ||
| return ISD == ISD::SDIV ? (3 * AddCost + AsrCost) | ||
| : (3 * AsrCost + AddCost); | ||
| } else { | ||
| return MulCost + AsrCost + 2 * AddCost; | ||
| } | ||
| } else if (VT.isVector()) { | ||
| InstructionCost UsraCost = 2 * AsrCost; | ||
| if (Op2Info.isPowerOf2()) { | ||
| // Division with scalable types corresponds to native 'asrd' | ||
| // instruction when SVE is available. | ||
| // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8) | ||
| if (Ty->isScalableTy() && ST->hasSVE()) | ||
| return 2 * AsrCost; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the reason to use 2x for asrd?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. asrd is almost twice as costly as asr (by costly, I am referring to latency here).
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The default CostKind is TCK_RecipThroughput, not TCK_Latency. (This function currently only handles TCK_RecipThroughput, I was hoping to add at least codesize soon and it would be good to cover others). Unless we have a strong reason to discourage SVE generation here (which I don't think we do?), we should favour the throughput costs.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think just having a 1x cost wouldn't be prudent, because we are not just comparing the cost against similar instructions but against other instructions as well, i.e. there is no grouping of instructions where an instruction's cost is compared only against costs from the same group.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The A725 has a throughput of 1 for these, as opposed to 2 for most vector operations. So there is precedent for it. I'm not sure I understood what you meant though. What do you mean by the groups?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I assume fdiv is one of the examples here. This is OK here. But for a CPU like Neoverse-V2, where the throughput is >= 1 for most of the instructions, recip_tput becomes approximately equal to 1 for all of them. There is no way to differentiate how costly an instruction is with respect to some other instruction. Ideally, we would always like to know the number of cycles consumed, and this is what we refer to when using tools like llvm-mca. We never go on to calculate recip_tput. Also, in articles like this, the unit of recip_tput is cycles/instr, which is nothing but latency (under certain conditions, though). Having cost=1 (with recip_tput as the cost metric) for most of the instructions is problematic, I think, for the same reason, e.g. a load from the constant pool would be costed the same as a normal mul/add etc.
I mean some sort of equivalence groups.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I am understanding what you are saying. I think this should still be 1 considering the instruction that is produced, but it seems OK either way as the cost of the SVE instruction will still be lower than the scalar / vector version. Let's go with this for the moment and we can adjust it in the future if we need to. If you mean that you can't take two disparate recip-throughput costs, add them together and expect to come up with a sensible "reciprocal-throughput", then yes I agree that doesn't always work very well. It would be better to have a cost-model that understood that some throughput costs are separate (loads/stores vs vector ops vs integer ops vs m-ops, etc) and was able to measure throughput bottlenecks better. |
||
| return UsraCost + | ||
| (ISD == ISD::SDIV | ||
| ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * | ||
| AsrCost | ||
| : 2 * AddCost); | ||
| } else if (LT.second == MVT::v2i64) { | ||
| return VT.getVectorNumElements() * | ||
| getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, | ||
| Op1Info.getNoProps(), | ||
| Op2Info.getNoProps()); | ||
davemgreen marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } else { | ||
| // When SVE is available, we get: | ||
| // smulh + lsr + add/sub + asr + add/sub. | ||
| if (Ty->isScalableTy() && ST->hasSVE()) | ||
| return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost; | ||
| return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost; | ||
| } | ||
| } | ||
| } | ||
| if (Op2Info.isConstant() && !Op2Info.isUniform() && | ||
| LT.second.isFixedLengthVector()) { | ||
| // FIXME: When the constant vector is non-uniform, this may result in | ||
| // loading the vector from constant pool or in some cases, may also result | ||
| // in scalarization. For now, we are approximating this with the | ||
| // scalarization cost. | ||
| auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty, | ||
| CostKind, -1, nullptr, nullptr); | ||
| auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty, | ||
| CostKind, -1, nullptr, nullptr); | ||
| unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements(); | ||
| return ExtractCost + InsertCost + | ||
| NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(), | ||
| CostKind, Op1Info.getNoProps(), | ||
| Op2Info.getNoProps()); | ||
| } | ||
| [[fallthrough]]; | ||
| case ISD::UDIV: | ||
|
|
@@ -3587,23 +3676,6 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( | |
| AddCost * 2 + ShrCost; | ||
| return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0); | ||
| } | ||
|
|
||
| // TODO: Fix SDIV and SREM costs, similar to the above. | ||
| if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT) && | ||
| Op2Info.isUniform() && !VT.isScalableVector()) { | ||
| // Vector signed division by constant are expanded to the | ||
| // sequence MULHS + ADD/SUB + SRA + SRL + ADD. | ||
| InstructionCost MulCost = | ||
| getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| InstructionCost AddCost = | ||
| getArithmeticInstrCost(Instruction::Add, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| InstructionCost ShrCost = | ||
| getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, | ||
| Op1Info.getNoProps(), Op2Info.getNoProps()); | ||
| return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1; | ||
| } | ||
| } | ||
|
|
||
| // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.