Skip to content

Commit f390cb5

Browse files
authored
Merge pull request #9390 from citymarina/cherry-pick-fadd
[AArch64][CostModel] Reduce the cost of fadd reduction with fast flag
2 parents 19ce576 + 7df73eb commit f390cb5

File tree

3 files changed

+927
-42
lines changed

3 files changed

+927
-42
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3941,6 +3941,26 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
39413941
switch (ISD) {
39423942
default:
39433943
break;
3944+
case ISD::FADD:
3945+
if (Type *EltTy = ValTy->getScalarType();
3946+
// FIXME: For half types without fullfp16 support, this could extend and
3947+
// use a fp32 faddp reduction but current codegen unrolls.
3948+
MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
3949+
(EltTy->isHalfTy() && ST->hasFullFP16()))) {
3950+
const unsigned NElts = MTy.getVectorNumElements();
3951+
if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
3952+
isPowerOf2_32(NElts))
3953+
// A reduction corresponding to a series of fadd instructions is lowered to a
3954+
// series of faddp instructions. faddp has latency/throughput that
3955+
// matches fadd instruction and hence, every faddp instruction can be
3956+
// considered to have a relative cost = 1 with
3957+
// CostKind = TCK_RecipThroughput.
3958+
// An faddp will pairwise add vector elements, so the size of input
3959+
// vector reduces by half every time, requiring
3960+
// #(faddp instructions) = log2_32(NElts).
3961+
return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
3962+
}
3963+
break;
39443964
case ISD::ADD:
39453965
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
39463966
return (LT.first - 1) + Entry->Cost;

0 commit comments

Comments
 (0)