Skip to content

Commit 7b7727d

Browse files
[SystemZ] Add realistic cost estimates for vector reduction intrinsics
This adds more realistic cost estimates for these reduction intrinsics: - llvm.vector.reduce.umax - llvm.vector.reduce.umin - llvm.vector.reduce.smax - llvm.vector.reduce.smin - llvm.vector.reduce.fadd - llvm.vector.reduce.fmul - llvm.vector.reduce.fmax - llvm.vector.reduce.fmin - llvm.vector.reduce.fmaximum - llvm.vector.reduce.fminimum - llvm.vector.reduce.mul The pre-existing cost estimates for llvm.vector.reduce.add are moved to `getArithmeticReductionCost` to reduce complexity in `getVectorIntrinsicInstrCost` and enable other passes, like the SLP vectorizer, to benefit from these updated calculations. These are not expected to provide noticeable performance improvements and are rather provided for the sake of completeness and correctness. This also provides and/or updates cost tests for all of these intrinsics.
1 parent 89aaf2c commit 7b7727d

File tree

7 files changed

+1245
-147
lines changed

7 files changed

+1245
-147
lines changed

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp

Lines changed: 75 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@
1616
#include "SystemZTargetTransformInfo.h"
1717
#include "llvm/Analysis/TargetTransformInfo.h"
1818
#include "llvm/CodeGen/BasicTTIImpl.h"
19-
#include "llvm/CodeGen/CostTable.h"
2019
#include "llvm/CodeGen/TargetLowering.h"
2120
#include "llvm/IR/DerivedTypes.h"
2221
#include "llvm/IR/IntrinsicInst.h"
2322
#include "llvm/IR/Intrinsics.h"
2423
#include "llvm/Support/Debug.h"
24+
#include "llvm/Support/InstructionCost.h"
2525
#include "llvm/Support/MathExtras.h"
2626

2727
using namespace llvm;
@@ -1353,30 +1353,86 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost(
13531353
return NumVectorMemOps + NumPermutes;
13541354
}
13551355

1356+
/// Estimate the cost of an unordered integer add reduction.
///
/// File-local helper, hence `static`: it is not declared in any header and
/// must not be exported from this translation unit.
///
/// \param NumVec     Number of vector registers occupied by the input vector
///                   (callers pass the result of getNumVectorRegs).
/// \param ScalarBits Width of a single vector element in bits.
/// \returns (NumVec - 1) for combining the registers, plus 3 for elements
///          narrower than 32 bits or 2 otherwise for the final vector.
static InstructionCost getIntAddReductionCost(unsigned NumVec,
                                              unsigned ScalarBits) {
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For integer adds, VSUM creates shorter reductions on the final vector.
  Cost += (ScalarBits < 32) ? 3 : 2;
  return Cost;
}
1364+
1365+
/// Estimate the cost of an unordered (reassociable) arithmetic reduction that
/// is lowered as a shuffle / arithmetic tree.
///
/// File-local helper, hence `static`: it is not declared in any header and
/// must not be exported from this translation unit.
///
/// \param NumVec     Number of vector registers occupied by the input vector.
/// \param NumElems   Number of elements in the input vector.
/// \param ScalarBits Width of a single vector element in bits. Must be
///                   non-zero, since it is used as a divisor below.
/// \returns (NumVec - 1) for combining the registers, plus two instructions
///          for each of the ceil(log2(elements in the last vector)) layers.
static InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
                                            unsigned ScalarBits) {
  unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
  InstructionCost Cost = 0;
  // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
  Cost += NumVec - 1;
  // For each shuffle / arithmetic layer, we need 2 instructions, and we need
  // log2(Elements in Last Vector) layers.
  Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
  return Cost;
}
1376+
1377+
inline bool customCostReductions(unsigned Opcode) {
1378+
return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1379+
Opcode == Instruction::Add || Opcode == Instruction::Mul;
1380+
}
1381+
1382+
/// Cost of an arithmetic vector reduction (llvm.vector.reduce.{add,mul,fadd,
/// fmul} and friends).
///
/// A target-specific estimate is returned for Add/Mul/FAdd/FMul when the
/// subtarget has vector support, the reduction does not have to be ordered,
/// and an element fits into one vector register. Every other case is
/// delegated to the generic BaseT implementation.
///
/// \param Opcode   Instruction opcode of the reduction operation.
/// \param Ty       Vector type being reduced.
/// \param FMF      Fast-math flags; decide whether the reduction is ordered.
/// \param CostKind Cost kind, forwarded to the generic implementation only.
InstructionCost
SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                           std::optional<FastMathFlags> FMF,
                                           TTI::TargetCostKind CostKind) {
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  // The following is only for subtargets with vector math, non-ordered
  // reductions, and reasonable scalar sizes for int and fp add/mul.
  if (customCostReductions(Opcode) && ST->hasVector() &&
      !TTI::requiresOrderedReduction(FMF) &&
      ScalarBits <= SystemZ::VectorBits) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    // Integer Add is using custom code gen, that needs to be accounted for.
    if (Opcode == Instruction::Add)
      return getIntAddReductionCost(NumVectors, ScalarBits);
    // Use LLVM's checked cast instead of a C-style downcast, so an unexpected
    // non-fixed vector type is diagnosed in assert builds.
    unsigned NumElems = cast<FixedVectorType>(Ty)->getNumElements();
    // The base cost is the same across all other arithmetic instructions
    InstructionCost Cost =
        getFastReductionCost(NumVectors, NumElems, ScalarBits);
    // But we need to account for the final op involving the scalar operand.
    if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
      Cost += 1;
    return Cost;
  }
  // otherwise, fall back to the standard implementation
  return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
}
1408+
1409+
/// Cost of a min/max vector reduction.
///
/// A target-specific estimate is returned on subtargets with vector
/// enhancements 1, provided an element fits into one vector register. Every
/// other case is delegated to the generic BaseT implementation.
///
/// \param IID      Intrinsic ID of the min/max operation (forwarded only).
/// \param Ty       Vector type being reduced.
/// \param FMF      Fast-math flags (forwarded only).
/// \param CostKind Cost kind (forwarded only).
InstructionCost
SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                       FastMathFlags FMF,
                                       TTI::TargetCostKind CostKind) {
  unsigned ScalarBits = Ty->getScalarSizeInBits();
  // Return custom costs only on subtargets with vector enhancements, and only
  // for element sizes that fit into a vector register. For wider elements,
  // VectorBits / ScalarBits would be 0 and the "- 1" below would wrap around
  // in unsigned arithmetic, yielding an absurdly large cost (a zero width
  // would even divide by zero), so those use the generic estimate instead.
  if (ST->hasVectorEnhancements1() && ScalarBits > 0 &&
      ScalarBits <= SystemZ::VectorBits) {
    unsigned NumVectors = getNumVectorRegs(Ty);
    // Use LLVM's checked cast instead of a C-style downcast.
    unsigned NumElems = cast<FixedVectorType>(Ty)->getNumElements();
    InstructionCost Cost = 0;
    // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
    Cost += NumVectors - 1;
    // For the final vector, we need shuffle + min/max operations, and
    // we need #Elements - 1 of them.
    Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
    return Cost;
  }
  // For other targets, fall back to the standard implementation
  return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
}
1429+
13561430
static int
13571431
getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
13581432
const SmallVectorImpl<Type *> &ParamTys) {
13591433
if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
13601434
return getNumVectorRegs(RetTy); // VPERM
13611435

1362-
if (ID == Intrinsic::vector_reduce_add) {
1363-
// Retrieve number and size of elements for the vector op.
1364-
auto *VTy = cast<FixedVectorType>(ParamTys.front());
1365-
unsigned ScalarSize = VTy->getScalarSizeInBits();
1366-
// For scalar sizes >128 bits, we fall back to the generic cost estimate.
1367-
if (ScalarSize > SystemZ::VectorBits)
1368-
return -1;
1369-
// This many vector regs are needed to represent the input elements (V).
1370-
unsigned VectorRegsNeeded = getNumVectorRegs(VTy);
1371-
// This many instructions are needed for the final sum of vector elems (S).
1372-
unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2;
1373-
// We use vector adds to create a sum vector, which takes
1374-
// V/2 + V/4 + ... = V - 1 operations.
1375-
// Then, we need S operations to sum up the elements of that sum vector,
1376-
// for a total of V + S - 1 operations.
1377-
int Cost = VectorRegsNeeded + LastVectorHandling - 1;
1378-
return Cost;
1379-
}
13801436
return -1;
13811437
}
13821438

llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
125125
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
126126
bool UseMaskForCond = false, bool UseMaskForGaps = false);
127127

128+
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
129+
std::optional<FastMathFlags> FMF,
130+
TTI::TargetCostKind CostKind);
131+
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
132+
FastMathFlags FMF,
133+
TTI::TargetCostKind CostKind);
134+
128135
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
129136
TTI::TargetCostKind CostKind);
130137

llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll

Lines changed: 0 additions & 128 deletions
This file was deleted.

0 commit comments

Comments
 (0)