Skip to content

Commit d19fb9a

Browse files
committed
Enable vectorization of i8 values.
1 parent 661f90a commit d19fb9a

File tree

3 files changed

+962
-3
lines changed

3 files changed

+962
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
// Returns the maximum vectorization factor for elements of ElemWidth bits
// used by an instruction with the given Opcode. In this diff view, lines
// preceded by an "NNN-" marker are removed by the commit and lines preceded
// by an "NNN+" marker are added by it.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
// Loads and stores: as many elements as fit in 32 * 4 = 128 bits.
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
// All other opcodes (post-commit): 8-bit elements -> 4 (not gated on any
// subtarget feature); 16-bit elements -> 2 when the subtarget reports
// has16BitInsts(); 32-bit elements -> 2 when it reports hasPackedFP32Ops();
// otherwise 1.
return ElemWidth == 8 ? 4
348+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
349+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
350+
: 1;
350351
}
351352

352353
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -562,6 +563,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
562563
if (ST->has16BitInsts() && SLT == MVT::i16)
563564
NElts = (NElts + 1) / 2;
564565

566+
// i32
565567
return LT.first * NElts * getFullRateInstrCost();
566568
case ISD::MUL: {
567569
const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
@@ -1423,3 +1425,30 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14231425
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14241426
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14251427
}
1428+
1429+
// Cost of a memory operation, with a reduced cost for loads of fixed-width
// i8 vectors: every group of up to four i8 elements is costed as one unit
// (e.g. <4 x i8> costs 1, <5 x i8> costs 2). Everything else (stores, other
// element types, scalable vectors, non-vector types) is forwarded unchanged to
// the base implementation.
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            Align Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {
  if (Opcode == Instruction::Load) {
    // Match FixedVectorType rather than VectorType: a scalable vector has no
    // compile-time element count, and querying its fixed value asserts.
    if (auto *VecTy = dyn_cast<FixedVectorType>(Src)) {
      if (VecTy->getElementType()->isIntegerTy(8)) {
        const unsigned NumElts = VecTy->getNumElements();
        // Ceiling division by 4: number of 4 x i8 groups needed.
        return ((NumElts - 1) / 4) + 1;
      }
    }
  }
  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}
1445+
1446+
// Number of parts the type Tp is split into. Fixed-width i8 vectors are
// counted in groups of up to four elements (e.g. <4 x i8> is 1 part,
// <5 x i8> is 2 parts). Every other type, including scalable vectors, is
// forwarded to the base implementation.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
  // Match FixedVectorType rather than VectorType: a scalable vector has no
  // compile-time element count, and querying its fixed value asserts.
  if (auto *VecTy = dyn_cast<FixedVectorType>(Tp)) {
    if (VecTy->getElementType()->isIntegerTy(8)) {
      const unsigned NumElts = VecTy->getNumElements();
      // Ceiling division by 4: number of 4 x i8 groups needed.
      return ((NumElts - 1) / 4) + 1;
    }
  }
  return BaseT::getNumberOfParts(Tp);
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
282282
void collectKernelLaunchBounds(
283283
const Function &F,
284284
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
285+
286+
/// Account for the reduced cost of loading i8 vector types. For
287+
/// example, the cost of loading four i8 values is one, the same as the
288+
/// cost of loading a single i32 value.
289+
InstructionCost getMemoryOpCost(
290+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
291+
TTI::TargetCostKind CostKind,
292+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
293+
const Instruction *I = nullptr);
294+
295+
/// When counting parts on AMD GPUs, account for i8s being grouped
296+
/// together under a single i32 value. Otherwise fall back to base
297+
/// implementation.
298+
unsigned getNumberOfParts(Type *Tp);
285299
};
286300

287301
} // end namespace llvm

0 commit comments

Comments
 (0)