Skip to content

Commit 7618d36

Browse files
committed
Enable vectorization of i8 values.
1 parent 661f90a commit 7618d36

File tree

4 files changed

+1525
-3
lines changed

4 files changed

+1525
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344
/// Returns the maximum vectorization factor for elements of \p ElemWidth bits
/// used by an instruction with opcode \p Opcode.
///
/// Loads and stores yield 32 * 4 / ElemWidth. For every other opcode, the
/// added lines (marked `+`) yield 4 for 8-bit elements, 2 for 16-bit elements
/// when ST->has16BitInsts(), 2 for 32-bit elements when
/// ST->hasPackedFP32Ops(), and 1 otherwise. The removed lines (marked `-`)
/// are the previous form of the same return statement without the 8-bit case.
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
345345
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346
return 32 * 4 / ElemWidth;
347-
return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
348-
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
349-
: 1;
347+
return ElemWidth == 8 ? 4
348+
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
349+
: (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
350+
: 1;
350351
}
351352

352353
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -1423,3 +1424,32 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14231424
LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
14241425
LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
14251426
}
1427+
1428+
/// Returns the cost of the memory operation \p Opcode on type \p Src.
///
/// Loads of fixed-width vectors of i8 get a reduced cost: one unit per
/// load/store vector register (of getLoadStoreVecRegBitWidth(AddressSpace)
/// bits) needed to hold the 8-bit elements. Every other query, including
/// stores, non-i8 vectors, scalars and scalable vectors, is forwarded
/// unchanged to the base implementation.
InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            Align Alignment,
                                            unsigned AddressSpace,
                                            TTI::TargetCostKind CostKind,
                                            TTI::OperandValueInfo OpInfo,
                                            const Instruction *I) {
  // Restrict the special case to FixedVectorType: asking a scalable vector
  // for a fixed element count asserts, so such types defer to the base cost.
  const auto *VecTy = dyn_cast<FixedVectorType>(Src);
  if (Opcode == Instruction::Load && VecTy &&
      VecTy->getElementType()->isIntegerTy(8)) {
    const unsigned NumElts = VecTy->getNumElements();
    const unsigned RegBitWidth = getLoadStoreVecRegBitWidth(AddressSpace);
    // Vector types always have at least one element, so NumElts - 1 cannot
    // wrap. The expression equals ceil(8 * NumElts / RegBitWidth) whenever
    // RegBitWidth is a multiple of 8.
    return (8 * (NumElts - 1) / RegBitWidth) + 1;
  }
  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
                                OpInfo, I);
}
1446+
1447+
/// Returns the number of parts that \p Tp is split into.
///
/// Fixed-width vectors of i8 are counted in groups of four elements, i.e.
/// ceil(NumElts / 4) parts. Every other type, including scalable vectors, is
/// forwarded to the base implementation.
unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
  // Restrict the special case to FixedVectorType: asking a scalable vector
  // for a fixed element count asserts, so such types defer to the base.
  if (const auto *VecTy = dyn_cast<FixedVectorType>(Tp)) {
    if (VecTy->getElementType()->isIntegerTy(8)) {
      // Vector types always have at least one element, so the subtraction
      // cannot wrap.
      const unsigned NumElts = VecTy->getNumElements();
      return ((NumElts - 1) / 4) + 1;
    }
  }
  return BaseT::getNumberOfParts(Tp);
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
282282
void collectKernelLaunchBounds(
283283
const Function &F,
284284
SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const;
285+
286+
/// Account for loads of i8 vector types to have reduced cost. For
287+
/// example, the cost of loading four i8 values is the same as the cost of
288+
/// loading a single i32 value.
289+
InstructionCost getMemoryOpCost(
290+
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
291+
TTI::TargetCostKind CostKind,
292+
TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
293+
const Instruction *I = nullptr);
294+
295+
/// When counting parts on AMD GPUs, account for i8s being grouped
296+
/// together under a single i32 value. Otherwise fall back to base
297+
/// implementation.
298+
unsigned getNumberOfParts(Type *Tp);
285299
};
286300

287301
} // end namespace llvm

0 commit comments

Comments
 (0)