
Commit 1fa3c83

[AMDGPU] Enable i8 vectorization and sinking of free vector ops (llvm#3971)
2 parents: ee329a0 + 907b1e4

File tree: 10 files changed, +1609 -1668 lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 118 additions & 24 deletions
@@ -313,24 +313,6 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
-unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
-  // For certain 8 bit ops, we can pack a v4i8 into a single part
-  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
-  // do not limit the numberOfParts for 8 bit vectors to the
-  // legalization costs of such. It is left up to other target
-  // queries (e.g. get*InstrCost) to decide the proper handling
-  // of 8 bit vectors.
-  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
-    if (ST->shouldCoerceIllegalTypes() &&
-        DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
-      unsigned ElCount = VTy->getElementCount().getFixedValue();
-      return std::max(UINT64_C(1), PowerOf2Ceil(ElCount / 4));
-    }
-  }
-
-  return BaseT::getNumberOfParts(Tp);
-}
-
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
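
For context on the heuristic removed here: it packed up to four i8 elements per 32-bit part and rounded the part count up to a power of two. A minimal stand-alone sketch of that arithmetic (illustrative only; powerOf2Ceil is a local stand-in for llvm::PowerOf2Ceil, and the element counts are made-up examples):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Local stand-in for llvm::PowerOf2Ceil: smallest power of two >= V (0 -> 0).
static uint64_t powerOf2Ceil(uint64_t V) {
  if (V <= 1)
    return V;
  uint64_t P = 1;
  while (P < V)
    P <<= 1;
  return P;
}

// Mirrors the deleted getNumberOfParts logic for 8-bit element vectors:
// pack up to four i8s per 32-bit part, never reporting fewer than one part.
static uint64_t oldI8NumberOfParts(uint64_t ElCount) {
  return std::max<uint64_t>(1, powerOf2Ceil(ElCount / 4));
}

int main() {
  assert(oldI8NumberOfParts(4) == 1); // v4i8 fits in one 32-bit part
  assert(oldI8NumberOfParts(8) == 2); // v8i8 -> two parts
  assert(oldI8NumberOfParts(2) == 1); // sub-dword vectors still cost one part
  return 0;
}

A reworked version of this hook reappears at the end of this file (see the last hunk below).
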
@@ -363,10 +345,12 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
 
-  return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4
-         : (ElemWidth == 16)                                ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops())      ? 2
-                                                            : 1;
+  // For a given width return the max number of elements that can be combined
+  // into a wider bit value:
+  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
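
The behavioral change in this hunk: 8-bit vectorization is now gated on has16BitInsts() rather than the shouldCoerceIllegalTypes() flag, and 16-bit vectorization picks up the same gate. A minimal model of the updated mapping (a sketch, not the committed code; the two bool parameters stand in for the subtarget queries):

#include <cassert>

// Models the updated GCNTTIImpl::getMaximumVF for non-memory opcodes: the
// widest group of elements that can be packed into one wider bit value.
static unsigned maxVFModel(unsigned ElemWidth, bool Has16BitInsts,
                           bool HasPackedFP32Ops) {
  return (ElemWidth == 8 && Has16BitInsts)       ? 4 // four i8s per dword
         : (ElemWidth == 16 && Has16BitInsts)    ? 2 // two halves per dword
         : (ElemWidth == 32 && HasPackedFP32Ops) ? 2 // packed f32 pairs
                                                  : 1;
}

int main() {
  assert(maxVFModel(8, /*Has16BitInsts=*/true, false) == 4);
  assert(maxVFModel(8, /*Has16BitInsts=*/false, false) == 1);
  assert(maxVFModel(16, /*Has16BitInsts=*/true, false) == 2);
  assert(maxVFModel(32, true, /*HasPackedFP32Ops=*/true) == 2);
  return 0;
}
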
@@ -1176,8 +1160,7 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      (ScalarSize == 16 ||
-       (ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) {
+      (ScalarSize == 16 || ScalarSize == 8)) {
     // Larger vector widths may require additional instructions, but are
     // typically cheaper than scalarized versions.
     unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
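
This hunk drops the shouldCoerceIllegalTypes() gate so 8-bit shuffles are costed like 16-bit ones on VOLCANIC_ISLANDS and later, where a shuffle of packed bytes can often be done with byte-permute operations instead of scalarization. A rough software model of the byte-permute idea (illustrative only; the real V_PERM_B32 selector encoding and its special selector values differ from this simplification):

#include <cassert>
#include <cstdint>

// Rough model of a byte permute over two 32-bit sources: each selector entry
// picks one of the eight input bytes. A v4i8 shuffle of two packed sources
// becomes a single operation of this shape rather than four scalar moves.
static uint32_t bytePerm(uint32_t Src0, uint32_t Src1, const uint8_t Sel[4]) {
  uint8_t Bytes[8];
  for (int i = 0; i < 4; ++i) {
    Bytes[i] = (Src0 >> (8 * i)) & 0xff;     // bytes 0..3 from Src0
    Bytes[4 + i] = (Src1 >> (8 * i)) & 0xff; // bytes 4..7 from Src1
  }
  uint32_t Out = 0;
  for (int i = 0; i < 4; ++i)
    Out |= uint32_t(Bytes[Sel[i] & 7]) << (8 * i);
  return Out;
}

int main() {
  // Interleave the low two bytes of each source in one permute.
  const uint8_t Sel[4] = {0, 4, 1, 5};
  assert(bytePerm(0x04030201, 0x08070605, Sel) == 0x06020501);
  return 0;
}
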
@@ -1239,6 +1222,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
 
     if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
       Ops.push_back(&Op);
+
+    // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
+    // will be optimized away, and sinking them can help SDAG combines.
+    const DataLayout &DL = I->getModule()->getDataLayout();
+    auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
+                                           unsigned VecIndex) {
+      unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
+      return EltSize >= 32 ||
+             (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
+    };
+
+    uint64_t VecIndex;
+    Value *Vec;
+    if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
+      Instruction *VecOpInst =
+          dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
+      // If a zero cost extractelement instruction is the only use of the
+      // vector, then it may be combined with the def.
+      if (VecOpInst && VecOpInst->hasOneUse())
+        continue;
+
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (match(Op.get(),
+              m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+      if (Shuffle->isIdentity()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      unsigned EltSize = DL.getTypeSizeInBits(
+          cast<VectorType>(Shuffle->getType())->getElementType());
+
+      // For i32 (or greater) shufflevectors, these will be lowered into a
+      // series of insert / extract elements, which will be coalesced away.
+      if (EltSize >= 32) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (EltSize < 16 || !ST->has16BitInsts())
+        continue;
+
+      int NumSubElts, SubIndex;
+      if (Shuffle->changesLength()) {
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+          Ops.push_back(&Op);
+          continue;
+        }
+
+        if (Shuffle->isExtractSubvectorMask(SubIndex) ||
+            Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+          if (!(SubIndex % 2)) {
+            Ops.push_back(&Op);
+            continue;
+          }
+        }
+      }
+
+      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+          Shuffle->isSingleSource()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+        if (!(SubIndex % 2)) {
+          Ops.push_back(&Op);
+          continue;
+        }
+      }
+    }
   }
 
   return !Ops.empty();
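
The heart of the new sinking rules is the IsFreeExtractInsert predicate: an extract or insert is free when the element fills a full 32-bit lane, or when it is the low 16-bit half of a dword on subtargets with 16-bit instructions. Likewise, the SubIndex % 2 checks above accept only even subvector offsets, which keep 16-bit elements dword-aligned. A stand-alone restatement of the predicate (a sketch under those assumptions, not the committed code; Has16BitInsts stands in for the subtarget query):

#include <cassert>

// An extract/insert is "free" when the element occupies a full 32-bit lane,
// or when it is a 16-bit element at index 0 (the low half of a dword) on
// subtargets with 16-bit instructions.
static bool isFreeExtractInsert(unsigned EltSizeInBits, unsigned VecIndex,
                                bool Has16BitInsts) {
  return EltSizeInBits >= 32 ||
         (EltSizeInBits == 16 && VecIndex == 0 && Has16BitInsts);
}

int main() {
  assert(isFreeExtractInsert(32, 3, false)); // i32 lanes are always free
  assert(isFreeExtractInsert(16, 0, true));  // low half of a dword
  assert(!isFreeExtractInsert(16, 1, true)); // high half needs extra work
  assert(!isFreeExtractInsert(8, 0, true));  // i8 extracts are never free here
  return 0;
}
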
@@ -1452,3 +1519,30 @@ unsigned GCNTTIImpl::getPrefetchDistance() const {
 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
   return AMDGPU::isFlatGlobalAddrSpace(AS);
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType()->isIntegerTy(8)) {
+      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
+                        getLoadStoreVecRegBitWidth(AddressSpace));
+    }
+  }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+    if (VecTy->getElementType()->isIntegerTy(8)) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return divideCeil(ElementCount - 1, 4);
+    }
+  }
+  return BaseT::getNumberOfParts(Tp);
+}
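
A worked example of the two new hooks, assuming a 128-bit result from getLoadStoreVecRegBitWidth (the actual width varies by address space); divideCeilModel is a local stand-in for llvm::divideCeil:

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::divideCeil: rounds the quotient up.
static uint64_t divideCeilModel(uint64_t Num, uint64_t Den) {
  return (Num + Den - 1) / Den;
}

int main() {
  // getMemoryOpCost for i8 vectors: divideCeil(bits - 1, regBitWidth).
  assert(divideCeilModel(4 * 8 - 1, 128) == 1);  // v4i8 costs like one dword
  assert(divideCeilModel(32 * 8 - 1, 128) == 2); // v32i8 needs two 128-bit ops

  // getNumberOfParts for i8 vectors: divideCeil(NumElts - 1, 4).
  assert(divideCeilModel(4 - 1, 4) == 1); // v4i8 -> one packed dword part
  assert(divideCeilModel(8 - 1, 4) == 2); // v8i8 -> two parts
  return 0;
}
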

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 14 additions & 1 deletion
@@ -118,7 +118,6 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
     return TTI::PSK_FastHardware;
   }
 
-  unsigned getNumberOfParts(Type *Tp);
   unsigned getNumberOfRegisters(unsigned RCID) const;
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
   unsigned getMinVectorRegisterBitWidth() const;
@@ -278,6 +277,20 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
 
   /// \return if target wants to issue a prefetch in address space \p AS.
   bool shouldPrefetchAddressSpace(unsigned AS) const override;
+
+  /// Account for loads of i8 vector types to have reduced cost. For
+  /// example, the cost of loading 4 i8 values is the cost of loading a
+  /// single i32 value.
+  InstructionCost getMemoryOpCost(
+      unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+      TTI::TargetCostKind CostKind,
+      TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
+      const Instruction *I = nullptr);
+
+  /// When counting parts on AMD GPUs, account for i8s being grouped
+  /// together under a single i32 value. Otherwise fall back to the base
+  /// implementation.
+  unsigned getNumberOfParts(Type *Tp);
 };
 
 } // end namespace llvm
