Skip to content

Commit bdad897

Browse files
author
Salinas, David
authored
[AMDGPU] Enable i8 vectorization and sinking of free vector ops (llvm#3971) (llvm#4351)
2 parents e442d97 + 3d9374a commit bdad897

File tree

5 files changed

+1290
-1141
lines changed

5 files changed

+1290
-1141
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 86 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,8 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
347347
unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
348348
if (Opcode == Instruction::Load || Opcode == Instruction::Store)
349349
return 32 * 4 / ElemWidth;
350-
// For a given width return the max 0number of elements that can be combined
350+
351+
// For a given width return the max number of elements that can be combined
351352
// into a wider bit value:
352353
return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
353354
: (ElemWidth == 16 && ST->has16BitInsts()) ? 2
@@ -1266,6 +1267,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
12661267

12671268
if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
12681269
Ops.push_back(&Op);
1270+
1271+
// Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
1272+
// will be optimized away, and sinking them can help SDAG combines.
1273+
DataLayout DL = I->getModule()->getDataLayout();
1274+
auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
1275+
unsigned VecIndex) {
1276+
unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
1277+
return EltSize >= 32 ||
1278+
(EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
1279+
};
1280+
1281+
uint64_t VecIndex;
1282+
Value *Vec;
1283+
if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
1284+
Instruction *VecOpInst =
1285+
dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
1286+
// If a zero cost extractelement instruction is the only use of the vector,
1287+
// then it may be combined with the def.
1288+
if (VecOpInst && VecOpInst->hasOneUse())
1289+
continue;
1290+
1291+
if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
1292+
Ops.push_back(&Op);
1293+
1294+
continue;
1295+
}
1296+
1297+
if (match(Op.get(),
1298+
m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
1299+
if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
1300+
Ops.push_back(&Op);
1301+
1302+
continue;
1303+
}
1304+
1305+
if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1306+
if (Shuffle->isIdentity()) {
1307+
Ops.push_back(&Op);
1308+
continue;
1309+
}
1310+
1311+
unsigned EltSize = DL.getTypeSizeInBits(
1312+
cast<VectorType>(cast<VectorType>(Shuffle->getType()))
1313+
->getElementType());
1314+
1315+
// For i32 (or greater) shufflevectors, these will be lowered into a
1316+
// series of insert / extract elements, which will be coalesced away.
1317+
if (EltSize >= 32) {
1318+
Ops.push_back(&Op);
1319+
continue;
1320+
}
1321+
1322+
if (EltSize < 16 || !ST->has16BitInsts())
1323+
continue;
1324+
1325+
int NumSubElts, SubIndex;
1326+
if (Shuffle->changesLength()) {
1327+
if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1328+
Ops.push_back(&Op);
1329+
continue;
1330+
}
1331+
1332+
if (Shuffle->isExtractSubvectorMask(SubIndex) ||
1333+
Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
1334+
if (!(SubIndex % 2)) {
1335+
Ops.push_back(&Op);
1336+
continue;
1337+
}
1338+
}
1339+
}
1340+
1341+
if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1342+
Shuffle->isSingleSource()) {
1343+
Ops.push_back(&Op);
1344+
continue;
1345+
}
1346+
1347+
if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
1348+
if (!(SubIndex % 2)) {
1349+
Ops.push_back(&Op);
1350+
continue;
1351+
}
1352+
}
1353+
}
12691354
}
12701355

12711356
return !Ops.empty();

0 commit comments

Comments
 (0)