@@ -347,7 +347,8 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
347 347 unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
348 348   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
349 349     return 32 * 4 / ElemWidth;
350- // For a given width return the max 0number of elements that can be combined
350+
351+ // For a given width return the max number of elements that can be combined
351 352  // into a wider bit value:
352 353  return (ElemWidth == 8 && ST->has16BitInsts ()) ? 4
353 354         : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
@@ -1266,6 +1267,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1266 1267
1267 1268  if (match (&Op, m_FAbs (m_Value ())) || match (&Op, m_FNeg (m_Value ())))
1268 1269  Ops.push_back (&Op);
1270+
1271+ // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
1272+ // will be optimized away, and sinking them can help SDAG combines.
1273+ DataLayout DL = I->getModule ()->getDataLayout ();
1274+ auto IsFreeExtractInsert = [&DL, this ](VectorType *VecType,
1275+ unsigned VecIndex) {
1276+ unsigned EltSize = DL.getTypeSizeInBits (VecType->getElementType ());
1277+ return EltSize >= 32 ||
1278+ (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts ());
1279+ };
1280+
1281+ uint64_t VecIndex;
1282+ Value *Vec;
1283+ if (match (Op.get (), m_ExtractElt (m_Value (Vec), m_ConstantInt (VecIndex)))) {
1284+ Instruction *VecOpInst =
1285+ dyn_cast<Instruction>(cast<Instruction>(Op.get ())->getOperand (0 ));
1286+ // If a zero cost extractvector instruction is the only use of the vector,
1287+ // then it may be combined with the def.
1288+ if (VecOpInst && VecOpInst->hasOneUse ())
1289+ continue ;
1290+
1291+ if (IsFreeExtractInsert (cast<VectorType>(Vec->getType ()), VecIndex))
1292+ Ops.push_back (&Op);
1293+
1294+ continue ;
1295+ }
1296+
1297+ if (match (Op.get (),
1298+ m_InsertElt (m_Value (Vec), m_Value (), m_ConstantInt (VecIndex)))) {
1299+ if (IsFreeExtractInsert (cast<VectorType>(Vec->getType ()), VecIndex))
1300+ Ops.push_back (&Op);
1301+
1302+ continue ;
1303+ }
1304+
1305+ if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get ())) {
1306+ if (Shuffle->isIdentity ()) {
1307+ Ops.push_back (&Op);
1308+ continue ;
1309+ }
1310+
1311+ unsigned EltSize = DL.getTypeSizeInBits (
1312+ cast<VectorType>(cast<VectorType>(Shuffle->getType ()))
1313+ ->getElementType ());
1314+
1315+ // For i32 (or greater) shufflevectors, these will be lowered into a
1316+ // series of insert / extract elements, which will be coalesced away.
1317+ if (EltSize >= 32 ) {
1318+ Ops.push_back (&Op);
1319+ continue ;
1320+ }
1321+
1322+ if (EltSize < 16 || !ST->has16BitInsts ())
1323+ continue ;
1324+
1325+ int NumSubElts, SubIndex;
1326+ if (Shuffle->changesLength ()) {
1327+ if (Shuffle->increasesLength () && Shuffle->isIdentityWithPadding ()) {
1328+ Ops.push_back (&Op);
1329+ continue ;
1330+ }
1331+
1332+ if (Shuffle->isExtractSubvectorMask (SubIndex) ||
1333+ Shuffle->isInsertSubvectorMask (NumSubElts, SubIndex)) {
1334+ if (!(SubIndex % 2 )) {
1335+ Ops.push_back (&Op);
1336+ continue ;
1337+ }
1338+ }
1339+ }
1340+
1341+ if (Shuffle->isReverse () || Shuffle->isZeroEltSplat () ||
1342+ Shuffle->isSingleSource ()) {
1343+ Ops.push_back (&Op);
1344+ continue ;
1345+ }
1346+
1347+ if (Shuffle->isInsertSubvectorMask (NumSubElts, SubIndex)) {
1348+ if (!(SubIndex % 2 )) {
1349+ Ops.push_back (&Op);
1350+ continue ;
1351+ }
1352+ }
1353+ }
1269 1354  }
1270 1355
1271 1356  return !Ops.empty ();
0 commit comments