@@ -347,7 +347,8 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
-  // For a given width return the max 0number of elements that can be combined
+
+  // For a given width return the max number of elements that can be combined
   // into a wider bit value:
   return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
          : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
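Read as arithmetic, the first hunk's rule is: loads and stores are capped at 128 bits total (32 * 4 / ElemWidth), while other opcodes only combine sub-dword elements when the subtarget has 16-bit instructions; the change itself just drops the stray "0" from the comment and adds a blank line. A minimal standalone sketch of that rule, with isLoadOrStore and has16BitInsts as stand-ins for the opcode and ST->has16BitInsts() checks, and with the ternary's final fallthrough (not shown in the hunk) assumed to be 1:

    // Sketch of the getMaximumVF rule above, outside the LLVM API.
    // Assumes the fallthrough case not shown in the hunk returns 1.
    unsigned maxVF(unsigned ElemWidth, bool isLoadOrStore, bool has16BitInsts) {
      if (isLoadOrStore)
        return 32 * 4 / ElemWidth; // 128-bit cap: 16 x i8, 8 x i16, 4 x i32
      if (ElemWidth == 8 && has16BitInsts)
        return 4; // four i8 lanes pack into one 32-bit value
      if (ElemWidth == 16 && has16BitInsts)
        return 2; // two i16 lanes pack into one 32-bit value
      return 1; // assumed: no packing otherwise
    }

For example, maxVF(16, true, true) is 8: an 8 x i16 access still fits one 128-bit memory operation.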
@@ -1266,6 +1267,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,

     if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
       Ops.push_back(&Op);
+
+    // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
+    // will be optimized away, and sinking them can help SDAG combines.
+    const DataLayout &DL = I->getModule()->getDataLayout();
+    auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
+                                           unsigned VecIndex) {
+      unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
+      return EltSize >= 32 ||
+             (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
+    };
+
+    uint64_t VecIndex;
+    Value *Vec;
+    if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
+      Instruction *VecOpInst =
+          dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
+      // If a zero cost extractelement instruction is the only use of the
+      // vector, then it may be combined with the def.
+      if (VecOpInst && VecOpInst->hasOneUse())
+        continue;
+
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (match(Op.get(),
+              m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+      if (Shuffle->isIdentity()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      unsigned EltSize = DL.getTypeSizeInBits(
+          cast<VectorType>(Shuffle->getType())->getElementType());
+
+      // For i32 (or greater) shufflevectors, these will be lowered into a
+      // series of insert / extract elements, which will be coalesced away.
+      if (EltSize >= 32) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (EltSize < 16 || !ST->has16BitInsts())
+        continue;
+
+      int NumSubElts, SubIndex;
+      if (Shuffle->changesLength()) {
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+          Ops.push_back(&Op);
+          continue;
+        }
+
+        if (Shuffle->isExtractSubvectorMask(SubIndex) ||
+            Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+          if (!(SubIndex % 2)) {
+            Ops.push_back(&Op);
+            continue;
+          }
+        }
+      }
+
+      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+          Shuffle->isSingleSource()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+        if (!(SubIndex % 2)) {
+          Ops.push_back(&Op);
+          continue;
+        }
+      }
+    }
   }

   return !Ops.empty();
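The second hunk carries the substance of the change: isProfitableToSinkOperands now also offers to sink extractelement, insertelement, and shufflevector operands that are expected to fold away during lowering. The pivot is IsFreeExtractInsert: an element of 32 bits or more occupies at least a whole register, and lane 0 of a 16-bit element is the low half of a 32-bit register on subtargets with 16-bit instructions, so neither access needs a real shift or mask. A hedged restatement outside the LLVM types (names here are illustrative, not the patch's API):

    // Restatement of the IsFreeExtractInsert lambda above; eltBits is the
    // element width in bits, index the constant lane number, and
    // has16BitInsts stands in for ST->has16BitInsts().
    bool isFreeExtractInsert(unsigned eltBits, unsigned index,
                             bool has16BitInsts) {
      // >= 32-bit elements map to whole registers; lane 0 of a 16-bit
      // element is the low half of a 32-bit register. Both read for free.
      return eltBits >= 32 || (eltBits == 16 && index == 0 && has16BitInsts);
    }

The same predicate covers inserts as well as extracts, which is why the extractelement and insertelement paths above share it.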
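For shuffles, the decision tree condenses to: identity shuffles and shuffles of 32-bit (or wider) elements always sink, while 16-bit-element shuffles sink only in shapes that keep lanes 32-bit aligned (padding identities, even-indexed subvector extracts and inserts, reverses, splats of lane 0, and single-source masks). A compact sketch under the same caveat that the names are illustrative and the original fall-through order is simplified, with the even-index test presumably encoding 32-bit register alignment of 16-bit lane pairs:

    // Condensed, illustrative summary of the shufflevector cases above;
    // ShuffleKind is a hypothetical classification, not LLVM's API.
    enum class ShuffleKind { Identity, IdentityWithPadding, ExtractSubvector,
                             InsertSubvector, Reverse, ZeroEltSplat,
                             SingleSource, Other };

    bool sinksShuffle(ShuffleKind K, unsigned eltBits, bool has16BitInsts,
                      int subIndex) {
      if (K == ShuffleKind::Identity || eltBits >= 32)
        return true; // folds away entirely after lowering
      if (eltBits < 16 || !has16BitInsts)
        return false; // sub-word shuffles need real instructions
      switch (K) {
      case ShuffleKind::IdentityWithPadding:
      case ShuffleKind::Reverse:
      case ShuffleKind::ZeroEltSplat:
      case ShuffleKind::SingleSource:
        return true;
      case ShuffleKind::ExtractSubvector:
      case ShuffleKind::InsertSubvector:
        return subIndex % 2 == 0; // subvector stays 32-bit aligned
      default:
        return false;
      }
    }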