@@ -313,24 +313,6 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
-unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
-  // For certain 8 bit ops, we can pack a v4i8 into a single part
-  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
-  // do not limit the numberOfParts for 8 bit vectors to the
-  // legalization costs of such. It is left up to other target
-  // queries (e.g. get*InstrCost) to decide the proper handling
-  // of 8 bit vectors.
-  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
-    if (ST->shouldCoerceIllegalTypes() &&
-        DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
-      unsigned ElCount = VTy->getElementCount().getFixedValue();
-      return std::max(UINT64_C(1), PowerOf2Ceil(ElCount / 4));
-    }
-  }
-
-  return BaseT::getNumberOfParts(Tp);
-}
-
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
@@ -363,10 +345,12 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
 
-  return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4
-         : (ElemWidth == 16)                                ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops())      ? 2
-                                                            : 1;
+  // For a given width return the max number of elements that can be combined
+  // into a wider bit value:
+  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
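
As a rough illustration of the new mapping (a standalone sketch, not part of the patch; `Has16Bit` and `HasPackedFP32` are stand-ins for the `ST->has16BitInsts()` and `ST->hasPackedFP32Ops()` subtarget queries), the maximum VF for non-memory opcodes is the number of elements that can be combined into a 32-bit value:

```cpp
// Standalone sketch of the new getMaximumVF selection for non-memory opcodes.
// Has16Bit / HasPackedFP32 are hypothetical stand-ins for the subtarget
// queries ST->has16BitInsts() / ST->hasPackedFP32Ops().
#include <cstdio>

static unsigned maxVFSketch(unsigned ElemWidth, bool Has16Bit,
                            bool HasPackedFP32) {
  return (ElemWidth == 8 && Has16Bit)         ? 4 // four i8 packed into 32 bits
         : (ElemWidth == 16 && Has16Bit)      ? 2 // two i16 packed into 32 bits
         : (ElemWidth == 32 && HasPackedFP32) ? 2 // packed 32-bit pairs
                                              : 1;
}

int main() {
  // On a subtarget with 16-bit instructions and packed FP32 ops:
  std::printf("i8  -> VF %u\n", maxVFSketch(8, true, true));  // 4
  std::printf("i16 -> VF %u\n", maxVFSketch(16, true, true)); // 2
  std::printf("f32 -> VF %u\n", maxVFSketch(32, true, true)); // 2
  std::printf("i64 -> VF %u\n", maxVFSketch(64, true, true)); // 1
  return 0;
}
```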
@@ -1176,8 +1160,7 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      (ScalarSize == 16 ||
-       (ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) {
+      (ScalarSize == 16 || ScalarSize == 8)) {
     // Larger vector widths may require additional instructions, but are
     // typically cheaper than scalarized versions.
     unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
@@ -1239,6 +1222,90 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
 
     if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
       Ops.push_back(&Op);
+
+    // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
+    // will be optimized away, and sinking them can help SDAG combines.
+    DataLayout DL = I->getModule()->getDataLayout();
+    auto IsFreeExtractInsert = [&DL, this](VectorType *VecType,
+                                           unsigned VecIndex) {
+      unsigned EltSize = DL.getTypeSizeInBits(VecType->getElementType());
+      return EltSize >= 32 ||
+             (EltSize == 16 && VecIndex == 0 && ST->has16BitInsts());
+    };
+
+    uint64_t VecIndex;
+    Value *Vec;
+    if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
+      Instruction *VecOpInst =
+          dyn_cast<Instruction>(cast<Instruction>(Op.get())->getOperand(0));
+      // If a zero cost extractelement instruction is the only use of the
+      // vector, then it may be combined with the def.
+      if (VecOpInst && VecOpInst->hasOneUse())
+        continue;
+
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (match(Op.get(),
+              m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
+      if (IsFreeExtractInsert(cast<VectorType>(Vec->getType()), VecIndex))
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+      if (Shuffle->isIdentity()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      unsigned EltSize = DL.getTypeSizeInBits(
+          cast<VectorType>(Shuffle->getType())
+              ->getElementType());
+
+      // For i32 (or greater) shufflevectors, these will be lowered into a
+      // series of insert / extract elements, which will be coalesced away.
+      if (EltSize >= 32) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (EltSize < 16 || !ST->has16BitInsts())
+        continue;
+
+      int NumSubElts, SubIndex;
+      if (Shuffle->changesLength()) {
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+          Ops.push_back(&Op);
+          continue;
+        }
+
+        if (Shuffle->isExtractSubvectorMask(SubIndex) ||
+            Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+          if (!(SubIndex % 2)) {
+            Ops.push_back(&Op);
+            continue;
+          }
+        }
+      }
+
+      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+          Shuffle->isSingleSource()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) {
+        if (!(SubIndex % 2)) {
+          Ops.push_back(&Op);
+          continue;
+        }
+      }
+    }
   }
 
   return !Ops.empty();
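
For reference, a minimal standalone sketch of the free extract/insert predicate used in the hunk above (here `EltSizeInBits` stands in for `DL.getTypeSizeInBits(...)` and `Has16Bit` for `ST->has16BitInsts()`): 32-bit and wider elements are always treated as free to sink, while 16-bit elements are only free at lane 0 on subtargets with 16-bit instructions.

```cpp
// Standalone sketch of the IsFreeExtractInsert heuristic from the hunk above.
// EltSizeInBits stands in for DL.getTypeSizeInBits(ElementType) and Has16Bit
// for ST->has16BitInsts(); both names are illustrative only.
#include <cassert>

static bool isFreeExtractInsertSketch(unsigned EltSizeInBits, unsigned Index,
                                      bool Has16Bit) {
  return EltSizeInBits >= 32 ||
         (EltSizeInBits == 16 && Index == 0 && Has16Bit);
}

int main() {
  assert(isFreeExtractInsertSketch(32, 3, false));  // i32: free at any lane
  assert(isFreeExtractInsertSketch(16, 0, true));   // i16 lane 0, 16-bit insts
  assert(!isFreeExtractInsertSketch(16, 1, true));  // i16 at a non-zero lane
  assert(!isFreeExtractInsertSketch(8, 0, true));   // i8 elements are not free
  return 0;
}
```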
@@ -1452,3 +1519,30 @@ unsigned GCNTTIImpl::getPrefetchDistance() const {
 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
   return AMDGPU::isFlatGlobalAddrSpace(AS);
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType()->isIntegerTy(8)) {
+      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
+                        getLoadStoreVecRegBitWidth(AddressSpace));
+    }
+  }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+    if (VecTy->getElementType()->isIntegerTy(8)) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return divideCeil(ElementCount - 1, 4);
+    }
+  }
+  return BaseT::getNumberOfParts(Tp);
+}
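
A small worked example of the new i8-vector cost arithmetic (a sketch only; the local `divideCeilSketch` mirrors `llvm::divideCeil`, and the 128-bit load/store register width is an assumed value since `getLoadStoreVecRegBitWidth()` depends on the address space):

```cpp
// Worked example of the i8-vector cost arithmetic added above. The local
// divideCeilSketch mirrors llvm::divideCeil (ceiling division); the 128-bit
// load/store register width is an assumption for illustration.
#include <cstdio>

static unsigned divideCeilSketch(unsigned Numerator, unsigned Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

int main() {
  const unsigned LoadStoreRegBits = 128; // assumed getLoadStoreVecRegBitWidth()

  // getNumberOfParts for i8 vectors: divideCeil(NumElts - 1, 4).
  std::printf("v4i8  -> %u part(s)\n", divideCeilSketch(4 - 1, 4));  // 1
  std::printf("v8i8  -> %u part(s)\n", divideCeilSketch(8 - 1, 4));  // 2

  // getMemoryOpCost for i8 vector loads/stores:
  // divideCeil(TypeSizeInBits - 1, LoadStoreVecRegBitWidth).
  std::printf("v16i8 load -> cost %u\n",
              divideCeilSketch(16 * 8 - 1, LoadStoreRegBits));       // 1
  std::printf("v32i8 load -> cost %u\n",
              divideCeilSketch(32 * 8 - 1, LoadStoreRegBits));       // 2
  return 0;
}
```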