@@ -313,24 +313,6 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
   return !F || !ST->isSingleLaneExecution(*F);
 }
 
-unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
-  // For certain 8 bit ops, we can pack a v4i8 into a single part
-  // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
-  // do not limit the numberOfParts for 8 bit vectors to the
-  // legalization costs of such. It is left up to other target
-  // queries (e.g. get*InstrCost) to decide the proper handling
-  // of 8 bit vectors.
-  if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
-    if (ST->shouldCoerceIllegalTypes() &&
-        DL.getTypeSizeInBits(VTy->getElementType()) == 8) {
-      unsigned ElCount = VTy->getElementCount().getFixedValue();
-      return std::max(UINT64_C(1), PowerOf2Ceil(ElCount / 4));
-    }
-  }
-
-  return BaseT::getNumberOfParts(Tp);
-}
-
 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
   // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
   // registers. See getRegisterClassForType for the implementation.
@@ -363,10 +345,12 @@ unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
   if (Opcode == Instruction::Load || Opcode == Instruction::Store)
     return 32 * 4 / ElemWidth;
 
-  return (ST->shouldCoerceIllegalTypes() && ElemWidth == 8) ? 4
-         : (ElemWidth == 16)                                ? 2
-         : (ElemWidth == 32 && ST->hasPackedFP32Ops())      ? 2
-                                                            : 1;
+  // For a given width return the max number of elements that can be combined
+  // into a wider bit value:
+  return (ElemWidth == 8 && ST->has16BitInsts())       ? 4
+         : (ElemWidth == 16 && ST->has16BitInsts())    ? 2
+         : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
+                                                       : 1;
 }
 
 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
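For context, a minimal standalone sketch of the width-to-VF mapping the new ternary chain encodes (not part of the patch; `maxElemsForWidth` and the plain bool flags are hypothetical stand-ins for the `ST->has16BitInsts()` / `ST->hasPackedFP32Ops()` subtarget queries):

// Hypothetical illustration only: mirrors the arithmetic of the new ternary
// chain in getMaximumVF, with plain bools standing in for the subtarget hooks.
#include <cstdio>

static unsigned maxElemsForWidth(unsigned ElemWidth, bool Has16BitInsts,
                                 bool HasPackedFP32Ops) {
  return (ElemWidth == 8 && Has16BitInsts)     ? 4  // four i8 per 32-bit value
         : (ElemWidth == 16 && Has16BitInsts)  ? 2  // two 16-bit values per 32-bit reg
         : (ElemWidth == 32 && HasPackedFP32Ops) ? 2
                                                  : 1;
}

int main() {
  std::printf("i8 : %u\n", maxElemsForWidth(8, true, false));   // 4
  std::printf("i16: %u\n", maxElemsForWidth(16, true, false));  // 2
  std::printf("f32: %u\n", maxElemsForWidth(32, true, true));   // 2
  std::printf("i64: %u\n", maxElemsForWidth(64, true, true));   // 1
}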
@@ -1176,8 +1160,7 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
 
   unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
   if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
-      (ScalarSize == 16 ||
-       (ScalarSize == 8 && ST->shouldCoerceIllegalTypes()))) {
+      (ScalarSize == 16 || ScalarSize == 8)) {
     // Larger vector widths may require additional instructions, but are
     // typically cheaper than scalarized versions.
     unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
@@ -1452,3 +1435,30 @@ unsigned GCNTTIImpl::getPrefetchDistance() const {
 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
   return AMDGPU::isFlatGlobalAddrSpace(AS);
 }
+
+InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                            Align Alignment,
+                                            unsigned AddressSpace,
+                                            TTI::TargetCostKind CostKind,
+                                            TTI::OperandValueInfo OpInfo,
+                                            const Instruction *I) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
+    if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+        VecTy->getElementType()->isIntegerTy(8)) {
+      return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
+                        getLoadStoreVecRegBitWidth(AddressSpace));
+    }
+  }
+  return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
+                                OpInfo, I);
+}
+
+unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) {
+  if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
+    if (VecTy->getElementType()->isIntegerTy(8)) {
+      unsigned ElementCount = VecTy->getElementCount().getFixedValue();
+      return divideCeil(ElementCount - 1, 4);
+    }
+  }
+  return BaseT::getNumberOfParts(Tp);
+}
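As a rough standalone illustration of the new cost arithmetic (not part of the patch): the local `divideCeil` mirrors `llvm::divideCeil` from `llvm/Support/MathExtras.h`, and the 128-bit register width is only an assumed example value for `getLoadStoreVecRegBitWidth(AddressSpace)`:

// Standalone sketch of the new i8-vector cost arithmetic; divideCeil mirrors
// llvm::divideCeil, and VecRegBits = 128 is an assumed example register width.
#include <cstdio>
#include <initializer_list>

static unsigned divideCeil(unsigned Numerator, unsigned Denominator) {
  return (Numerator + Denominator - 1) / Denominator;
}

int main() {
  const unsigned VecRegBits = 128; // assumed value for the example

  // getMemoryOpCost for an i8 vector load/store: cost grows with the number
  // of vector registers the value spans.
  for (unsigned NumElts : {4u, 8u, 16u, 32u}) {
    unsigned Bits = NumElts * 8;
    std::printf("v%ui8 load/store cost: %u\n", NumElts,
                divideCeil(Bits - 1, VecRegBits));
  }

  // getNumberOfParts for an i8 vector: roughly one part per four i8 elements.
  for (unsigned NumElts : {4u, 8u, 16u, 32u}) {
    std::printf("v%ui8 parts: %u\n", NumElts, divideCeil(NumElts - 1, 4));
  }
}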