@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346 return 32 * 4 / ElemWidth;
347- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349- : 1 ;
347+ return ElemWidth == 8 ? 4
348+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
349+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
350+ : 1 ;
350351}
351352
352353unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1422,3 +1423,31 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14221423 LB.push_back ({" amdgpu-waves-per-eu[0]" , WavesPerEU.first });
14231424 LB.push_back ({" amdgpu-waves-per-eu[1]" , WavesPerEU.second });
14241425}
1426+
1427+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned Opcode, Type *Src,
1428+ Align Alignment,
1429+ unsigned AddressSpace,
1430+ TTI::TargetCostKind CostKind,
1431+ TTI::OperandValueInfo OpInfo,
1432+ const Instruction *I) {
1433+ if (VectorType *VecTy = dyn_cast<VectorType>(Src))
1434+ if (Opcode == Instruction::Load &&
1435+ VecTy->getElementType () ==
1436+ IntegerType::getInt8Ty (VecTy->getContext ())) {
1437+ return ((DL.getTypeSizeInBits (VecTy) - 1 ) /
1438+ getLoadStoreVecRegBitWidth (AddressSpace)) +
1439+ 1 ;
1440+ }
1441+ return BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1442+ OpInfo, I);
1443+ }
1444+
1445+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) {
1446+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp))
1447+ if (VecTy->getElementType () ==
1448+ IntegerType::getInt8Ty (VecTy->getContext ())) {
1449+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1450+ return ((ElementCount - 1 ) / 4 ) + 1 ;
1451+ }
1452+ return BaseT::getNumberOfParts (Tp);
1453+ }
0 commit comments