@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346 return 32 * 4 / ElemWidth;
347- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349- : 1 ;
347+ return ElemWidth == 8 ? 4
348+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
349+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
350+ : 1 ;
350351}
351352
352353unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1423,3 +1424,32 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14231424 LB.push_back ({" amdgpu-waves-per-eu[0]" , WavesPerEU.first });
14241425 LB.push_back ({" amdgpu-waves-per-eu[1]" , WavesPerEU.second });
14251426}
1427+
1428+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned Opcode, Type *Src,
1429+ Align Alignment,
1430+ unsigned AddressSpace,
1431+ TTI::TargetCostKind CostKind,
1432+ TTI::OperandValueInfo OpInfo,
1433+ const Instruction *I) {
1434+ if (VectorType *VecTy = dyn_cast<VectorType>(Src))
1435+ if (Opcode == Instruction::Load &&
1436+ VecTy->getElementType () ==
1437+ IntegerType::getInt8Ty (VecTy->getContext ())) {
1438+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1439+ return (8 * (ElementCount - 1 ) /
1440+ getLoadStoreVecRegBitWidth (AddressSpace)) +
1441+ 1 ;
1442+ }
1443+ return BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1444+ OpInfo, I);
1445+ }
1446+
1447+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) {
1448+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp))
1449+ if (VecTy->getElementType () ==
1450+ IntegerType::getInt8Ty (VecTy->getContext ())) {
1451+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1452+ return ((ElementCount - 1 ) / 4 ) + 1 ;
1453+ }
1454+ return BaseT::getNumberOfParts (Tp);
1455+ }
0 commit comments