@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344344unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346 return 32 * 4 / ElemWidth;
347- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349- : 1 ;
347+ return ElemWidth == 8 ? 4
348+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
349+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
350+ : 1 ;
350351}
351352
352353unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -562,6 +563,7 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
562563 if (ST->has16BitInsts () && SLT == MVT::i16 )
563564 NElts = (NElts + 1 ) / 2 ;
564565
566+ // i32
565567 return LT.first * NElts * getFullRateInstrCost ();
566568 case ISD::MUL: {
567569 const int QuarterRateCost = getQuarterRateInstrCost (CostKind);
@@ -1423,3 +1425,30 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14231425 LB.push_back ({" amdgpu-waves-per-eu[0]" , WavesPerEU.first });
14241426 LB.push_back ({" amdgpu-waves-per-eu[1]" , WavesPerEU.second });
14251427}
1428+
1429+ InstructionCost GCNTTIImpl::getMemoryOpCost (unsigned Opcode, Type *Src,
1430+ Align Alignment,
1431+ unsigned AddressSpace,
1432+ TTI::TargetCostKind CostKind,
1433+ TTI::OperandValueInfo OpInfo,
1434+ const Instruction *I) {
1435+ if (VectorType *VecTy = dyn_cast<VectorType>(Src))
1436+ if (Opcode == Instruction::Load &&
1437+ VecTy->getElementType () ==
1438+ IntegerType::getInt8Ty (VecTy->getContext ())) {
1439+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1440+ return ((ElementCount - 1 ) / 4 ) + 1 ;
1441+ }
1442+ return BaseT::getMemoryOpCost (Opcode, Src, Alignment, AddressSpace, CostKind,
1443+ OpInfo, I);
1444+ }
1445+
1446+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) {
1447+ if (VectorType *VecTy = dyn_cast<VectorType>(Tp))
1448+ if (VecTy->getElementType () ==
1449+ IntegerType::getInt8Ty (VecTy->getContext ())) {
1450+ unsigned ElementCount = VecTy->getElementCount ().getFixedValue ();
1451+ return ((ElementCount - 1 ) / 4 ) + 1 ;
1452+ }
1453+ return BaseT::getNumberOfParts (Tp);
1454+ }
0 commit comments