@@ -344,9 +344,10 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
// Returns an element count derived from the element bit width (ElemWidth) and
// the IR opcode (Opcode).
//
// NOTE: this span is a diff hunk. Lines marked '-' are the pre-patch return
// expression; lines marked '+' are the post-patch one, which additionally
// yields 4 when ElemWidth == 8.
//
// - Load/Store opcodes: 32 * 4 / ElemWidth, i.e. how many elements fit in
//   128 bits.
// - Otherwise (post-patch): 4 for 8-bit elements; 2 for 16-bit elements when
//   the subtarget reports has16BitInsts(); 2 for 32-bit elements when it
//   reports hasPackedFP32Ops(); 1 in every other case.
//
// NOTE(review): the 8-bit case is not gated on any subtarget query, unlike the
// 16- and 32-bit cases -- confirm this is intentional.
// NOTE(review): the Load/Store path divides by ElemWidth; presumably callers
// never pass 0 -- verify.
344344unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345345 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346346 return 32 * 4 / ElemWidth;
347- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349- : 1 ;
347+ return ElemWidth == 8 ? 4
348+ : (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
349+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
350+ : 1 ;
350351}
351352
352353unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1120,6 +1121,17 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
11201121 }
11211122}
11221123
1124+ InstructionCost GCNTTIImpl::getScalarizationOverhead (
1125+ VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract,
1126+ TTI::TargetCostKind CostKind, ArrayRef<Value *> VL) {
1127+ unsigned NumVectorElts = cast<FixedVectorType>(InTy)->getNumElements ();
1128+ if (NumVectorElts > 1 &&
1129+ InTy->getElementType () == IntegerType::getInt8Ty (InTy->getContext ()))
1130+ return 0 ;
1131+ return BaseT::getScalarizationOverhead (InTy, DemandedElts, Insert, Extract,
1132+ CostKind, VL);
1133+ }
1134+
11231135InstructionCost GCNTTIImpl::getShuffleCost (TTI::ShuffleKind Kind,
11241136 VectorType *VT, ArrayRef<int > Mask,
11251137 TTI::TargetCostKind CostKind,
@@ -1134,6 +1146,11 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11341146 // Larger vector widths may require additional instructions, but are
11351147 // typically cheaper than scalarized versions.
11361148 unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
1149+
1150+ if (NumVectorElts > 1 &&
1151+ VT->getElementType () == IntegerType::getInt8Ty (VT->getContext ()))
1152+ return 0 ;
1153+
11371154 if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
11381155 DL.getTypeSizeInBits (VT->getElementType ()) == 16 ) {
11391156 bool HasVOP3P = ST->hasVOP3PInsts ();
@@ -1423,3 +1440,5 @@ void GCNTTIImpl::collectKernelLaunchBounds(
14231440 LB.push_back ({" amdgpu-waves-per-eu[0]" , WavesPerEU.first });
14241441 LB.push_back ({" amdgpu-waves-per-eu[1]" , WavesPerEU.second });
14251442}
1443+
1444+ bool GCNTTIImpl::canVectorizei8s () const { return true ; }
0 commit comments