@@ -313,6 +313,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
313313 return !F || !ST->isSingleLaneExecution (*F);
314314}
315315
316+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) {
317+ // For certain 8 bit ops, we can pack a v4i8 into a single part
318+ // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
319+ // do not limit the numberOfParts for 8 bit vectors to the
320+ // legalization costs of such. It is left up to other target
321+ // queries (e.g. get*InstrCost) to decide the proper handling
322+ // of 8 bit vectors.
323+ if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
324+ if (DL.getTypeSizeInBits (VTy->getElementType ()) == 8 ) {
325+ unsigned ElCount = VTy->getElementCount ().getFixedValue ();
326+ return PowerOf2Ceil (ElCount / 4 );
327+ }
328+ }
329+
330+ return BaseT::getNumberOfParts (Tp);
331+ }
332+
316333unsigned GCNTTIImpl::getNumberOfRegisters (unsigned RCID) const {
317334 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
318335 // registers. See getRegisterClassForType for the implementation.
@@ -344,9 +361,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344361unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345362 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346363 return 32 * 4 / ElemWidth;
347- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349- : 1 ;
364+
365+ return (ElemWidth == 8 ) ? 4
366+ : (ElemWidth == 16 ) ? 2
367+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
368+ : 1 ;
350369}
351370
352371unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1154,14 +1173,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11541173
11551174 Kind = improveShuffleKindFromMask (Kind, Mask, VT, Index, SubTp);
11561175
1157- // Larger vector widths may require additional instructions, but are
1158- // typically cheaper than scalarized versions.
1159- unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
1176+ unsigned ScalarSize = DL.getTypeSizeInBits (VT->getElementType ());
11601177 if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1161- DL.getTypeSizeInBits (VT->getElementType ()) == 16 ) {
1162- bool HasVOP3P = ST->hasVOP3PInsts ();
1178+ (ScalarSize == 16 || ScalarSize == 8 )) {
1179+ // Larger vector widths may require additional instructions, but are
1180+ // typically cheaper than scalarized versions.
1181+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
11631182 unsigned RequestedElts =
11641183 count_if (Mask, [](int MaskElt) { return MaskElt != -1 ; });
1184+ unsigned EltsPerReg = 32 / ScalarSize;
11651185 if (RequestedElts == 0 )
11661186 return 0 ;
11671187 switch (Kind) {
@@ -1170,9 +1190,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11701190 case TTI::SK_PermuteSingleSrc: {
11711191 // With op_sel VOP3P instructions freely can access the low half or high
11721192 // half of a register, so any swizzle of two elements is free.
1173- if (HasVOP3P && NumVectorElts == 2 )
1193+ if (ST-> hasVOP3PInsts () && ScalarSize == 16 && NumVectorElts == 2 )
11741194 return 0 ;
1175- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1195+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11761196 // SK_Broadcast just reuses the same mask
11771197 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
11781198 return NumPerms + NumPermMasks;
@@ -1184,12 +1204,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11841204 return 0 ;
11851205 // Insert/extract subvectors only require shifts / extract code to get the
11861206 // relevant bits
1187- return alignTo (RequestedElts, 2 ) / 2 ;
1207+ return alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11881208 }
11891209 case TTI::SK_PermuteTwoSrc:
11901210 case TTI::SK_Splice:
11911211 case TTI::SK_Select: {
1192- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1212+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11931213 // SK_Select just reuses the same mask
11941214 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
11951215 return NumPerms + NumPermMasks;
0 commit comments