@@ -313,6 +313,24 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
313313 return !F || !ST->isSingleLaneExecution (*F);
314314}
315315
316+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) {
317+ // For certain 8 bit ops, we can pack a v4i8 into a single part
318+ // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
319+ // do not limit the numberOfParts for 8 bit vectors to the
320+ // legalization costs of such. It is left up to other target
321+ // queries (e.g. get*InstrCost) to decide the proper handling
322+ // of 8 bit vectors.
323+ if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
324+ if (ST->shouldCoerceIllegalTypes () &&
325+ DL.getTypeSizeInBits (VTy->getElementType ()) == 8 ) {
326+ unsigned ElCount = VTy->getElementCount ().getFixedValue ();
327+ return PowerOf2Ceil (ElCount / 4 );
328+ }
329+ }
330+
331+ return BaseT::getNumberOfParts (Tp);
332+ }
333+
316334unsigned GCNTTIImpl::getNumberOfRegisters (unsigned RCID) const {
317335 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
318336 // registers. See getRegisterClassForType for the implementation.
@@ -344,9 +362,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344362unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345363 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346364 return 32 * 4 / ElemWidth;
347- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349- : 1 ;
365+
366+ return (ST->shouldCoerceIllegalTypes () && ElemWidth == 8 ) ? 4
367+ : (ElemWidth == 16 ) ? 2
368+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
369+ : 1 ;
350370}
351371
352372unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1131,14 +1151,16 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11311151
11321152 Kind = improveShuffleKindFromMask (Kind, Mask, VT, Index, SubTp);
11331153
1134- // Larger vector widths may require additional instructions, but are
1135- // typically cheaper than scalarized versions.
1136- unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
1154+ unsigned ScalarSize = DL.getTypeSizeInBits (VT->getElementType ());
11371155 if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1138- DL.getTypeSizeInBits (VT->getElementType ()) == 16 ) {
1139- bool HasVOP3P = ST->hasVOP3PInsts ();
1156+ (ScalarSize == 16 ||
1157+ (ScalarSize == 8 && ST->shouldCoerceIllegalTypes ()))) {
1158+ // Larger vector widths may require additional instructions, but are
1159+ // typically cheaper than scalarized versions.
1160+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
11401161 unsigned RequestedElts =
11411162 count_if (Mask, [](int MaskElt) { return MaskElt != -1 ; });
1163+ unsigned EltsPerReg = 32 / ScalarSize;
11421164 if (RequestedElts == 0 )
11431165 return 0 ;
11441166 switch (Kind) {
@@ -1147,9 +1169,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11471169 case TTI::SK_PermuteSingleSrc: {
11481170 // With op_sel VOP3P instructions freely can access the low half or high
11491171 // half of a register, so any swizzle of two elements is free.
1150- if (HasVOP3P && NumVectorElts == 2 )
1172+ if (ST-> hasVOP3PInsts () && ScalarSize == 16 && NumVectorElts == 2 )
11511173 return 0 ;
1152- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1174+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11531175 // SK_Broadcast just reuses the same mask
11541176 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
11551177 return NumPerms + NumPermMasks;
@@ -1161,12 +1183,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11611183 return 0 ;
11621184 // Insert/extract subvectors only require shifts / extract code to get the
11631185 // relevant bits
1164- return alignTo (RequestedElts, 2 ) / 2 ;
1186+ return alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11651187 }
11661188 case TTI::SK_PermuteTwoSrc:
11671189 case TTI::SK_Splice:
11681190 case TTI::SK_Select: {
1169- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1191+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11701192 // SK_Select just reuses the same mask
11711193 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
11721194 return NumPerms + NumPermMasks;
0 commit comments