@@ -313,6 +313,23 @@ bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
313313 return !F || !ST->isSingleLaneExecution (*F);
314314}
315315
316+ unsigned GCNTTIImpl::getNumberOfParts (Type *Tp) {
317+ // For certain 8 bit ops, we can pack a v4i8 into a single part
318+ // (e.g. v4i8 shufflevectors -> v_perm v4i8, v4i8). Thus, we
319+ // do not limit the numberOfParts for 8 bit vectors to the
320+ // legalization costs of such. It is left up to other target
321+ // queries (e.g. get*InstrCost) to decide the proper handling
322+ // of 8 bit vectors.
323+ if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(Tp)) {
324+ if (DL.getTypeSizeInBits (VTy->getElementType ()) == 8 ) {
325+ unsigned ElCount = VTy->getElementCount ().getFixedValue ();
326+ return PowerOf2Ceil (ElCount / 4 );
327+ }
328+ }
329+
330+ return BaseT::getNumberOfParts (Tp);
331+ }
332+
316333unsigned GCNTTIImpl::getNumberOfRegisters (unsigned RCID) const {
317334 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
318335 // registers. See getRegisterClassForType for the implementation.
@@ -344,9 +361,11 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
344361unsigned GCNTTIImpl::getMaximumVF (unsigned ElemWidth, unsigned Opcode) const {
345362 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
346363 return 32 * 4 / ElemWidth;
347- return (ElemWidth == 16 && ST->has16BitInsts ()) ? 2
348- : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
349- : 1 ;
364+
365+ return (ElemWidth == 8 ) ? 4
366+ : (ElemWidth == 16 ) ? 2
367+ : (ElemWidth == 32 && ST->hasPackedFP32Ops ()) ? 2
368+ : 1 ;
350369}
351370
352371unsigned GCNTTIImpl::getLoadVectorFactor (unsigned VF, unsigned LoadSize,
@@ -1133,14 +1152,15 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11331152
11341153 Kind = improveShuffleKindFromMask (Kind, Mask, VT, Index, SubTp);
11351154
1136- // Larger vector widths may require additional instructions, but are
1137- // typically cheaper than scalarized versions.
1138- unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
1155+ unsigned ScalarSize = DL.getTypeSizeInBits (VT->getElementType ());
11391156 if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1140- DL.getTypeSizeInBits (VT->getElementType ()) == 16 ) {
1141- bool HasVOP3P = ST->hasVOP3PInsts ();
1157+ (ScalarSize == 16 || ScalarSize == 8 )) {
1158+ // Larger vector widths may require additional instructions, but are
1159+ // typically cheaper than scalarized versions.
1160+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
11421161 unsigned RequestedElts =
11431162 count_if (Mask, [](int MaskElt) { return MaskElt != -1 ; });
1163+ unsigned EltsPerReg = 32 / ScalarSize;
11441164 if (RequestedElts == 0 )
11451165 return 0 ;
11461166 switch (Kind) {
@@ -1149,9 +1169,9 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11491169 case TTI::SK_PermuteSingleSrc: {
11501170 // With op_sel VOP3P instructions freely can access the low half or high
11511171 // half of a register, so any swizzle of two elements is free.
1152- if (HasVOP3P && NumVectorElts == 2 )
1172+ if (ST-> hasVOP3PInsts () && ScalarSize == 16 && NumVectorElts == 2 )
11531173 return 0 ;
1154- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1174+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11551175 // SK_Broadcast just reuses the same mask
11561176 unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
11571177 return NumPerms + NumPermMasks;
@@ -1163,12 +1183,12 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11631183 return 0 ;
11641184 // Insert/extract subvectors only require shifts / extract code to get the
11651185 // relevant bits
1166- return alignTo (RequestedElts, 2 ) / 2 ;
1186+ return alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11671187 }
11681188 case TTI::SK_PermuteTwoSrc:
11691189 case TTI::SK_Splice:
11701190 case TTI::SK_Select: {
1171- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1191+ unsigned NumPerms = alignTo (RequestedElts, EltsPerReg ) / EltsPerReg ;
11721192 // SK_Select just reuses the same mask
11731193 unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
11741194 return NumPerms + NumPermMasks;
0 commit comments