@@ -1120,6 +1120,64 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
11201120 }
11211121}
11221122
1123+ // / Certain shufflemasks may not be either Identity masks or InsertSubvector
1124+ // / masks, but do not require instructions to produce. An example is if we are
1125+ // / shuffling two <16 x i8> sources with the 16 element mask: {0, 1, 2, 3, 4, 5,
1126+ // / 6, 7, 24, 25, 26, 27, poison, poison, posion, poison}. The result of this
1127+ // / shuffle is {first v4i8 of src0, second v4i8 of src0, third v4i8 of src1,
1128+ // / posion}. In order to produce this result, we do not need to insert shuffle
1129+ // / code, as these vectors already exist the source registers. Thus, we simply
1130+ // / need to ensure these registers are contiguous to produce the result.
1131+ // / countIdentityPerms analyzes the \p Mask to count the number of such register
1132+ // / aligned vectors (based on the provided \p ScalarSize ).
1133+ static unsigned countIdentityPerms (ArrayRef<int > Mask, unsigned ScalarSize) {
1134+ unsigned IdentityPerms = 0 ;
1135+ unsigned EltsPerPerm = 32 / ScalarSize;
1136+ if (!EltsPerPerm)
1137+ return 0 ;
1138+
1139+ // Split the shuffle mask into a number of 32 bit wide shuffles.
1140+ for (unsigned PermCand = 0 ; PermCand < (Mask.size () / EltsPerPerm);
1141+ PermCand++) {
1142+ std::pair<int , int > BasisIndex (-1 , -1 );
1143+ bool FoundMismatch = false ;
1144+
1145+ // Analyze the 32 bit mask for register-aligned vectors.
1146+ for (int PermElement = 0 ; PermElement < (int )EltsPerPerm; PermElement++) {
1147+ unsigned Index = PermCand * EltsPerPerm + PermElement;
1148+ assert (Index < Mask.size ());
1149+ int MaskVal = Mask[Index];
1150+
1151+ // Maskval of -1 is dont-care.
1152+ if (MaskVal == -1 )
1153+ continue ;
1154+ if (BasisIndex.second == -1 ) {
1155+ // Check if this mask represents alignment to bit position in the
1156+ // regsiter.
1157+ if (PermElement > MaskVal || ((MaskVal - PermElement) % EltsPerPerm)) {
1158+ FoundMismatch = true ;
1159+ }
1160+ BasisIndex = {MaskVal, PermElement};
1161+ continue ;
1162+ }
1163+
1164+ if (MaskVal < BasisIndex.first ) {
1165+ FoundMismatch = true ;
1166+ break ;
1167+ }
1168+
1169+ // Check if this mask is contiguous with the previously matched mask
1170+ if ((MaskVal - BasisIndex.first ) != (PermElement - BasisIndex.second )) {
1171+ FoundMismatch = true ;
1172+ break ;
1173+ }
1174+ }
1175+ if (!FoundMismatch)
1176+ IdentityPerms += 1 ;
1177+ }
1178+ return IdentityPerms;
1179+ }
1180+
11231181InstructionCost GCNTTIImpl::getShuffleCost (TTI::ShuffleKind Kind,
11241182 VectorType *VT, ArrayRef<int > Mask,
11251183 TTI::TargetCostKind CostKind,
@@ -1133,12 +1191,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11331191
11341192 // Larger vector widths may require additional instructions, but are
11351193 // typically cheaper than scalarized versions.
1136- unsigned NumVectorElts = cast<FixedVectorType> (VT)-> getNumElements ( );
1194+ unsigned ScalarSize = DL. getTypeSizeInBits (VT-> getElementType () );
11371195 if (ST->getGeneration () >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1138- DL. getTypeSizeInBits (VT-> getElementType ()) == 16 ) {
1196+ ScalarSize == 16 ) {
11391197 bool HasVOP3P = ST->hasVOP3PInsts ();
11401198 unsigned RequestedElts =
11411199 count_if (Mask, [](int MaskElt) { return MaskElt != -1 ; });
1200+ unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements ();
11421201 if (RequestedElts == 0 )
11431202 return 0 ;
11441203 switch (Kind) {
@@ -1149,9 +1208,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11491208 // half of a register, so any swizzle of two elements is free.
11501209 if (HasVOP3P && NumVectorElts == 2 )
11511210 return 0 ;
1152- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1211+ unsigned NumPerms = alignTo (Mask.size (), 2 ) / 2 ;
1212+ unsigned IdentPerms = countIdentityPerms (Mask, ScalarSize);
1213+ assert (IdentPerms <= NumPerms);
1214+ NumPerms -= IdentPerms;
11531215 // SK_Broadcast just reuses the same mask
1154- unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1216+ unsigned NumPermMasks =
1217+ Kind == (TTI::SK_Broadcast && NumPerms > 1 ) ? 1 : NumPerms;
11551218 return NumPerms + NumPermMasks;
11561219 }
11571220 case TTI::SK_ExtractSubvector:
@@ -1166,9 +1229,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11661229 case TTI::SK_PermuteTwoSrc:
11671230 case TTI::SK_Splice:
11681231 case TTI::SK_Select: {
1169- unsigned NumPerms = alignTo (RequestedElts, 2 ) / 2 ;
1232+ unsigned NumPerms = alignTo (Mask.size (), 2 ) / 2 ;
1233+ unsigned IdentPerms = countIdentityPerms (Mask, ScalarSize);
1234+ assert (IdentPerms <= NumPerms);
1235+ NumPerms -= IdentPerms;
11701236 // SK_Select just reuses the same mask
1171- unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1237+ unsigned NumPermMasks =
1238+ Kind == (TTI::SK_Select && NumPerms > 1 ) ? 1 : NumPerms;
11721239 return NumPerms + NumPermMasks;
11731240 }
11741241
0 commit comments