Skip to content

Commit a527819

Browse files
committed
[AMDGPU] Do not add cost for register aligned vectorshuffles
Change-Id: I9bde8614ca1a004bcb833271225e00e1436ba022
1 parent ae0aa2d commit a527819

File tree

3 files changed

+323
-30
lines changed

3 files changed

+323
-30
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 73 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,6 +1120,64 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
11201120
}
11211121
}
11221122

1123+
/// Certain shufflemasks may not be either Identity masks or InsertSubvector
1124+
/// masks, but do not require instructions to produce. An example is if we are
1125+
/// shuffling two <16 x i8> sources with the 16 element mask: {0, 1, 2, 3, 4, 5,
1126+
/// 6, 7, 24, 25, 26, 27, poison, poison, posion, poison}. The result of this
1127+
/// shuffle is {first v4i8 of src0, second v4i8 of src0, third v4i8 of src1,
1128+
/// posion}. In order to produce this result, we do not need to insert shuffle
1129+
/// code, as these vectors already exist the source registers. Thus, we simply
1130+
/// need to ensure these registers are contiguous to produce the result.
1131+
/// countIdentityPerms analyzes the \p Mask to count the number of such register
1132+
/// aligned vectors (based on the provided \p ScalarSize ).
1133+
static unsigned countIdentityPerms(ArrayRef<int> Mask, unsigned ScalarSize) {
1134+
unsigned IdentityPerms = 0;
1135+
unsigned EltsPerPerm = 32 / ScalarSize;
1136+
if (!EltsPerPerm)
1137+
return 0;
1138+
1139+
// Split the shuffle mask into a number of 32 bit wide shuffles.
1140+
for (unsigned PermCand = 0; PermCand < (Mask.size() / EltsPerPerm);
1141+
PermCand++) {
1142+
std::pair<int, int> BasisIndex(-1, -1);
1143+
bool FoundMismatch = false;
1144+
1145+
// Analyze the 32 bit mask for register-aligned vectors.
1146+
for (int PermElement = 0; PermElement < (int)EltsPerPerm; PermElement++) {
1147+
unsigned Index = PermCand * EltsPerPerm + PermElement;
1148+
assert(Index < Mask.size());
1149+
int MaskVal = Mask[Index];
1150+
1151+
// Maskval of -1 is dont-care.
1152+
if (MaskVal == -1)
1153+
continue;
1154+
if (BasisIndex.second == -1) {
1155+
// Check if this mask represents alignment to bit position in the
1156+
// regsiter.
1157+
if (PermElement > MaskVal || ((MaskVal - PermElement) % EltsPerPerm)) {
1158+
FoundMismatch = true;
1159+
}
1160+
BasisIndex = {MaskVal, PermElement};
1161+
continue;
1162+
}
1163+
1164+
if (MaskVal < BasisIndex.first) {
1165+
FoundMismatch = true;
1166+
break;
1167+
}
1168+
1169+
// Check if this mask is contiguous with the previously matched mask
1170+
if ((MaskVal - BasisIndex.first) != (PermElement - BasisIndex.second)) {
1171+
FoundMismatch = true;
1172+
break;
1173+
}
1174+
}
1175+
if (!FoundMismatch)
1176+
IdentityPerms += 1;
1177+
}
1178+
return IdentityPerms;
1179+
}
1180+
11231181
InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11241182
VectorType *VT, ArrayRef<int> Mask,
11251183
TTI::TargetCostKind CostKind,
@@ -1133,12 +1191,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11331191

11341192
// Larger vector widths may require additional instructions, but are
11351193
// typically cheaper than scalarized versions.
1136-
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
1194+
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
11371195
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1138-
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1196+
ScalarSize == 16) {
11391197
bool HasVOP3P = ST->hasVOP3PInsts();
11401198
unsigned RequestedElts =
11411199
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
1200+
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
11421201
if (RequestedElts == 0)
11431202
return 0;
11441203
switch (Kind) {
@@ -1149,9 +1208,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11491208
// half of a register, so any swizzle of two elements is free.
11501209
if (HasVOP3P && NumVectorElts == 2)
11511210
return 0;
1152-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1211+
unsigned NumPerms = alignTo(Mask.size(), 2) / 2;
1212+
unsigned IdentPerms = countIdentityPerms(Mask, ScalarSize);
1213+
assert(IdentPerms <= NumPerms);
1214+
NumPerms -= IdentPerms;
11531215
// SK_Broadcast just reuses the same mask
1154-
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
1216+
unsigned NumPermMasks =
1217+
Kind == (TTI::SK_Broadcast && NumPerms > 1) ? 1 : NumPerms;
11551218
return NumPerms + NumPermMasks;
11561219
}
11571220
case TTI::SK_ExtractSubvector:
@@ -1166,9 +1229,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
11661229
case TTI::SK_PermuteTwoSrc:
11671230
case TTI::SK_Splice:
11681231
case TTI::SK_Select: {
1169-
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
1232+
unsigned NumPerms = alignTo(Mask.size(), 2) / 2;
1233+
unsigned IdentPerms = countIdentityPerms(Mask, ScalarSize);
1234+
assert(IdentPerms <= NumPerms);
1235+
NumPerms -= IdentPerms;
11701236
// SK_Select just reuses the same mask
1171-
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
1237+
unsigned NumPermMasks =
1238+
Kind == (TTI::SK_Select && NumPerms > 1) ? 1 : NumPerms;
11721239
return NumPerms + NumPermMasks;
11731240
}
11741241

0 commit comments

Comments
 (0)