Skip to content

Commit 09ac59a

Browse files
author
Leon Clark
committed
Add implementation and update tests.
1 parent 1d817c3 commit 09ac59a

File tree

3 files changed

+144
-54
lines changed

3 files changed

+144
-54
lines changed

clang/test/CodeGenOpenCL/preserve_vec3.cl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
1111
// CHECK-LABEL: define dso_local spir_kernel void @foo(
1212
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
1313
// CHECK-NEXT: [[ENTRY:.*:]]
14-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
15-
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
14+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
15+
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1616
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
1717
// CHECK-NEXT: ret void
1818
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
2323
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
2424
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
2525
// CHECK-NEXT: [[ENTRY:.*:]]
26-
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
26+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
2828
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
2929
// CHECK-NEXT: ret void
3030
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
3535
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
3636
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
3737
// CHECK-NEXT: [[ENTRY:.*:]]
38-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
39-
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
38+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
39+
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
4040
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
4141
// CHECK-NEXT: ret void
4242
//
@@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
4747
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
4848
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
4949
// CHECK-NEXT: [[ENTRY:.*:]]
50-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
51-
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
50+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
51+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
5252
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
5353
// CHECK-NEXT: ret void
5454
//
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
5959
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
6060
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
6161
// CHECK-NEXT: [[ENTRY:.*:]]
62-
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
62+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
6464
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
6565
// CHECK-NEXT: ret void
6666
//

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -987,6 +987,95 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
987987
return true;
988988
}
989989

990+
// If `I` is a load instruction, used only by shufflevector instructions with
991+
// poison values, attempt to shrink the load to only the lanes being used.
992+
static bool shrinkLoadsForBroadcast(Instruction &I) {
993+
auto *OldLoad = dyn_cast<LoadInst>(&I);
994+
if (!OldLoad)
995+
return false;
996+
997+
auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
998+
if (!VecTy)
999+
return false;
1000+
1001+
auto IsPoisonOrUndef = [](Value *V) -> bool {
1002+
if (auto *C = dyn_cast<Constant>(V)) {
1003+
return isa<PoisonValue>(C) || isa<UndefValue>(C);
1004+
}
1005+
return false;
1006+
};
1007+
1008+
using IndexRange = std::pair<unsigned, unsigned>;
1009+
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
1010+
auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
1011+
for (auto &Use: I.uses()) {
1012+
// All uses must be ShuffleVector instructions.
1013+
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
1014+
if (!Shuffle)
1015+
return {};
1016+
1017+
// Get index range for value.
1018+
auto *Op0 = Shuffle->getOperand(0u);
1019+
auto *Op1 = Shuffle->getOperand(1u);
1020+
if (!IsPoisonOrUndef(Op1))
1021+
return {};
1022+
1023+
// Find the min and max indices used by the ShuffleVector instruction.
1024+
auto Mask = Shuffle->getShuffleMask();
1025+
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
1026+
auto NumElems = Op0Ty->getNumElements();
1027+
1028+
for (unsigned Index: Mask) {
1029+
if (Index < NumElems) {
1030+
OutputRange.first = std::min(Index, OutputRange.first);
1031+
OutputRange.second = std::max(Index, OutputRange.second);
1032+
}
1033+
}
1034+
}
1035+
return OutputRange;
1036+
};
1037+
1038+
if (auto Indices = GetIndexRangeInShuffles()) {
1039+
auto OldSize = VecTy->getNumElements();
1040+
auto NewSize = Indices->second + 1u;
1041+
1042+
if (NewSize < OldSize) {
1043+
auto Builder = IRBuilder(&I);
1044+
Builder.SetCurrentDebugLocation(I.getDebugLoc());
1045+
1046+
// Create new load of smaller vector.
1047+
auto *ElemTy = VecTy->getElementType();
1048+
auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
1049+
auto *NewLoad = cast<LoadInst>(
1050+
Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
1051+
NewLoad->copyMetadata(I);
1052+
1053+
// Replace all users.
1054+
auto OldShuffles = SmallVector<ShuffleVectorInst*, 4u>{};
1055+
for (auto &Use: I.uses()) {
1056+
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
1057+
1058+
Builder.SetInsertPoint(Shuffle);
1059+
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
1060+
auto *NewShuffle = Builder.CreateShuffleVector(
1061+
NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()
1062+
);
1063+
1064+
Shuffle->replaceAllUsesWith(NewShuffle);
1065+
OldShuffles.push_back(Shuffle);
1066+
}
1067+
1068+
// Erase old users.
1069+
for (auto *Shuffle: OldShuffles)
1070+
Shuffle->eraseFromParent();
1071+
1072+
I.eraseFromParent();
1073+
return true;
1074+
}
1075+
}
1076+
return false;
1077+
}
1078+
9901079
namespace {
9911080
class StrNCmpInliner {
9921081
public:
@@ -1325,6 +1414,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
13251414
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
13261415
MadeChange |= foldPatternedLoads(I, DL);
13271416
MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
1417+
MadeChange |= shrinkLoadsForBroadcast(I);
13281418
// NOTE: This function introduces erasing of the instruction `I`, so it
13291419
// needs to be called at the end of this sequence, otherwise we may make
13301420
// bugs.

0 commit comments

Comments
 (0)