Skip to content

Commit b8ec653

Browse files
author
Leon Clark
committed
Add implementation and update tests.
1 parent f1c0927 commit b8ec653

File tree

3 files changed

+144
-54
lines changed

3 files changed

+144
-54
lines changed

clang/test/CodeGenOpenCL/preserve_vec3.cl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
1111
// CHECK-LABEL: define dso_local spir_kernel void @foo(
1212
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
1313
// CHECK-NEXT: [[ENTRY:.*:]]
14-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
15-
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
14+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
15+
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1616
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
1717
// CHECK-NEXT: ret void
1818
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
2323
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
2424
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
2525
// CHECK-NEXT: [[ENTRY:.*:]]
26-
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
26+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
2828
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
2929
// CHECK-NEXT: ret void
3030
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
3535
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
3636
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
3737
// CHECK-NEXT: [[ENTRY:.*:]]
38-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
39-
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
38+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
39+
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
4040
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
4141
// CHECK-NEXT: ret void
4242
//
@@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
4747
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
4848
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
4949
// CHECK-NEXT: [[ENTRY:.*:]]
50-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
51-
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
50+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
51+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
5252
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
5353
// CHECK-NEXT: ret void
5454
//
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
5959
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
6060
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
6161
// CHECK-NEXT: [[ENTRY:.*:]]
62-
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
62+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
6464
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
6565
// CHECK-NEXT: ret void
6666
//

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,95 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
915915
return true;
916916
}
917917

918+
// If `I` is a load instruction, used only by shufflevector instructions with
919+
// poison values, attempt to shrink the load to only the lanes being used.
920+
static bool shrinkLoadsForBroadcast(Instruction &I) {
921+
auto *OldLoad = dyn_cast<LoadInst>(&I);
922+
if (!OldLoad)
923+
return false;
924+
925+
auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
926+
if (!VecTy)
927+
return false;
928+
929+
auto IsPoisonOrUndef = [](Value *V) -> bool {
930+
if (auto *C = dyn_cast<Constant>(V)) {
931+
return isa<PoisonValue>(C) || isa<UndefValue>(C);
932+
}
933+
return false;
934+
};
935+
936+
using IndexRange = std::pair<unsigned, unsigned>;
937+
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
938+
auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
939+
for (auto &Use: I.uses()) {
940+
// All uses must be ShuffleVector instructions.
941+
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
942+
if (!Shuffle)
943+
return {};
944+
945+
// Get index range for value.
946+
auto *Op0 = Shuffle->getOperand(0u);
947+
auto *Op1 = Shuffle->getOperand(1u);
948+
if (!IsPoisonOrUndef(Op1))
949+
return {};
950+
951+
// Find the min and max indices used by the ShuffleVector instruction.
952+
auto Mask = Shuffle->getShuffleMask();
953+
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
954+
auto NumElems = Op0Ty->getNumElements();
955+
956+
for (unsigned Index: Mask) {
957+
if (Index < NumElems) {
958+
OutputRange.first = std::min(Index, OutputRange.first);
959+
OutputRange.second = std::max(Index, OutputRange.second);
960+
}
961+
}
962+
}
963+
return OutputRange;
964+
};
965+
966+
if (auto Indices = GetIndexRangeInShuffles()) {
967+
auto OldSize = VecTy->getNumElements();
968+
auto NewSize = Indices->second + 1u;
969+
970+
if (NewSize < OldSize) {
971+
auto Builder = IRBuilder(&I);
972+
Builder.SetCurrentDebugLocation(I.getDebugLoc());
973+
974+
// Create new load of smaller vector.
975+
auto *ElemTy = VecTy->getElementType();
976+
auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
977+
auto *NewLoad = cast<LoadInst>(
978+
Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
979+
NewLoad->copyMetadata(I);
980+
981+
// Replace all users.
982+
auto OldShuffles = SmallVector<ShuffleVectorInst*, 4u>{};
983+
for (auto &Use: I.uses()) {
984+
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
985+
986+
Builder.SetInsertPoint(Shuffle);
987+
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
988+
auto *NewShuffle = Builder.CreateShuffleVector(
989+
NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()
990+
);
991+
992+
Shuffle->replaceAllUsesWith(NewShuffle);
993+
OldShuffles.push_back(Shuffle);
994+
}
995+
996+
// Erase old users.
997+
for (auto *Shuffle: OldShuffles)
998+
Shuffle->eraseFromParent();
999+
1000+
I.eraseFromParent();
1001+
return true;
1002+
}
1003+
}
1004+
return false;
1005+
}
1006+
9181007
namespace {
9191008
class StrNCmpInliner {
9201009
public:
@@ -1251,6 +1340,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
12511340
MadeChange |= tryToRecognizeTableBasedCttz(I);
12521341
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
12531342
MadeChange |= foldPatternedLoads(I, DL);
1343+
MadeChange |= shrinkLoadsForBroadcast(I);
12541344
// NOTE: This function introduces erasing of the instruction `I`, so it
12551345
// needs to be called at the end of this sequence, otherwise we may make
12561346
// bugs.

0 commit comments

Comments
 (0)