Skip to content

Commit f566b7a

Browse files
author
Leon Clark
committed
Add implementation and update tests.
1 parent 3a75016 commit f566b7a

File tree

3 files changed

+144
-54
lines changed

3 files changed

+144
-54
lines changed

clang/test/CodeGenOpenCL/preserve_vec3.cl

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
1111
// CHECK-LABEL: define dso_local spir_kernel void @foo(
1212
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
1313
// CHECK-NEXT: [[ENTRY:.*:]]
14-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
15-
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
14+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
15+
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1616
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
1717
// CHECK-NEXT: ret void
1818
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
2323
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
2424
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
2525
// CHECK-NEXT: [[ENTRY:.*:]]
26-
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
26+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
2828
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
2929
// CHECK-NEXT: ret void
3030
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
3535
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
3636
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
3737
// CHECK-NEXT: [[ENTRY:.*:]]
38-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
39-
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
38+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
39+
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
4040
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
4141
// CHECK-NEXT: ret void
4242
//
@@ -47,8 +47,8 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
4747
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
4848
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
4949
// CHECK-NEXT: [[ENTRY:.*:]]
50-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
51-
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
50+
// CHECK-NEXT: [[LOADVECN:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
51+
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <3 x float> [[LOADVECN]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
5252
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
5353
// CHECK-NEXT: ret void
5454
//
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
5959
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
6060
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
6161
// CHECK-NEXT: [[ENTRY:.*:]]
62-
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
62+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
6464
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
6565
// CHECK-NEXT: ret void
6666
//

llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -916,6 +916,95 @@ static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
916916
return true;
917917
}
918918

919+
// If `I` is a load instruction, used only by shufflevector instructions with
920+
// poison values, attempt to shrink the load to only the lanes being used.
921+
static bool shrinkLoadsForBroadcast(Instruction &I) {
922+
auto *OldLoad = dyn_cast<LoadInst>(&I);
923+
if (!OldLoad)
924+
return false;
925+
926+
auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
927+
if (!VecTy)
928+
return false;
929+
930+
auto IsPoisonOrUndef = [](Value *V) -> bool {
931+
if (auto *C = dyn_cast<Constant>(V)) {
932+
return isa<PoisonValue>(C) || isa<UndefValue>(C);
933+
}
934+
return false;
935+
};
936+
937+
using IndexRange = std::pair<unsigned, unsigned>;
938+
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
939+
auto OutputRange = IndexRange(VecTy->getNumElements(), 0u);
940+
for (auto &Use: I.uses()) {
941+
// All uses must be ShuffleVector instructions.
942+
auto *Shuffle = dyn_cast<ShuffleVectorInst>(Use.getUser());
943+
if (!Shuffle)
944+
return {};
945+
946+
// Get index range for value.
947+
auto *Op0 = Shuffle->getOperand(0u);
948+
auto *Op1 = Shuffle->getOperand(1u);
949+
if (!IsPoisonOrUndef(Op1))
950+
return {};
951+
952+
// Find the min and max indices used by the ShuffleVector instruction.
953+
auto Mask = Shuffle->getShuffleMask();
954+
auto *Op0Ty = cast<FixedVectorType>(Op0->getType());
955+
auto NumElems = Op0Ty->getNumElements();
956+
957+
for (unsigned Index: Mask) {
958+
if (Index < NumElems) {
959+
OutputRange.first = std::min(Index, OutputRange.first);
960+
OutputRange.second = std::max(Index, OutputRange.second);
961+
}
962+
}
963+
}
964+
return OutputRange;
965+
};
966+
967+
if (auto Indices = GetIndexRangeInShuffles()) {
968+
auto OldSize = VecTy->getNumElements();
969+
auto NewSize = Indices->second + 1u;
970+
971+
if (NewSize < OldSize) {
972+
auto Builder = IRBuilder(&I);
973+
Builder.SetCurrentDebugLocation(I.getDebugLoc());
974+
975+
// Create new load of smaller vector.
976+
auto *ElemTy = VecTy->getElementType();
977+
auto *NewVecTy = FixedVectorType::get(ElemTy, NewSize);
978+
auto *NewLoad = cast<LoadInst>(
979+
Builder.CreateLoad(NewVecTy, OldLoad->getPointerOperand()));
980+
NewLoad->copyMetadata(I);
981+
982+
// Replace all users.
983+
auto OldShuffles = SmallVector<ShuffleVectorInst*, 4u>{};
984+
for (auto &Use: I.uses()) {
985+
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
986+
987+
Builder.SetInsertPoint(Shuffle);
988+
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
989+
auto *NewShuffle = Builder.CreateShuffleVector(
990+
NewLoad, PoisonValue::get(NewVecTy), Shuffle->getShuffleMask()
991+
);
992+
993+
Shuffle->replaceAllUsesWith(NewShuffle);
994+
OldShuffles.push_back(Shuffle);
995+
}
996+
997+
// Erase old users.
998+
for (auto *Shuffle: OldShuffles)
999+
Shuffle->eraseFromParent();
1000+
1001+
I.eraseFromParent();
1002+
return true;
1003+
}
1004+
}
1005+
return false;
1006+
}
1007+
9191008
namespace {
9201009
class StrNCmpInliner {
9211010
public:
@@ -1253,6 +1342,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
12531342
MadeChange |= tryToRecognizeTableBasedCttz(I);
12541343
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
12551344
MadeChange |= foldPatternedLoads(I, DL);
1345+
MadeChange |= shrinkLoadsForBroadcast(I);
12561346
// NOTE: This function introduces erasing of the instruction `I`, so it
12571347
// needs to be called at the end of this sequence, otherwise we may make
12581348
// bugs.

0 commit comments

Comments
 (0)