Skip to content

Commit 9115bef

Browse files
PeddleSpam, Leon Clark, RKSimon
authored
[VectorCombine] Shrink loads used in shufflevector rebroadcasts. (#153138)
Reopen #128938. Attempt to shrink the size of vector loads where only some of the incoming lanes are used for rebroadcasts in shufflevector instructions. Co-authored-by: Leon Clark <[email protected]>. Co-authored-by: Simon Pilgrim <[email protected]>.
1 parent b8104fa commit 9115bef

File tree

7 files changed

+567
-36
lines changed

7 files changed

+567
-36
lines changed

clang/test/CodeGenOpenCL/preserve_vec3.cl

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
1111
// CHECK-LABEL: define dso_local spir_kernel void @foo(
1212
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
1313
// CHECK-NEXT: [[ENTRY:.*:]]
14-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
15-
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
14+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
15+
// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
1616
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
1717
// CHECK-NEXT: ret void
1818
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
2323
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
2424
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
2525
// CHECK-NEXT: [[ENTRY:.*:]]
26-
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
26+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
27+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
2828
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
2929
// CHECK-NEXT: ret void
3030
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
3535
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
3636
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
3737
// CHECK-NEXT: [[ENTRY:.*:]]
38-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
39-
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
38+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
39+
// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
4040
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
4141
// CHECK-NEXT: ret void
4242
//
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
4747
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
4848
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
4949
// CHECK-NEXT: [[ENTRY:.*:]]
50-
// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
51-
// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
52-
// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
50+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
51+
// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
52+
// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
5353
// CHECK-NEXT: ret void
5454
//
5555
void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
5959
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
6060
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
6161
// CHECK-NEXT: [[ENTRY:.*:]]
62-
// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63-
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
62+
// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
63+
// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
6464
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
6565
// CHECK-NEXT: ret void
6666
//

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "llvm/ADT/DenseMap.h"
1717
#include "llvm/ADT/STLExtras.h"
1818
#include "llvm/ADT/ScopeExit.h"
19+
#include "llvm/ADT/SmallVector.h"
1920
#include "llvm/ADT/Statistic.h"
2021
#include "llvm/Analysis/AssumptionCache.h"
2122
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -29,13 +30,16 @@
2930
#include "llvm/IR/Dominators.h"
3031
#include "llvm/IR/Function.h"
3132
#include "llvm/IR/IRBuilder.h"
33+
#include "llvm/IR/Instructions.h"
3234
#include "llvm/IR/PatternMatch.h"
3335
#include "llvm/Support/CommandLine.h"
3436
#include "llvm/Transforms/Utils/Local.h"
3537
#include "llvm/Transforms/Utils/LoopUtils.h"
3638
#include <numeric>
39+
#include <optional>
3740
#include <queue>
3841
#include <set>
42+
#include <tuple>
3943

4044
#define DEBUG_TYPE "vector-combine"
4145
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -137,6 +141,7 @@ class VectorCombine {
137141
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
138142
bool foldInterleaveIntrinsics(Instruction &I);
139143
bool shrinkType(Instruction &I);
144+
bool shrinkLoadForShuffles(Instruction &I);
140145

141146
void replaceValue(Value &Old, Value &New) {
142147
LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
@@ -3862,6 +3867,133 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
38623867
return true;
38633868
}
38643869

3870+
// Attempt to shrink loads that are only used by shufflevector instructions.
//
// Only simple loads of a fixed-width vector type are considered. Every user of
// the load must be a shufflevector whose first operand is the load and whose
// second operand matches m_Undef(). If the highest lane referenced by the
// shuffles that themselves have uses is below the last lane of the load, and
// the cost model does not rate the narrower form as more expensive, a narrower
// load from the same pointer is created and each shuffle user is rebuilt on
// top of it.
//
// Returns true if the shuffles were rewritten to use a narrower load, false
// otherwise.
3871+
bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
3872+
auto *OldLoad = dyn_cast<LoadInst>(&I);
3873+
if (!OldLoad || !OldLoad->isSimple())
3874+
return false;
3875+
3876+
auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
3877+
if (!OldLoadTy)
3878+
return false;
3879+
3880+
unsigned const OldNumElements = OldLoadTy->getNumElements();
3881+
3882+
// Search all uses of load. If all uses are shufflevector instructions, and
3883+
// the second operands are all poison values, find the minimum and maximum
3884+
// indices of the vector elements referenced by all shuffle masks.
3885+
// Otherwise return `std::nullopt`.
3886+
using IndexRange = std::pair<int, int>;
3887+
auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
3888+
// Start from an inverted (empty) range so that the first in-range index
// sets both bounds.
IndexRange OutputRange = IndexRange(OldNumElements, -1);
3889+
for (llvm::Use &Use : I.uses()) {
3890+
// Ensure all uses match the required pattern.
3891+
User *Shuffle = Use.getUser();
3892+
ArrayRef<int> Mask;
3893+
3894+
if (!match(Shuffle,
3895+
m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
3896+
return std::nullopt;
3897+
3898+
// Ignore shufflevector instructions that have no uses.
3899+
if (Shuffle->use_empty())
3900+
continue;
3901+
3902+
// Find the min and max indices used by the shufflevector instruction.
3903+
for (int Index : Mask) {
3904+
// Only indices that select a lane of the loaded vector count; negative
// indices and indices at or above OldNumElements are skipped.
if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
3905+
OutputRange.first = std::min(Index, OutputRange.first);
3906+
OutputRange.second = std::max(Index, OutputRange.second);
3907+
}
3908+
}
3909+
}
3910+
3911+
// A still-inverted range means no used shuffle referenced any lane of the
// load, so there is nothing to base a narrower load on.
if (OutputRange.second < OutputRange.first)
3912+
return std::nullopt;
3913+
3914+
return OutputRange;
3915+
};
3916+
3917+
// Get the range of vector elements used by shufflevector instructions.
3918+
if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
3919+
// Only the maximum index determines the new width: the new load reuses the
// same pointer operand, so lanes below the minimum index are still loaded.
unsigned const NewNumElements = Indices->second + 1u;
3920+
3921+
// If the range of vector elements is smaller than the full load, attempt
3922+
// to create a smaller load.
3923+
if (NewNumElements < OldNumElements) {
3924+
IRBuilder Builder(&I);
3925+
Builder.SetCurrentDebugLocation(I.getDebugLoc());
3926+
3927+
// Calculate costs of old and new ops.
3928+
Type *ElemTy = OldLoadTy->getElementType();
3929+
FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
3930+
Value *PtrOp = OldLoad->getPointerOperand();
3931+
3932+
InstructionCost OldCost = TTI.getMemoryOpCost(
3933+
Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
3934+
OldLoad->getPointerAddressSpace(), CostKind);
3935+
InstructionCost NewCost =
3936+
TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
3937+
OldLoad->getPointerAddressSpace(), CostKind);
3938+
3939+
using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
3940+
SmallVector<UseEntry, 4u> NewUses;
3941+
// The old masks are reused unchanged for the new shuffles, so every index
// must stay below twice the new element count (two NewNumElements-wide
// operands).
unsigned const MaxIndex = NewNumElements * 2u;
3942+
3943+
for (llvm::Use &Use : I.uses()) {
3944+
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
3945+
ArrayRef<int> OldMask = Shuffle->getShuffleMask();
3946+
3947+
// Create entry for new use.
3948+
NewUses.push_back({Shuffle, OldMask});
3949+
3950+
// Validate mask indices.
3951+
for (int Index : OldMask) {
3952+
if (Index >= static_cast<int>(MaxIndex))
3953+
return false;
3954+
}
3955+
3956+
// Update costs.
3957+
OldCost +=
3958+
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
3959+
OldLoadTy, OldMask, CostKind);
3960+
NewCost +=
3961+
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
3962+
NewLoadTy, OldMask, CostKind);
3963+
}
3964+
3965+
LLVM_DEBUG(
3966+
dbgs() << "Found a load used only by shufflevector instructions: "
3967+
<< I << "\n OldCost: " << OldCost
3968+
<< " vs NewCost: " << NewCost << "\n");
3969+
3970+
// Keep the original load if the narrower form is costlier or its cost is
// invalid; an equal cost still shrinks.
if (OldCost < NewCost || !NewCost.isValid())
3971+
return false;
3972+
3973+
// Create new load of smaller vector.
3974+
auto *NewLoad = cast<LoadInst>(
3975+
Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
3976+
NewLoad->copyMetadata(I);
3977+
3978+
// Replace all uses.
3979+
for (UseEntry &Use : NewUses) {
3980+
ShuffleVectorInst *Shuffle = Use.first;
3981+
std::vector<int> &NewMask = Use.second;
3982+
3983+
Builder.SetInsertPoint(Shuffle);
3984+
Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
3985+
Value *NewShuffle = Builder.CreateShuffleVector(
3986+
NewLoad, PoisonValue::get(NewLoadTy), NewMask);
3987+
3988+
replaceValue(*Shuffle, *NewShuffle);
3989+
}
3990+
3991+
// NOTE(review): the old load and old shuffles are not erased here;
// presumably they are cleaned up elsewhere once dead -- confirm.
return true;
3992+
}
3993+
}
3994+
return false;
3995+
}
3996+
38653997
/// This is the entry point for all transforms. Pass manager differences are
38663998
/// handled in the callers of this function.
38673999
bool VectorCombine::run() {
@@ -3938,6 +4070,9 @@ bool VectorCombine::run() {
39384070
MadeChange |= foldSelectShuffle(I);
39394071
MadeChange |= foldShuffleToIdentity(I);
39404072
break;
4073+
case Instruction::Load:
4074+
MadeChange |= shrinkLoadForShuffles(I);
4075+
break;
39414076
case Instruction::BitCast:
39424077
MadeChange |= foldBitcastShuffle(I);
39434078
break;

llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@ $getAt = comdat any
1111

1212
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
1313
; SSE-LABEL: @ConvertVectors_ByRef(
14-
; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
15-
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
14+
; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
15+
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
1616
; SSE-NEXT: ret <4 x float> [[TMP3]]
1717
;
1818
; AVX-LABEL: @ConvertVectors_ByRef(
19-
; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
20-
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
19+
; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
20+
; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
2121
; AVX-NEXT: ret <4 x float> [[TMP3]]
2222
;
2323
%2 = alloca ptr, align 8

llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,7 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
252252
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
253253
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
254254
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
255-
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
256-
; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
255+
; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
257256
; CHECK-NEXT: ret <8 x i16> [[R]]
258257
;
259258
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -341,8 +340,7 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
341340
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
342341
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
343342
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
344-
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
345-
; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
343+
; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
346344
; CHECK-NEXT: ret <8 x i16> [[R]]
347345
;
348346
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0

llvm/test/Transforms/VectorCombine/X86/load-widening.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
443443

444444
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
445445
; CHECK-LABEL: @load_v2i32_v4i32_asan(
446-
; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
447-
; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
446+
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
447+
; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
448448
; CHECK-NEXT: ret <4 x i32> [[S]]
449449
;
450450
%l = load <2 x i32>, ptr %p, align 1

llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -47,21 +47,12 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
4747
; broadcast loads are free on AVX (and blends are much cheaper than general 2-operand shuffles)
4848

4949
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
50-
; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
51-
; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
52-
; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
53-
; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
54-
; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
55-
; SSE-NEXT: ret <4 x double> [[BLEND]]
56-
;
57-
; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
58-
; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
59-
; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
60-
; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
61-
; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
62-
; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
63-
; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
64-
; AVX-NEXT: ret <4 x double> [[BLEND]]
50+
; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
51+
; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
52+
; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
53+
; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
54+
; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
55+
; CHECK-NEXT: ret <4 x double> [[BLEND]]
6556
;
6657
%ld0 = load <4 x double>, ptr %p0, align 32
6758
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -81,3 +72,6 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
8172
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
8273
ret <2 x float> %s2
8374
}
75+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
76+
; AVX: {{.*}}
77+
; SSE: {{.*}}

0 commit comments

Comments
 (0)