-
Notifications
You must be signed in to change notification settings - Fork 15.4k
Revert "[VectorCombine] Shrink loads used in shufflevector rebroadcasts. (#153138)" #153403
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
…ts. (llvm#153138)" This reverts commit 9115bef.
|
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-clang Author: Leon Clark (PeddleSpam) ChangesThis reverts commit 9115bef. Patch is 39.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/153403.diff 7 Files Affected:
diff --git a/clang/test/CodeGenOpenCL/preserve_vec3.cl b/clang/test/CodeGenOpenCL/preserve_vec3.cl
index e73657e30d884..49ebae6fc7013 100644
--- a/clang/test/CodeGenOpenCL/preserve_vec3.cl
+++ b/clang/test/CodeGenOpenCL/preserve_vec3.cl
@@ -11,8 +11,8 @@ typedef float float4 __attribute__((ext_vector_type(4)));
// CHECK-LABEL: define dso_local spir_kernel void @foo(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META6:![0-9]+]] !kernel_arg_type_qual [[META7:![0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[EXTRACTVEC1:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8:![0-9]+]]
// CHECK-NEXT: ret void
//
@@ -23,8 +23,8 @@ void kernel foo(global float3 *a, global float3 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float4_to_float3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[A:%.*]], ptr addrspace(1) noundef readonly align 16 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META12:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -35,8 +35,8 @@ void kernel float4_to_float3(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_float4(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META11]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[ASTYPE:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x float> [[ASTYPE]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
@@ -47,9 +47,9 @@ void kernel float3_to_float4(global float3 *a, global float4 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @float3_to_double2(
// CHECK-SAME: ptr addrspace(1) noundef readonly align 16 captures(none) [[A:%.*]], ptr addrspace(1) noundef writeonly align 16 captures(none) initializes((0, 16)) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META13:![0-9]+]] !kernel_arg_base_type [[META14:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr addrspace(1) [[A]], align 16
-// CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[LOADVECN:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16
+// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x float> [[LOADVECN]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[B]], align 16, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
void kernel float3_to_double2(global float3 *a, global double2 *b) {
@@ -59,8 +59,8 @@ void kernel float3_to_double2(global float3 *a, global double2 *b) {
// CHECK-LABEL: define dso_local spir_kernel void @char8_to_short3(
// CHECK-SAME: ptr addrspace(1) noundef writeonly align 8 captures(none) initializes((0, 8)) [[A:%.*]], ptr addrspace(1) noundef readonly align 8 captures(none) [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META16:![0-9]+]] !kernel_arg_type_qual [[META7]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = load <3 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
-// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i16> [[TMP0]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr addrspace(1) [[B]], align 8, !tbaa [[TBAA8]]
+// CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
// CHECK-NEXT: store <4 x i16> [[EXTRACTVEC]], ptr addrspace(1) [[A]], align 8, !tbaa [[TBAA8]]
// CHECK-NEXT: ret void
//
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c45005db6016b..c43505432367c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -16,7 +16,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -30,16 +29,13 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <numeric>
-#include <optional>
#include <queue>
#include <set>
-#include <tuple>
#define DEBUG_TYPE "vector-combine"
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -141,7 +137,6 @@ class VectorCombine {
bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
bool foldInterleaveIntrinsics(Instruction &I);
bool shrinkType(Instruction &I);
- bool shrinkLoadForShuffles(Instruction &I);
bool shrinkPhiOfShuffles(Instruction &I);
void replaceValue(Value &Old, Value &New) {
@@ -3868,133 +3863,6 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
return true;
}
-// Attempt to shrink loads that are only used by shufflevector instructions.
-bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
- auto *OldLoad = dyn_cast<LoadInst>(&I);
- if (!OldLoad || !OldLoad->isSimple())
- return false;
-
- auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
- if (!OldLoadTy)
- return false;
-
- unsigned const OldNumElements = OldLoadTy->getNumElements();
-
- // Search all uses of load. If all uses are shufflevector instructions, and
- // the second operands are all poison values, find the minimum and maximum
- // indices of the vector elements referenced by all shuffle masks.
- // Otherwise return `std::nullopt`.
- using IndexRange = std::pair<int, int>;
- auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
- IndexRange OutputRange = IndexRange(OldNumElements, -1);
- for (llvm::Use &Use : I.uses()) {
- // Ensure all uses match the required pattern.
- User *Shuffle = Use.getUser();
- ArrayRef<int> Mask;
-
- if (!match(Shuffle,
- m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
- return std::nullopt;
-
- // Ignore shufflevector instructions that have no uses.
- if (Shuffle->use_empty())
- continue;
-
- // Find the min and max indices used by the shufflevector instruction.
- for (int Index : Mask) {
- if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
- OutputRange.first = std::min(Index, OutputRange.first);
- OutputRange.second = std::max(Index, OutputRange.second);
- }
- }
- }
-
- if (OutputRange.second < OutputRange.first)
- return std::nullopt;
-
- return OutputRange;
- };
-
- // Get the range of vector elements used by shufflevector instructions.
- if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
- unsigned const NewNumElements = Indices->second + 1u;
-
- // If the range of vector elements is smaller than the full load, attempt
- // to create a smaller load.
- if (NewNumElements < OldNumElements) {
- IRBuilder Builder(&I);
- Builder.SetCurrentDebugLocation(I.getDebugLoc());
-
- // Calculate costs of old and new ops.
- Type *ElemTy = OldLoadTy->getElementType();
- FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
- Value *PtrOp = OldLoad->getPointerOperand();
-
- InstructionCost OldCost = TTI.getMemoryOpCost(
- Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
- InstructionCost NewCost =
- TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
- OldLoad->getPointerAddressSpace(), CostKind);
-
- using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
- SmallVector<UseEntry, 4u> NewUses;
- unsigned const MaxIndex = NewNumElements * 2u;
-
- for (llvm::Use &Use : I.uses()) {
- auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
- ArrayRef<int> OldMask = Shuffle->getShuffleMask();
-
- // Create entry for new use.
- NewUses.push_back({Shuffle, OldMask});
-
- // Validate mask indices.
- for (int Index : OldMask) {
- if (Index >= static_cast<int>(MaxIndex))
- return false;
- }
-
- // Update costs.
- OldCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- OldLoadTy, OldMask, CostKind);
- NewCost +=
- TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- NewLoadTy, OldMask, CostKind);
- }
-
- LLVM_DEBUG(
- dbgs() << "Found a load used only by shufflevector instructions: "
- << I << "\n OldCost: " << OldCost
- << " vs NewCost: " << NewCost << "\n");
-
- if (OldCost < NewCost || !NewCost.isValid())
- return false;
-
- // Create new load of smaller vector.
- auto *NewLoad = cast<LoadInst>(
- Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
- NewLoad->copyMetadata(I);
-
- // Replace all uses.
- for (UseEntry &Use : NewUses) {
- ShuffleVectorInst *Shuffle = Use.first;
- std::vector<int> &NewMask = Use.second;
-
- Builder.SetInsertPoint(Shuffle);
- Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
- Value *NewShuffle = Builder.CreateShuffleVector(
- NewLoad, PoisonValue::get(NewLoadTy), NewMask);
-
- replaceValue(*Shuffle, *NewShuffle);
- }
-
- return true;
- }
- }
- return false;
-}
-
// Attempt to narrow a phi of shufflevector instructions where the two incoming
// values have the same operands but different masks. If the two shuffle masks
// are offsets of one another we can use one branch to rotate the incoming
@@ -4166,9 +4034,6 @@ bool VectorCombine::run() {
MadeChange |= foldSelectShuffle(I);
MadeChange |= foldShuffleToIdentity(I);
break;
- case Instruction::Load:
- MadeChange |= shrinkLoadForShuffles(I);
- break;
case Instruction::BitCast:
MadeChange |= foldBitcastShuffle(I);
break;
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 86bdd125e5e57..f6e8fcd5d1d8c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -11,13 +11,13 @@ $getAt = comdat any
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
-; SSE-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT: ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
-; AVX-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[TMP0:%.*]], align 16
-; AVX-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
+; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT: ret <4 x float> [[TMP3]]
;
%2 = alloca ptr, align 8
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 0c2346e616e36..977da754ec5a7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -252,7 +252,8 @@ define <4 x i32> @unsafe_load_i32_insert_v4i32_addrspace(ptr align 16 dereferenc
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -340,7 +341,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
index eacc40bfa9b53..30a089818074e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-widening.ll
@@ -443,8 +443,8 @@ define <8 x float> @load_v2f32_v8f32_hwasan(ptr dereferenceable(32) %p) sanitize
define <4 x i32> @load_v2i32_v4i32_asan(ptr dereferenceable(16) %p) sanitize_address {
; CHECK-LABEL: @load_v2i32_v4i32_asan(
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[S:%.*]] = shufflevector <1 x i32> [[TMP1]], <1 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT: [[S:%.*]] = shufflevector <2 x i32> [[L]], <2 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: ret <4 x i32> [[S]]
;
%l = load <2 x i32>, ptr %p, align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
index eddfc57a7d256..b30dc9ffdc596 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll
@@ -47,12 +47,21 @@ define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) {
; broadcast loads are free on AVX (and blends are much cheap than general 2-operand shuffles)
define <4 x double> @blend_broadcasts_v4f64(ptr %p0, ptr %p1) {
-; CHECK-LABEL: define <4 x double> @blend_broadcasts_v4f64(
-; CHECK-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x double>, ptr [[P0]], align 32
-; CHECK-NEXT: [[TMP2:%.*]] = load <1 x double>, ptr [[P1]], align 32
-; CHECK-NEXT: [[BLEND:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 1, i32 0>
-; CHECK-NEXT: ret <4 x double> [[BLEND]]
+; SSE-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; SSE-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; SSE-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; SSE-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> [[LD1]], <4 x i32> <i32 0, i32 4, i32 4, i32 0>
+; SSE-NEXT: ret <4 x double> [[BLEND]]
+;
+; AVX-LABEL: define <4 x double> @blend_broadcasts_v4f64(
+; AVX-SAME: ptr [[P0:%.*]], ptr [[P1:%.*]]) #[[ATTR0]] {
+; AVX-NEXT: [[LD0:%.*]] = load <4 x double>, ptr [[P0]], align 32
+; AVX-NEXT: [[LD1:%.*]] = load <4 x double>, ptr [[P1]], align 32
+; AVX-NEXT: [[BCST0:%.*]] = shufflevector <4 x double> [[LD0]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BCST1:%.*]] = shufflevector <4 x double> [[LD1]], <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: [[BLEND:%.*]] = shufflevector <4 x double> [[BCST0]], <4 x double> [[BCST1]], <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; AVX-NEXT: ret <4 x double> [[BLEND]]
;
%ld0 = load <4 x double>, ptr %p0, align 32
%ld1 = load <4 x double>, ptr %p1, align 32
@@ -72,6 +81,3 @@ define <2 x float> @PR86068(<2 x float> %a0, <2 x float> %a1) {
%s2 = shufflevector <2 x float> %s1, <2 x float> %a0, <2 x i32> <i32 0, i32 3>
ret <2 x float> %s2
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX: {{.*}}
-; SSE: {{.*}}
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
deleted file mode 100644
index 7d9393ab77f20..0000000000000
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ /dev/null
@@ -1,404 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=vector-combine -S < %s | FileCheck %s
-
-define <8 x half> @shuffle_v4_v8f16_r0_1_volatile(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
-; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r0...
[truncated]
|
You can test this locally with the following command:git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Transforms/Vectorize/VectorCombine.cpp llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll llvm/test/Transforms/VectorCombine/X86/load-widening.ll llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.llThe following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields In tests, avoid using For example, this is considered a bad practice: define void @fn() {
...
br i1 undef, ...
}Please use the following instead: define void @fn(i1 %cond) {
...
br i1 %cond, ...
}Please refer to the Undefined Behavior Manual for more information. |
This reverts commit 9115bef.