diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index db4a5713a49a2..2ff93a02fb5c9 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5985,10 +5985,9 @@ static bool isMaskedLoadCompress(
   // Check for potential segmented(interleaved) loads.
   VectorType *AlignedLoadVecTy = getWidenedType(
       ScalarTy, getFullVectorNumberOfElements(TTI, ScalarTy, *Diff + 1));
-  if (!isSafeToLoadUnconditionally(
-          Ptr0, AlignedLoadVecTy, CommonAlignment, DL,
-          cast<Instruction>(Order.empty() ? VL.back() : VL[Order.back()]), &AC,
-          &DT, &TLI))
+  if (!isSafeToLoadUnconditionally(Ptr0, AlignedLoadVecTy, CommonAlignment,
+                                   DL, cast<Instruction>(VL.back()), &AC, &DT,
+                                   &TLI))
     AlignedLoadVecTy = LoadVecTy;
   if (TTI.isLegalInterleavedAccessType(AlignedLoadVecTy, CompressMask[1],
                                        CommonAlignment,
@@ -5998,9 +5997,6 @@ static bool isMaskedLoadCompress(
         Instruction::Load, AlignedLoadVecTy, CompressMask[1], std::nullopt,
         CommonAlignment, LI->getPointerAddressSpace(), CostKind, IsMasked);
-    if (!Mask.empty())
-      InterleavedCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
-                                          VecTy, Mask, CostKind);
     if (InterleavedCost < GatherCost) {
       InterleaveFactor = CompressMask[1];
       LoadVecTy = AlignedLoadVecTy;
     }
@@ -6008,6 +6004,8 @@ static bool isMaskedLoadCompress(
     }
   }
+  InstructionCost CompressCost = ::getShuffleCost(
+      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
   if (!Order.empty()) {
     SmallVector<int> NewMask(Sz, PoisonMaskElem);
     for (unsigned I : seq<unsigned>(Sz)) {
       NewMask[I] = CompressMask[Order[I]];
     }
     CompressMask.swap(NewMask);
   }
-  InstructionCost CompressCost = ::getShuffleCost(
-      TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind);
   InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
   return TotalVecCost < GatherCost;
 }
@@ -13626,10 +13622,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       SmallVector<Value *> PointerOps(Scalars.size());
       for (auto [I, V] : enumerate(Scalars))
         PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
-      (void)isMaskedLoadCompress(
+      [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
           Scalars, PointerOps, E->ReorderIndices, *TTI, *DL, *SE, *AC, *DT,
           *TLI, [](Value *) { return true; }, IsMasked, InterleaveFactor,
           CompressMask, LoadVecTy);
+      assert(IsVectorized && "Failed to vectorize load");
       CompressEntryToData.try_emplace(E, CompressMask, LoadVecTy,
                                       InterleaveFactor, IsMasked);
       Align CommonAlignment = LI0->getAlign();
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
index bce0884e92925..07094c642f8da 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-with-external-use-ptr.ll
@@ -15,10 +15,11 @@ define i16 @test() {
 ; CHECK-NEXT:    [[PEDGE_061_I:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[WHILE_BODY_I]] ], [ null, [[ENTRY]] ]
 ; CHECK-NEXT:    [[INCDEC_PTR_I]] = getelementptr [[S]], ptr [[PEDGE_061_I]], i64 -1
 ; CHECK-NEXT:    [[PPREV_0_I]] = getelementptr [[S]], ptr [[PPREV_062_I]], i64 -1
-; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x i16> @llvm.experimental.vp.strided.load.v2i16.p0.i64(ptr align 2 [[PPREV_0_I]], i64 4, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
-; CHECK-NEXT:    [[CMP_I178:%.*]] = icmp ult i16 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr [[PPREV_0_I]], i32 2, <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <3 x i16> [[TMP1]], <3 x i16> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; CHECK-NEXT:    [[CMP_I178:%.*]] = icmp ult i16 [[TMP4]], [[TMP3]]
 ; CHECK-NEXT:    br label [[WHILE_BODY_I]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
index 1b65a7ac1c311..4dd659a7ae802 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll
@@ -9,18 +9,20 @@ define void @test(ptr %mdct_forward_x) {
 ; CHECK:       [[FOR_COND]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[MDCT_FORWARD_X]], align 8
 ; CHECK-NEXT:    [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32
+; CHECK-NEXT:    [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40
 ; CHECK-NEXT:    [[ADD_PTR_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[ARRAYIDX2_I_I]], i64 -8, <2 x i1> splat (i1 true), i32 2)
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, <2 x ptr> [[TMP5]], <2 x i64>
-; CHECK-NEXT:    [[TMP7:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> [[TMP6]], i32 4, <2 x i1> splat (i1 true), <2 x float> poison)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ADD_PTR_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <2 x i32> <i32 2, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ARRAYIDX5_I_I]], i32 4, <3 x i1> <i1 true, i1 false, i1 true>, <3 x float> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> <i32 0, i32 2>
 ; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison)
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> , <4 x float> [[TMP10]], <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x float> , <4 x float> [[TMP22]], <4 x i32>
 ; CHECK-NEXT:    [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP4]], i64 0)
 ; CHECK-NEXT:    [[TMP13:%.*]] = fsub <4 x float> [[TMP9]], [[TMP12]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x float> [[TMP9]], [[TMP12]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
index 843d1cf46ffcc..7d65fe1bcde76 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll
@@ -9,17 +9,16 @@ define void @test() {
 ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[M1:%.*]] = alloca [[STRUCT_AE:%.*]], align 8
-; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[M1]], i64 8
+; CHECK-NEXT:    [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 48
 ; CHECK-NEXT:    [[ARRAYIDX_I4:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 40
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <5 x double>, ptr [[M1]], align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> poison, <4 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = load <6 x double>, ptr [[M1]], align 8
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX_I5_I]], align 8
-; CHECK-NEXT:    [[TMP4:%.*]] = load <7 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <7 x double> [[TMP4]], <7 x double> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32>
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <5 x i32>
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> [[TMP7]], <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <5 x double> [[TMP7]], <5 x double> [[TMP1]], <4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = fptosi <4 x double> [[TMP9]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x double>
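Note: the updated CHECK lines above all share one codegen shape, which this patch now costs more accurately (the compress shuffle is priced once, on the mask before reordering): a single contiguous masked load that also reads the dead gap lanes, followed by a shufflevector that compresses the live lanes. A minimal standalone sketch of that shape, matching the first test's <3 x i16> case (the function name and pointer are hypothetical, not taken from the patch):

declare <3 x i16> @llvm.masked.load.v3i16.p0(ptr, i32, <3 x i1>, <3 x i16>)

define <2 x i16> @compress_load_sketch(ptr %p) {
  ; Load lanes 0 and 2 of three contiguous i16s; the middle lane is masked off.
  %wide = call <3 x i16> @llvm.masked.load.v3i16.p0(ptr %p, i32 2, <3 x i1> <i1 true, i1 false, i1 true>, <3 x i16> poison)
  ; Compress the two live lanes into the <2 x i16> result.
  %res = shufflevector <3 x i16> %wide, <3 x i16> poison, <2 x i32> <i32 0, i32 2>
  ret <2 x i16> %res
}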