From b22615bca9daca61393051a7aa2604f08ce0d7d8 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 7 Mar 2025 21:17:40 +0000
Subject: [PATCH] Revert "[VPlan] Fold NOT into predicate of wide compares. (#129430)"

This reverts commit cb3ce30ca813b4158b93756e465fa0afe71bfc49.
---
 llvm/lib/Transforms/Vectorize/VPlan.h         |   9 -
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  21 +-
 .../LoopVectorize/AArch64/masked-call.ll      |  12 +-
 .../LoopVectorize/AArch64/masked-op-cost.ll   |   4 +-
 .../AArch64/simple_early_exit.ll              |  15 +-
 .../LoopVectorize/AArch64/sve-select-cmp.ll   |  35 ++--
 .../LoopVectorize/AArch64/sve-tail-folding.ll |   3 +-
 .../AArch64/tail-fold-uniform-memops.ll       |   5 +-
 .../RISCV/select-cmp-reduction.ll             |  50 +++--
 ...rize-force-tail-with-evl-cond-reduction.ll |   5 +-
 .../LoopVectorize/X86/cost-model.ll           |   6 +-
 ...bounds-flags-for-reverse-vector-pointer.ll |   3 +-
 .../LoopVectorize/X86/gep-use-outside-loop.ll |   6 +-
 .../LoopVectorize/X86/induction-costs.ll      |  15 +-
 .../LoopVectorize/X86/masked-store-cost.ll    |  15 +-
 .../LoopVectorize/X86/masked_load_store.ll    | 180 ++++++++++++------
 .../X86/pr55096-scalarize-add.ll              |   3 +-
 .../test/Transforms/LoopVectorize/debugloc.ll |  10 +-
 .../epilog-vectorization-any-of-reductions.ll |  12 +-
 .../Transforms/LoopVectorize/if-reduction.ll  |   6 +-
 .../LoopVectorize/load-deref-pred-align.ll    |   9 +-
 .../Transforms/LoopVectorize/loop-form.ll     |   3 +-
 .../LoopVectorize/pointer-induction.ll        |   3 +-
 .../Transforms/LoopVectorize/select-cmp.ll    |  66 ++++---
 ...tion-start-value-may-be-undef-or-poison.ll |  51 ++---
 .../LoopVectorize/single_early_exit.ll        |   6 +-
 .../single_early_exit_live_outs.ll            |  54 ++++--
 .../vplan-sink-scalars-and-merge.ll           |  15 +-
 28 files changed, 381 insertions(+), 241 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 50e6209738c1f..b277ed4816b8e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -459,9 +459,6 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
   /// Returns the debug location of the recipe.
   DebugLoc getDebugLoc() const { return DL; }
 
-  /// Set the recipe's debug location to \p NewDL.
-  void setDebugLoc(DebugLoc NewDL) { DL = NewDL; }
-
 protected:
   /// Compute the cost of this recipe either using a recipe's specialized
   /// implementation or using the legacy cost model and the underlying
@@ -797,12 +794,6 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
     return CmpPredicate;
   }
 
-  void setPredicate(CmpInst::Predicate Pred) {
-    assert(OpType == OperationType::Cmp &&
-           "recipe doesn't have a compare predicate");
-    CmpPredicate = Pred;
-  }
-
   GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; }
 
   /// Returns true if the recipe has fast-math flags.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 5c9aa9220ca1d..7646350ca0ed2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -972,25 +972,8 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
   if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1))))
     return R.getVPSingleValue()->replaceAllUsesWith(A);
 
-  if (match(&R, m_Not(m_VPValue(A)))) {
-    if (match(A, m_Not(m_VPValue(A))))
-      return R.getVPSingleValue()->replaceAllUsesWith(A);
-
-    // Try to fold Not into compares by adjusting the predicate in-place.
-    if (isa<VPWidenRecipe>(A) && A->getNumUsers() == 1) {
-      auto *WideCmp = cast<VPWidenRecipe>(A);
-      if (WideCmp->getOpcode() == Instruction::ICmp ||
-          WideCmp->getOpcode() == Instruction::FCmp) {
-        WideCmp->setPredicate(
-            CmpInst::getInversePredicate(WideCmp->getPredicate()));
-        R.getVPSingleValue()->replaceAllUsesWith(WideCmp);
-        // If WideCmp doesn't have a debug location, use the one from the
-        // negation, to preserve the location.
-        if (!WideCmp->getDebugLoc() && R.getDebugLoc())
-          WideCmp->setDebugLoc(R.getDebugLoc());
-      }
-    }
-  }
+  if (match(&R, m_Not(m_Not(m_VPValue(A)))))
+    return R.getVPSingleValue()->replaceAllUsesWith(A);
 
   // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
   if ((match(&R,
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
index 0850648c518b0..07873fba86b6d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll
@@ -1037,7 +1037,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFALWAYS-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
 ; TFALWAYS-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
 ; TFALWAYS-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
-; TFALWAYS-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
+; TFALWAYS-NEXT:    [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFALWAYS-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
 ; TFALWAYS-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
 ; TFALWAYS-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
 ; TFALWAYS-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
@@ -1081,7 +1082,8 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFFALLBACK-NEXT:    [[TMP6:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
 ; TFFALLBACK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
 ; TFFALLBACK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
-; TFFALLBACK-NEXT:    [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
+; TFFALLBACK-NEXT:    [[TMP9:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFFALLBACK-NEXT:    [[TMP10:%.*]] = xor <2 x i1> [[TMP9]], splat (i1 true)
 ; TFFALLBACK-NEXT:    [[TMP11:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
 ; TFFALLBACK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
 ; TFFALLBACK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0
@@ -1131,8 +1133,10 @@ define void @test_widen_exp_v2(ptr noalias %p2, ptr noalias %p, i64 %n) #5 {
 ; TFA_INTERLEAVE-NEXT:    [[TMP10:%.*]] = tail call double @llvm.exp.f64(double [[TMP4]]) #[[ATTR7]]
 ; TFA_INTERLEAVE-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i32 0
 ; TFA_INTERLEAVE-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> [[TMP11]], double [[TMP10]], i32 1
-; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = fcmp ule <2 x double> [[TMP8]], zeroinitializer
-; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = fcmp ule <2 x double> [[TMP12]], zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP13:%.*]] = fcmp ogt <2 x double> [[TMP8]], zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP14:%.*]] = fcmp ogt <2 x double> [[TMP12]], zeroinitializer
+; TFA_INTERLEAVE-NEXT:    [[TMP15:%.*]] = xor <2 x i1> [[TMP13]], splat (i1 true)
+; TFA_INTERLEAVE-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[TMP14]], splat (i1 true)
 ; TFA_INTERLEAVE-NEXT:    [[TMP17:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[TMP18:%.*]] = select <2 x i1> [[ACTIVE_LANE_MASK2]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
 ; TFA_INTERLEAVE-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x double> splat (double 1.000000e+00), <2 x double> zeroinitializer
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
index 1bacae764f760..93bc131ee5c5a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll
@@ -5,8 +5,8 @@ target triple = "aarch64-unknown-linux-gnu"
 
 ; CHECK-COST: Checking a loop in 'fixed_width'
-; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<{{.+}}>, ir<2>, ir<{{.+}}>
-; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<{{.+}}>, ir<2>, ir<{{.+}}>
+; CHECK-COST: Cost of 10 for VF 2: WIDEN store vp<%6>, ir<2>, vp<%5>
+; CHECK-COST: Cost of 20 for VF 4: WIDEN store vp<%6>, ir<2>, vp<%5>
 ; CHECK-COST: Selecting VF: 1.
 
 ; We should decide this loop is not worth vectorising using fixed width vectors
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
index 53065a062328b..5f926db1131f6 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll
@@ -43,8 +43,9 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP10]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq <vscale x 16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
 ; CHECK-NEXT:    [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], [[TMP5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = xor <vscale x 16 x i1> [[TMP15]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[TMP16]])
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
@@ -125,8 +126,9 @@ define i64 @same_exit_block_pre_inc_use4() {
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 1
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp uge <2 x i64> [[VEC_IND]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult <2 x i64> [[VEC_IND]], [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
@@ -202,8 +204,9 @@ define i64 @loop_contains_safe_call() #1 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x
float>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]]) -; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oge <4 x float> [[TMP3]], splat (float 3.000000e+00) +; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ult <4 x float> [[TMP3]], splat (float 3.000000e+00) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -296,8 +299,9 @@ define i64 @loop_contains_safe_div() #1 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = udiv [[WIDE_LOAD]], splat (i32 20000) -; CHECK-NEXT: [[TMP15:%.*]] = icmp ne [[TMP13]], splat (i32 1) +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq [[TMP13]], splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX2]], [[TMP5]] +; CHECK-NEXT: [[TMP15:%.*]] = xor [[TMP14]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP15]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[INDEX1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] @@ -377,11 +381,12 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 1) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll index 1018bdd7a4ea1..daf29847e81d6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll @@ -10,8 +10,9 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-VF4IC1: vector.body: ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load -; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp ne [[VEC_LOAD]], splat (i32 3) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[VEC_ICMP]] +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], splat (i1 true) +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] ; CHECK-VF4IC1: middle.block: ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) ; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] @@ 
-24,14 +25,18 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] ; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] ; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] -; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp ne {{.*}}, splat (i32 3) -; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp ne {{.*}}, splat (i32 3) -; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp ne {{.*}}, splat (i32 3) -; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp ne {{.*}}, splat (i32 3) -; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or [[VEC_PHI1]], [[VEC_ICMP1]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or [[VEC_PHI2]], [[VEC_ICMP2]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or [[VEC_PHI3]], [[VEC_ICMP3]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or [[VEC_PHI4]], [[VEC_ICMP4]] +; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq {{.*}}, splat (i32 3) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq {{.*}}, splat (i32 3) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq {{.*}}, splat (i32 3) +; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq {{.*}}, splat (i32 3) +; CHECK-VF4IC4-NEXT: [[NOT1:%.*]] = xor [[VEC_ICMP1]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[NOT2:%.*]] = xor [[VEC_ICMP2]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[NOT3:%.*]] = xor [[VEC_ICMP3]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[NOT4:%.*]] = xor [[VEC_ICMP4]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or [[VEC_PHI1]], [[NOT1]] +; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or [[VEC_PHI2]], [[NOT2]] +; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or [[VEC_PHI3]], [[NOT3]] +; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or [[VEC_PHI4]], [[NOT4]] ; CHECK-VF4IC4: middle.block: ; CHECK-VF4IC4-NEXT: [[OR1:%.*]] = or [[VEC_SEL2]], [[VEC_SEL1]] ; CHECK-VF4IC4-NEXT: [[OR2:%.*]] = or [[VEC_SEL3]], [[OR1]] @@ -66,8 +71,9 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 ; CHECK-VF4IC1: vector.body: ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load -; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp ne [[VEC_LOAD]], splat (i32 3) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[VEC_ICMP]] +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], splat (i1 true) +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] ; CHECK-VF4IC1: middle.block: ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) ; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] @@ -99,8 +105,9 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-VF4IC1: vector.body: ; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load -; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast one [[VEC_LOAD]], splat (float 3.000000e+00) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[VEC_ICMP]] +; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast ueq [[VEC_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], splat (i1 true) +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] ; CHECK-VF4IC1: middle.block: ; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) 
; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 6de629aea25c6..75b2df93c9350 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -460,7 +460,8 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP14:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor [[TMP13]], splat (i1 true) ; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], i32 4, [[TMP15]], poison) ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[WIDE_MASKED_GATHER]], zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll index bcbc65f50cf2f..ed5467258c71f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/tail-fold-uniform-memops.ll @@ -62,8 +62,9 @@ define void @cond_uniform_load(ptr nocapture %dst, ptr nocapture readonly %src, ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[INIT_ACTIVE_LANE_MASK]], %vector.ph ], [ [[NEXT_ACTIVE_LANE_MASK:%.*]], %vector.body ] ; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0 ; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{%.*}}, i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[COND_LOAD]], zeroinitializer -; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer ; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> poison) entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll index d4c90b2ed7640..9a79b918de7e4 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll @@ -20,8 +20,9 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sge <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], 
[[BROADCAST_SPLAT]] +; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -49,8 +50,9 @@ define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 { ; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp sge [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[TMP8]] +; SCALABLE-NEXT: [[TMP8:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], splat (i1 true) +; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -93,8 +95,9 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast uge <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -122,8 +125,9 @@ define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 ; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast uge [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[TMP8]] +; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], splat (i1 true) +; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -164,8 +168,9 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat 
(i32 3) -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -191,8 +196,9 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { ; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ne [[WIDE_LOAD]], splat (i32 3) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[TMP8]] +; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], splat (i32 3) +; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], splat (i1 true) +; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -233,8 +239,9 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3) -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -260,8 +267,9 @@ define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 ; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp ne [[WIDE_LOAD]], splat (i32 3) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[TMP8]] +; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], splat (i32 3) +; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], splat (i1 true) +; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] @@ -302,8 +310,9 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; 
CHECK-NEXT: [[TMP4:%.*]] = fcmp fast one <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -329,8 +338,9 @@ define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 { ; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast one [[WIDE_LOAD]], splat (float 3.000000e+00) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[TMP8]] +; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast ueq [[WIDE_LOAD]], splat (float 3.000000e+00) +; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], splat (i1 true) +; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 8efd3f58e44d6..082386e39f3f6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -301,9 +301,10 @@ define i32 @cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]] ; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP11]]) -; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sle [[VP_OP_LOAD]], splat (i32 3) +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp sgt [[VP_OP_LOAD]], splat (i32 3) ; IF-EVL-OUTLOOP-NEXT: [[TMP19:%.*]] = add [[VEC_PHI]], [[VP_OP_LOAD]] -; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP18]], zeroinitializer +; IF-EVL-OUTLOOP-NEXT: [[TMP20:%.*]] = xor [[TMP18]], splat (i1 true) +; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = select [[TMP15]], [[TMP20]], zeroinitializer ; IF-EVL-OUTLOOP-NEXT: [[PREDPHI1:%.*]] = select [[TMP21]], [[VEC_PHI]], [[TMP19]] ; IF-EVL-OUTLOOP-NEXT: [[PREDPHI]] = call @llvm.vp.merge.nxv4i32( splat (i1 true), [[PREDPHI1]], [[VEC_PHI]], i32 [[TMP11]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP11]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index f18750644f8ff..b11f72c886bdc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -479,8 +479,10 @@ define i1 @any_of_cost(ptr %start, ptr %end) #0 { ; CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP13]], align 8 ; CHECK-NEXT: [[TMP20:%.*]] = 
insertelement <2 x ptr> poison, ptr [[TMP18]], i32 0 ; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x ptr> [[TMP20]], ptr [[TMP19]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <2 x ptr> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x ptr> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq <2 x ptr> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq <2 x ptr> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[TMP22]], splat (i1 true) +; CHECK-NEXT: [[TMP25:%.*]] = xor <2 x i1> [[TMP23]], splat (i1 true) ; CHECK-NEXT: [[TMP26]] = or <2 x i1> [[VEC_PHI]], [[TMP24]] ; CHECK-NEXT: [[TMP27]] = or <2 x i1> [[VEC_PHI3]], [[TMP25]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll index c188681cf0361..3d23090dd1235 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/drop-inbounds-flags-for-reverse-vector-pointer.ll @@ -22,7 +22,7 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], ; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], splat (i64 10) ; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], splat (i64 1) -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <4 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i32, ptr [[NNO]], i64 [[TMP22]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP23]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 -3 @@ -31,6 +31,7 @@ define i1 @fn(ptr %nno) #0 { ; CHECK-NEXT: [[REVERSE1:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[REVERSE1]], splat (i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], splat (i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[REVERSE1]], <4 x i32> [[TMP8]] ; CHECK-NEXT: [[TMP11]] = or <4 x i32> [[PREDPHI]], [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll index d3b027ce1b1c2..e22b090f6c0d0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll @@ -15,7 +15,8 @@ define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <4 x i16> [[WIDE_LOAD]], splat (i16 10) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 10) +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP12]], i32 2, <4 x i1> 
[[TMP7]]) @@ -90,7 +91,8 @@ define void @gep_use_outside_loop(ptr noalias %dst, ptr %src) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i16> [[WIDE_LOAD]], splat (i16 10) +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], splat (i16 10) +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0 ; CHECK-NEXT: call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <4 x i1> [[TMP5]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 47afc7c6def3d..6dc8fec23575a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -480,10 +480,14 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <8 x i64> [[STEP_ADD]], splat (i64 8) ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <8 x i64> [[STEP_ADD1]], splat (i64 8) ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt <8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <8 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <8 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <8 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <8 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <8 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <8 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i1> [[TMP6]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i1> [[TMP7]], splat (i1 true) ; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[BROADCAST_SPLAT5]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[BROADCAST_SPLAT5]], [[STEP_ADD]] ; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[BROADCAST_SPLAT5]], [[STEP_ADD1]] @@ -523,7 +527,8 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[LOOP_HEADER]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP_HEADER]] ] ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX4]], 0 -; CHECK-NEXT: [[TMP33:%.*]] = icmp ugt <4 x i64> [[VEC_IND5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp ule <4 x i64> [[VEC_IND5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[TMP32]], splat (i1 true) ; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll 
b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll index 31ca2d66a0c1e..aa91f61874a24 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked-store-cost.ll @@ -23,10 +23,14 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <8 x i64> [[STEP_ADD]], splat (i64 8) ; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <8 x i64> [[STEP_ADD1]], splat (i64 8) ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt <8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <8 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <8 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <8 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <8 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <8 x i64> [[STEP_ADD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <8 x i64> [[STEP_ADD1]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <8 x i64> [[STEP_ADD2]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i1> [[TMP5]], splat (i1 true) +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i1> [[TMP6]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i1> [[TMP7]], splat (i1 true) ; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[BROADCAST_SPLAT5]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[BROADCAST_SPLAT5]], [[STEP_ADD]] ; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[BROADCAST_SPLAT5]], [[STEP_ADD1]] @@ -66,7 +70,8 @@ define i32 @test_scalar_predicated_cost(i64 %x, i64 %y, ptr %A) #0 { ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[LOOP_HEADER]] ] ; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <4 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[LOOP_HEADER]] ] ; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[INDEX4]], 0 -; CHECK-NEXT: [[TMP33:%.*]] = icmp ugt <4 x i64> [[VEC_IND5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp ule <4 x i64> [[VEC_IND5]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <4 x i1> [[TMP32]], splat (i1 true) ; CHECK-NEXT: [[TMP34:%.*]] = or <4 x i64> [[BROADCAST_SPLAT10]], [[VEC_IND5]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP36:%.*]] = trunc <4 x i64> [[TMP34]] to <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index bbc08055f3f7f..206bbdf262b72 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1728,10 +1728,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], splat (i8 1) ; AVX1-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], splat (i8 1) ; AVX1-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], splat (i8 1) -; AVX1-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP6]], zeroinitializer -; AVX1-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP7]], zeroinitializer -; AVX1-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX1-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], 
zeroinitializer +; AVX1-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer +; AVX1-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer +; AVX1-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer +; AVX1-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true) +; AVX1-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true) +; AVX1-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true) +; AVX1-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true) ; AVX1-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0 ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4 @@ -1741,10 +1745,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison) ; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison) ; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP27:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP28:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP29:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP30:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX1-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX1-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX1-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], splat (i1 true) +; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], splat (i1 true) +; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], splat (i1 true) +; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], splat (i1 true) ; AVX1-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer @@ -1780,11 +1788,13 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 ; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1 ; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) -; AVX1-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer +; AVX1-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer +; AVX1-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true) ; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] ; AVX1-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 ; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX1-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; 
AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true) ; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] ; AVX1-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 @@ -1854,10 +1864,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], splat (i8 1) ; AVX2-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], splat (i8 1) ; AVX2-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], splat (i8 1) -; AVX2-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP6]], zeroinitializer -; AVX2-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP7]], zeroinitializer -; AVX2-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX2-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer +; AVX2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer +; AVX2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer +; AVX2-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer +; AVX2-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true) +; AVX2-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true) +; AVX2-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true) +; AVX2-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true) ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0 ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4 @@ -1867,10 +1881,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison) ; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison) ; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP27:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP28:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX2-NEXT: [[TMP29:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX2-NEXT: [[TMP30:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX2-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX2-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX2-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], splat (i1 true) +; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], splat (i1 true) +; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], splat (i1 true) +; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], splat (i1 true) ; AVX2-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer @@ -1906,11 +1924,13 @@ define 
void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 ; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1 ; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) -; AVX2-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer +; AVX2-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer +; AVX2-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true) ; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 ; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX2-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true) ; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] ; AVX2-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 @@ -1980,10 +2000,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP7:%.*]] = and <8 x i8> [[WIDE_LOAD1]], splat (i8 1) ; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD2]], splat (i8 1) ; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD3]], splat (i8 1) -; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP6]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP7]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP10:%.*]] = icmp eq <8 x i8> [[TMP6]], zeroinitializer +; AVX512-NEXT: [[TMP11:%.*]] = icmp eq <8 x i8> [[TMP7]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = icmp eq <8 x i8> [[TMP8]], zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = icmp eq <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP10]], splat (i1 true) +; AVX512-NEXT: [[TMP15:%.*]] = xor <8 x i1> [[TMP11]], splat (i1 true) +; AVX512-NEXT: [[TMP16:%.*]] = xor <8 x i1> [[TMP12]], splat (i1 true) +; AVX512-NEXT: [[TMP17:%.*]] = xor <8 x i1> [[TMP13]], splat (i1 true) ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8 @@ -1993,10 +2017,14 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison) ; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison) ; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP28:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX512-NEXT: [[TMP29:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX512-NEXT: 
[[TMP30:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP24:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX512-NEXT: [[TMP25:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX512-NEXT: [[TMP26:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP23]], splat (i1 true) +; AVX512-NEXT: [[TMP28:%.*]] = xor <8 x i1> [[TMP24]], splat (i1 true) +; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP25]], splat (i1 true) +; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP26]], splat (i1 true) ; AVX512-NEXT: [[TMP31:%.*]] = select <8 x i1> [[TMP14]], <8 x i1> [[TMP27]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP15]], <8 x i1> [[TMP28]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP16]], <8 x i1> [[TMP29]], <8 x i1> zeroinitializer @@ -2032,11 +2060,13 @@ define void @foo7(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 ; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP57]], align 1 ; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1) -; AVX512-NEXT: [[TMP46:%.*]] = icmp ne <8 x i8> [[TMP44]], zeroinitializer +; AVX512-NEXT: [[TMP45:%.*]] = icmp eq <8 x i8> [[TMP44]], zeroinitializer +; AVX512-NEXT: [[TMP46:%.*]] = xor <8 x i1> [[TMP45]], splat (i1 true) ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 ; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP48]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP50:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX512-NEXT: [[TMP49:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP49]], splat (i1 true) ; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] ; AVX512-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 @@ -2151,10 +2181,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], splat (i8 1) ; AVX1-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], splat (i8 1) ; AVX1-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], splat (i8 1) -; AVX1-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP6]], zeroinitializer -; AVX1-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP7]], zeroinitializer -; AVX1-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX1-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX1-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer +; AVX1-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer +; AVX1-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer +; AVX1-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer +; AVX1-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true) +; AVX1-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true) +; AVX1-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true) +; AVX1-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], splat 
(i1 true) ; AVX1-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] ; AVX1-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0 ; AVX1-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4 @@ -2164,10 +2198,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison) ; AVX1-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison) ; AVX1-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP27:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX1-NEXT: [[TMP28:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX1-NEXT: [[TMP29:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX1-NEXT: [[TMP30:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX1-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX1-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX1-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX1-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX1-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], splat (i1 true) +; AVX1-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], splat (i1 true) +; AVX1-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], splat (i1 true) +; AVX1-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], splat (i1 true) ; AVX1-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer @@ -2203,11 +2241,13 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 ; AVX1-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1 ; AVX1-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) -; AVX1-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer +; AVX1-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer +; AVX1-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 true) ; AVX1-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] ; AVX1-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 ; AVX1-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) -; AVX1-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX1-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX1-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true) ; AVX1-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX1-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] ; AVX1-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 @@ -2277,10 +2317,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP7:%.*]] = and <4 x i8> [[WIDE_LOAD1]], splat (i8 1) ; AVX2-NEXT: 
[[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD2]], splat (i8 1) ; AVX2-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD3]], splat (i8 1) -; AVX2-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP6]], zeroinitializer -; AVX2-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP7]], zeroinitializer -; AVX2-NEXT: [[TMP16:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer -; AVX2-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX2-NEXT: [[TMP10:%.*]] = icmp eq <4 x i8> [[TMP6]], zeroinitializer +; AVX2-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> [[TMP7]], zeroinitializer +; AVX2-NEXT: [[TMP12:%.*]] = icmp eq <4 x i8> [[TMP8]], zeroinitializer +; AVX2-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[TMP9]], zeroinitializer +; AVX2-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], splat (i1 true) +; AVX2-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP11]], splat (i1 true) +; AVX2-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP12]], splat (i1 true) +; AVX2-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP13]], splat (i1 true) ; AVX2-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] ; AVX2-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0 ; AVX2-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 4 @@ -2290,10 +2334,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP20]], i32 8, <4 x i1> [[TMP15]], <4 x ptr> poison) ; AVX2-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP21]], i32 8, <4 x i1> [[TMP16]], <4 x ptr> poison) ; AVX2-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP22]], i32 8, <4 x i1> [[TMP17]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP27:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX2-NEXT: [[TMP28:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX2-NEXT: [[TMP29:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX2-NEXT: [[TMP30:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX2-NEXT: [[TMP23:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX2-NEXT: [[TMP24:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX2-NEXT: [[TMP25:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX2-NEXT: [[TMP26:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX2-NEXT: [[TMP27:%.*]] = xor <4 x i1> [[TMP23]], splat (i1 true) +; AVX2-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP24]], splat (i1 true) +; AVX2-NEXT: [[TMP29:%.*]] = xor <4 x i1> [[TMP25]], splat (i1 true) +; AVX2-NEXT: [[TMP30:%.*]] = xor <4 x i1> [[TMP26]], splat (i1 true) ; AVX2-NEXT: [[TMP31:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP27]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP29]], <4 x i1> zeroinitializer @@ -2329,11 +2377,13 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX2-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 ; AVX2-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, ptr [[TMP57]], align 1 ; AVX2-NEXT: [[TMP44:%.*]] = and <4 x i8> [[WIDE_LOAD11]], splat (i8 1) -; AVX2-NEXT: [[TMP46:%.*]] = icmp ne <4 x i8> [[TMP44]], zeroinitializer +; AVX2-NEXT: [[TMP45:%.*]] = icmp eq <4 x i8> [[TMP44]], zeroinitializer +; AVX2-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP45]], splat (i1 
true) ; AVX2-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] ; AVX2-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 ; AVX2-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr [[TMP48]], i32 8, <4 x i1> [[TMP46]], <4 x ptr> poison) -; AVX2-NEXT: [[TMP50:%.*]] = icmp ne <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX2-NEXT: [[TMP49:%.*]] = icmp eq <4 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX2-NEXT: [[TMP50:%.*]] = xor <4 x i1> [[TMP49]], splat (i1 true) ; AVX2-NEXT: [[TMP51:%.*]] = select <4 x i1> [[TMP46]], <4 x i1> [[TMP50]], <4 x i1> zeroinitializer ; AVX2-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] ; AVX2-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 @@ -2403,10 +2453,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP7:%.*]] = and <8 x i8> [[WIDE_LOAD1]], splat (i8 1) ; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD2]], splat (i8 1) ; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD3]], splat (i8 1) -; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP6]], zeroinitializer -; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP7]], zeroinitializer -; AVX512-NEXT: [[TMP16:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer -; AVX512-NEXT: [[TMP17:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP10:%.*]] = icmp eq <8 x i8> [[TMP6]], zeroinitializer +; AVX512-NEXT: [[TMP11:%.*]] = icmp eq <8 x i8> [[TMP7]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = icmp eq <8 x i8> [[TMP8]], zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = icmp eq <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP10]], splat (i1 true) +; AVX512-NEXT: [[TMP15:%.*]] = xor <8 x i1> [[TMP11]], splat (i1 true) +; AVX512-NEXT: [[TMP16:%.*]] = xor <8 x i1> [[TMP12]], splat (i1 true) +; AVX512-NEXT: [[TMP17:%.*]] = xor <8 x i1> [[TMP13]], splat (i1 true) ; AVX512-NEXT: [[TMP18:%.*]] = getelementptr ptr, ptr [[IN:%.*]], i64 [[TMP0]] ; AVX512-NEXT: [[TMP19:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 0 ; AVX512-NEXT: [[TMP20:%.*]] = getelementptr ptr, ptr [[TMP18]], i32 8 @@ -2416,10 +2470,14 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP20]], i32 8, <8 x i1> [[TMP15]], <8 x ptr> poison) ; AVX512-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP21]], i32 8, <8 x i1> [[TMP16]], <8 x ptr> poison) ; AVX512-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP22]], i32 8, <8 x i1> [[TMP17]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer -; AVX512-NEXT: [[TMP28:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer -; AVX512-NEXT: [[TMP29:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer -; AVX512-NEXT: [[TMP30:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX512-NEXT: [[TMP23:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP24:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD4]], zeroinitializer +; AVX512-NEXT: [[TMP25:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD5]], zeroinitializer +; AVX512-NEXT: [[TMP26:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD6]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = xor <8 x i1> [[TMP23]], splat (i1 true) +; AVX512-NEXT: 
[[TMP28:%.*]] = xor <8 x i1> [[TMP24]], splat (i1 true) +; AVX512-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP25]], splat (i1 true) +; AVX512-NEXT: [[TMP30:%.*]] = xor <8 x i1> [[TMP26]], splat (i1 true) ; AVX512-NEXT: [[TMP31:%.*]] = select <8 x i1> [[TMP14]], <8 x i1> [[TMP27]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP15]], <8 x i1> [[TMP28]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP16]], <8 x i1> [[TMP29]], <8 x i1> zeroinitializer @@ -2455,11 +2513,13 @@ define void @foo8(ptr noalias nocapture %out, ptr noalias nocapture readonly %in ; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i8, ptr [[TMP56]], i32 0 ; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, ptr [[TMP57]], align 1 ; AVX512-NEXT: [[TMP44:%.*]] = and <8 x i8> [[WIDE_LOAD11]], splat (i8 1) -; AVX512-NEXT: [[TMP46:%.*]] = icmp ne <8 x i8> [[TMP44]], zeroinitializer +; AVX512-NEXT: [[TMP45:%.*]] = icmp eq <8 x i8> [[TMP44]], zeroinitializer +; AVX512-NEXT: [[TMP46:%.*]] = xor <8 x i1> [[TMP45]], splat (i1 true) ; AVX512-NEXT: [[TMP47:%.*]] = getelementptr ptr, ptr [[IN]], i64 [[TMP55]] ; AVX512-NEXT: [[TMP48:%.*]] = getelementptr ptr, ptr [[TMP47]], i32 0 ; AVX512-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call <8 x ptr> @llvm.masked.load.v8p0.p0(ptr [[TMP48]], i32 8, <8 x i1> [[TMP46]], <8 x ptr> poison) -; AVX512-NEXT: [[TMP50:%.*]] = icmp ne <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX512-NEXT: [[TMP49:%.*]] = icmp eq <8 x ptr> [[WIDE_MASKED_LOAD12]], zeroinitializer +; AVX512-NEXT: [[TMP50:%.*]] = xor <8 x i1> [[TMP49]], splat (i1 true) ; AVX512-NEXT: [[TMP51:%.*]] = select <8 x i1> [[TMP46]], <8 x i1> [[TMP50]], <8 x i1> zeroinitializer ; AVX512-NEXT: [[TMP52:%.*]] = getelementptr double, ptr [[OUT]], i64 [[TMP55]] ; AVX512-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[TMP52]], i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll index e888ad3b8eb4e..0ccffe76fe477 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr55096-scalarize-add.ll @@ -14,7 +14,8 @@ define void @test_pr55096(i64 %c, ptr %p) { ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[INDEX]] to i16 ; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DOTCAST]], 2008 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i16 6229, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], splat (i1 true) ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: diff --git a/llvm/test/Transforms/LoopVectorize/debugloc.ll b/llvm/test/Transforms/LoopVectorize/debugloc.ll index e5d6adac14f36..50d42bbc3a18d 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc.ll @@ -61,8 +61,9 @@ define i32 @test_debug_loc_on_branch_in_loop(ptr noalias %src, ptr noalias %dst) ; CHECK-LABEL: define i32 @test_debug_loc_on_branch_in_loop( ; CHECK-LABEL: vector.body: ; CHECK: [[LOAD:%.+]] = load <2 x i32>, ptr {{.+}}, align 4 -; CHECK-NEXT: [[CMP:%.+]] = icmp ne <2 x i32> [[LOAD]], splat (i32 10), !dbg [[LOC3:!.+]] -; CHECK-NEXT: [[EXT:%.+]] = extractelement <2 x i1> [[CMP]], i32 0, !dbg [[LOC3]] +; CHECK-NEXT: [[CMP:%.+]] = 
icmp eq <2 x i32> [[LOAD]], splat (i32 10)
+; CHECK-NEXT: [[XOR:%.+]] = xor <2 x i1> [[CMP:%.+]], splat (i1 true), !dbg [[LOC3:!.+]]
+; CHECK-NEXT: [[EXT:%.+]] = extractelement <2 x i1> [[XOR]], i32 0, !dbg [[LOC3]]
 ; CHECK-NEXT: br i1 [[EXT]], label %pred.store.if, label %pred.store.continue, !dbg [[LOC3]]
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.if:
@@ -99,8 +100,9 @@ define i32 @test_different_debug_loc_on_replicate_recipe(ptr noalias %src, ptr n
 ; CHECK-LABEL: define i32 @test_different_debug_loc_on_replicate_recipe(
 ; CHECK-LABEL: vector.body:
 ; CHECK: [[LOAD:%.+]] = load <2 x i32>, ptr {{.+}}, align 4
-; CHECK-NEXT: [[CMP:%.+]] = icmp ne <2 x i32> [[LOAD]], splat (i32 10), !dbg [[LOC4:!.+]]
-; CHECK-NEXT: [[EXT:%.+]] = extractelement <2 x i1> [[CMP]], i32 0, !dbg [[LOC4]]
+; CHECK-NEXT: [[CMP:%.+]] = icmp eq <2 x i32> [[LOAD]], splat (i32 10)
+; CHECK-NEXT: [[XOR:%.+]] = xor <2 x i1> [[CMP:%.+]], splat (i1 true), !dbg [[LOC4:!.+]]
+; CHECK-NEXT: [[EXT:%.+]] = extractelement <2 x i1> [[XOR]], i32 0, !dbg [[LOC4]]
 ; CHECK-NEXT: br i1 [[EXT]], label %pred.store.if, label %pred.store.continue, !dbg [[LOC4]]
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.if:
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index 2459191db49e3..8ae3e8f46c8b2 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -222,7 +222,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], splat (i1 true)
 ; CHECK-NEXT: [[TMP3]] = or <4 x i1> [[VEC_PHI]], [[TMP2]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4)
@@ -259,7 +260,8 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
 ; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND11:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <4 x i32> [[VEC_IND11]], [[BROADCAST_SPLAT14]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[VEC_IND11]], [[BROADCAST_SPLAT14]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
 ; CHECK-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI10]], [[TMP9]]
 ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX9]], 4
 ; CHECK-NEXT: [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], splat (i32 4)
@@ -356,7 +358,8 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) {
 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i64> [[TMP16]], i64 [[TMP13]], i32 1
 ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i64> [[TMP17]], i64 
[[TMP14]], i32 2 ; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP18]], i64 [[TMP15]], i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i64> [[TMP19]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq <4 x i64> [[TMP19]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP20]], splat (i1 true) ; CHECK-NEXT: [[RDX_SELECT_CMP]] = or <4 x i1> [[VEC_PHI]], [[TMP21]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -410,7 +413,8 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) { ; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x i64> [[TMP38]], i64 [[TMP35]], i32 1 ; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i64> [[TMP39]], i64 [[TMP36]], i32 2 ; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i64> [[TMP40]], i64 [[TMP37]], i32 3 -; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <4 x i64> [[TMP41]], [[BROADCAST_SPLAT19]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq <4 x i64> [[TMP41]], [[BROADCAST_SPLAT19]] +; CHECK-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[TMP42]], splat (i1 true) ; CHECK-NEXT: [[TMP43]] = or <4 x i1> [[VEC_PHI12]], [[TMP46]] ; CHECK-NEXT: [[INDEX_NEXT20]] = add nuw i64 [[INDEX11]], 4 ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT20]], [[N_VEC8]] diff --git a/llvm/test/Transforms/LoopVectorize/if-reduction.ll b/llvm/test/Transforms/LoopVectorize/if-reduction.ll index e985d26557633..f266a13924f06 100644 --- a/llvm/test/Transforms/LoopVectorize/if-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/if-reduction.ll @@ -1218,7 +1218,8 @@ define float @fcmp_multi(ptr nocapture readonly %a, i32 %n) nounwind readonly { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = fcmp uge <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP6]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x float> [[WIDE_LOAD]], splat (float 2.000000e+00) @@ -1343,9 +1344,10 @@ define float @fcmp_fadd_fsub(ptr nocapture readonly %a, i32 %n) nounwind readonl ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt <4 x float> [[WIDE_LOAD]], splat (float 1.000000e+00) ; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) -; CHECK-NEXT: [[TMP8:%.*]] = fcmp uge <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) ; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP6]] ; CHECK-NEXT: [[PREDPHI1]] = select <4 x i1> [[TMP9]], <4 x float> [[VEC_PHI]], <4 x float> [[PREDPHI]] diff --git 
a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
index 219aace657a23..5fcd0de911f85 100644
--- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
+++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll
@@ -349,7 +349,8 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) {
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1
@@ -460,7 +461,8 @@ define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %des
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 -1
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4
 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i1> [[TMP5]], splat (i1 true)
 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
 ; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK: pred.store.if:
@@ -663,7 +665,8 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 -1
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4
 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3)
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true)
 ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0
 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]]
diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll
index f3d0ee82a0879..4a9380b3f35e8 100644
--- a/llvm/test/Transforms/LoopVectorize/loop-form.ll
+++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll
@@ -1089,7 +1089,8 @@ define void @scalar_predication(ptr %addr) {
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr [[ADDR:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[TMP1]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP2]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = fcmp une <2 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = fcmp oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
 ; CHECK-NEXT: br i1 [[TMP5]], 
label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK: pred.store.if:
diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
index 9fd488e86f142..531164a2c5dd0 100644
--- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
+++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll
@@ -31,7 +31,8 @@ define void @a(ptr readnone %b) {
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -3
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <4 x i8> [[REVERSE]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i8> [[REVERSE]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP7]], i32 0
 ; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK: pred.store.if:
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 0cf27a1bba1b3..550e52d318230 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -20,7 +20,8 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
 ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
 ; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true)
 ; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]]
 ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
@@ -74,10 +75,14 @@ define i32 @select_const_i32_from_icmp(ptr %v, i64 %n) {
 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4
 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3)
-; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD4]], splat (i32 3)
-; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD5]], splat (i32 3)
-; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD6]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD4]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD5]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD6]], splat (i32 3)
+; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true)
+; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true)
 ; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI]], [[TMP10]]
 ; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> 
[[VEC_PHI1]], [[TMP11]] ; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]] @@ -406,7 +411,8 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] ; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) ; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -460,10 +466,14 @@ define i32 @select_i32_from_icmp(ptr %v, i32 %a, i32 %b, i64 %n) { ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 -; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 3) -; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD4]], splat (i32 3) -; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD5]], splat (i32 3) -; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD6]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD4]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD5]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD6]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true) ; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI]], [[TMP10]] ; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]] ; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]] @@ -601,7 +611,8 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]] ; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = fcmp fast one <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) ; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -655,10 +666,14 @@ define i32 @select_const_i32_from_fcmp_fast(ptr %v, i64 %n) { ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], 
align 4 -; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = fcmp fast one <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) -; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = fcmp fast one <4 x float> [[WIDE_LOAD4]], splat (float 3.000000e+00) -; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = fcmp fast one <4 x float> [[WIDE_LOAD5]], splat (float 3.000000e+00) -; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = fcmp fast one <4 x float> [[WIDE_LOAD6]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD4]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD5]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD6]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true) ; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI]], [[TMP10]] ; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]] ; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]] @@ -796,7 +811,8 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) { ; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]] ; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = fcmp one <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) ; CHECK-VF4IC1-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] ; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -850,10 +866,14 @@ define i32 @select_const_i32_from_fcmp(ptr %v, i64 %n) { ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 -; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = fcmp one <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) -; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = fcmp one <4 x float> [[WIDE_LOAD4]], splat (float 3.000000e+00) -; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = fcmp one <4 x float> [[WIDE_LOAD5]], splat (float 3.000000e+00) -; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = fcmp one <4 x float> [[WIDE_LOAD6]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD4]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD5]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = fcmp ueq <4 x float> [[WIDE_LOAD6]], splat (float 3.000000e+00) +; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP6]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true) +; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) +; 
CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], splat (i1 true) ; CHECK-VF4IC4-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI]], [[TMP10]] ; CHECK-VF4IC4-NEXT: [[TMP15]] = or <4 x i1> [[VEC_PHI1]], [[TMP11]] ; CHECK-VF4IC4-NEXT: [[TMP16]] = or <4 x i1> [[VEC_PHI2]], [[TMP12]] @@ -985,7 +1005,8 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 ; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3) +; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true) ; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC1: [[VECTOR_BODY]]: ; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] @@ -1026,7 +1047,8 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) { ; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 ; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = icmp eq <4 x i32> [[BROADCAST_SPLAT]], splat (i32 3) +; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true) ; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4IC4: [[VECTOR_BODY]]: ; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll index fc990ff0b0adf..34fc0587c5d87 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll @@ -11,21 +11,22 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1) -; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[TMP3]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5]] = or <2 x 
i1> [[VEC_PHI]], [[TMP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = freeze i1 [[TMP6]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i64 [[A]], i64 undef +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i64 [[A]], i64 undef ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] @@ -73,21 +74,22 @@ define i64 @pr62565_incoming_value_known_poison(i64 %a, ptr %src) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1) -; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[TMP3]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5]] = or <2 x i1> [[VEC_PHI]], [[TMP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = freeze i1 [[TMP6]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i64 [[A]], i64 poison +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i64 [[A]], i64 poison ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] @@ -135,21 +137,22 @@ define i64 @pr62565_incoming_value_may_be_poison(i64 %a, ptr %src, i64 %start) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 
x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[WIDE_LOAD]], splat (i32 1) -; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[TMP3]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5]] = or <2 x i1> [[VEC_PHI]], [[TMP4]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]]) -; CHECK-NEXT: [[TMP7:%.*]] = freeze i1 [[TMP6]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i64 [[A]], i64 [[START]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i64 [[A]], i64 [[START]] ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 9e055356ef79c..51eb2f650738c 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -24,8 +24,9 @@ define i64 @same_exit_block_phi_of_consts() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] @@ -102,8 +103,9 @@ define i64 @diff_exit_block_phi_of_consts() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; 
CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 957769e53af7a..e24c6090b704b 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -24,8 +24,9 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -193,8 +194,9 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -275,8 +277,9 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -357,8 +360,9 @@ define i64 @same_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: 
[[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -438,8 +442,9 @@ define i64 @same_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -519,8 +524,9 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp uge <4 x i64> [[VEC_IND]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -600,12 +606,13 @@ define i64 @same_exit_block_post_inc_use() { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP2]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP3]], 1 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) ; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -697,8 +704,9 @@ define i64 @same_exit_block_post_inc_use2() { ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i64> [[TMP12]], i64 [[TMP9]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP13]], i64 [[TMP10]], i32 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i64> [[TMP14]], i64 [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP16]], splat (i1 true) ; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; 
CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -780,8 +788,9 @@ define i64 @diff_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -869,8 +878,9 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -957,8 +967,9 @@ define i64 @diff_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD3]] ; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX2]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -1048,12 +1059,13 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP2]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP3]], 1 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP8]], splat (i1 true) ; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -1152,8 +1164,9 @@ define i64 @diff_exit_block_post_inc_use2() { ; CHECK-NEXT: [[TMP14:%.*]] = 
getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]] ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP16]], splat (i1 true) ; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP17]]) ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -1240,8 +1253,9 @@ define i64 @loop_contains_safe_call() { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> [[WIDE_LOAD]]) -; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oge <4 x float> [[TMP3]], splat (float 3.000000e+00) +; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ult <4 x float> [[TMP3]], splat (float 3.000000e+00) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -1318,8 +1332,9 @@ define i64 @loop_contains_safe_div() { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = udiv <4 x i32> [[WIDE_LOAD]], splat (i32 20000) -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP3]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP3]], splat (i32 1) ; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -1394,11 +1409,12 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i32> [[WIDE_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], splat (i32 1) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[P2]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4) @@ -1482,8 +1498,9 @@ define i64 @same_exit_block_pre_inc_use1_reverse() { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 -3 ; CHECK-NEXT: 
[[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP6]], align 1
 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i8> [[WIDE_LOAD2]], <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <4 x i8> [[REVERSE]], [[REVERSE3]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i8> [[REVERSE]], [[REVERSE3]]
 ; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP7]], splat (i1 true)
 ; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 1020
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
@@ -1561,8 +1578,9 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[TMP0]]
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP4]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], [[WIDE_LOAD2]]
 ; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX1]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], splat (i1 true)
 ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]])
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 64
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index 241b7d3380ba7..aa05bb153966e 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -274,8 +274,9 @@ define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT: EMIT vp<[[WIDE_CAN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDE_CAN_IV]]>, vp<[[BTC]]>
 ; CHECK-NEXT: CLONE ir<%lv> = load ir<%gep.A.uniform>
-; CHECK-NEXT: WIDEN ir<%cmp> = icmp uge ir<%iv>, ir<%k>
-; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = logical-and vp<[[MASK]]>, ir<%cmp>
+; CHECK-NEXT: WIDEN ir<%cmp> = icmp ult ir<%iv>, ir<%k>
+; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp>
+; CHECK-NEXT: EMIT vp<[[MASK2:%.+]]> = logical-and vp<[[MASK]]>, vp<[[NOT2]]>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store: {
@@ -981,12 +982,13 @@ define void @sinking_requires_duplication(ptr %addr) {
 ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%addr>, vp<[[STEPS]]>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = vector-pointer ir<%gep>
 ; CHECK-NEXT: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
-; CHECK-NEXT: WIDEN ir<%pred> = fcmp une ir<%0>, ir<0.000000e+00>
+; CHECK-NEXT: WIDEN ir<%pred> = fcmp oeq ir<%0>, ir<0.000000e+00>
+; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = not ir<%pred>
 ; CHECK-NEXT: Successor(s): pred.store
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store: {
 ; CHECK-NEXT: pred.store.entry:
-; CHECK-NEXT: BRANCH-ON-MASK ir<%pred>
+; CHECK-NEXT: BRANCH-ON-MASK vp<[[MASK]]>
 ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue
 ; CHECK-EMPTY:
 ; CHECK-NEXT: pred.store.if:
@@ -1147,12 +1149,13 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: CLONE ir<%ptr.iv.next> = getelementptr inbounds vp<[[PTR_IV]]>, ir<-1>
 ; CHECK-NEXT: vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%ptr.iv.next>, vp<[[VF]]> 
; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> -; CHECK-NEXT: WIDEN ir<%c.1> = icmp ne ir<%l>, ir<0> +; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<0> +; CHECK-NEXT: EMIT vp<[[NEG:%.+]]> = not ir<%c.1> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { ; CHECK-NEXT: pred.store.entry: -; CHECK-NEXT: BRANCH-ON-MASK ir<%c.1> +; CHECK-NEXT: BRANCH-ON-MASK vp<[[NEG]]> ; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue ; CHECK-EMPTY: ; CHECK-NEXT: pred.store.if: