diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 47d8cc260511e..773307bd5c394 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -417,6 +417,10 @@ m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) { return m_VPInstruction(Op0, Op1); } +inline VPInstruction_match m_AnyOf() { + return m_VPInstruction(); +} + template inline VPInstruction_match m_AnyOf(const Op0_t &Op0) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index d05c22e3aeb61..0661de3f99d08 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1220,6 +1220,37 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { } } + // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... -> + // any-of (fcmp uno %A, %B), ... + if (match(Def, m_AnyOf())) { + SmallVector NewOps; + VPRecipeBase *UnpairedCmp = nullptr; + for (VPValue *Op : Def->operands()) { + VPValue *X; + if (Op->getNumUsers() > 1 || + !match(Op, m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X), + m_Deferred(X)))) { + NewOps.push_back(Op); + continue; + } + if (UnpairedCmp) { + NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO, + UnpairedCmp->getOperand(0), X)); + UnpairedCmp = nullptr; + } else { + UnpairedCmp = Op->getDefiningRecipe(); + } + } + + if (UnpairedCmp) + NewOps.push_back(UnpairedCmp->getVPSingleValue()); + + if (NewOps.size() < Def->getNumOperands()) { + VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps); + return Def->replaceAllUsesWith(NewAnyOf); + } + } + // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y // This is useful for fmax/fmin without fast-math flags, where we need to // check if any operand is NaN. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll index f3d649b899686..6902dd990509e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll @@ -59,11 +59,8 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll index 1cc4c152649b4..193424d3eb70a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll @@ -59,11 +59,8 @@ define float @fminnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] diff --git a/llvm/test/Transforms/LoopVectorize/fcmp-uno-fold-interleave.ll b/llvm/test/Transforms/LoopVectorize/fcmp-uno-fold-interleave.ll index 7b8e1997a5011..22226a711bcf0 100644 --- a/llvm/test/Transforms/LoopVectorize/fcmp-uno-fold-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fcmp-uno-fold-interleave.ll @@ -28,14 +28,11 @@ define float @fmaxnum(ptr %src, i64 %n) { ; IC3-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD3]]) ; IC3-NEXT: [[TMP5]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD4]]) ; IC3-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12 -; IC3-NEXT: [[TMP6:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; IC3-NEXT: [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD3]] ; IC3-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]] -; IC3-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP6]] +; IC3-NEXT: [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]] ; IC3-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP7]] -; IC3-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP9]], [[TMP10]] ; IC3-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP8]] -; IC3-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP12]] +; IC3-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]] ; IC3-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]]) ; IC3-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC3-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP15]] @@ -86,17 +83,11 @@ define float @fmaxnum(ptr %src, i64 %n) { ; IC4-NEXT: [[TMP6]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD5]]) ; IC4-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD6]]) ; IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; IC4-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; IC4-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]] -; IC4-NEXT: [[TMP24:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]] -; IC4-NEXT: [[TMP25:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]] -; IC4-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP8]] -; IC4-NEXT: [[TMP11:%.*]] = freeze <4 x i1> [[TMP9]] -; IC4-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP10]], [[TMP11]] +; IC4-NEXT: [[TMP24:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]] +; IC4-NEXT: [[TMP25:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD6]] ; IC4-NEXT: [[TMP26:%.*]] = freeze <4 x i1> [[TMP24]] -; IC4-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP12]], [[TMP26]] ; IC4-NEXT: [[TMP28:%.*]] = freeze <4 x i1> [[TMP25]] -; IC4-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP28]] +; IC4-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP26]], [[TMP28]] ; IC4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP29]]) ; IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; IC4-NEXT: [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP14]] @@ -153,18 +144,12 @@ define float @fmaxnum(ptr %src, i64 %n) { ; IC5-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD7]]) ; IC5-NEXT: [[TMP9]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI4]], <4 x float> [[WIDE_LOAD8]]) ; IC5-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 20 -; IC5-NEXT: [[TMP10:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; IC5-NEXT: [[TMP11:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]] -; IC5-NEXT: [[TMP12:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]] -; IC5-NEXT: [[TMP13:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD7]] ; IC5-NEXT: [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD8]] -; IC5-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP10]] -; IC5-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP11]] -; IC5-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP16]] +; IC5-NEXT: [[TMP12:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD5]] +; IC5-NEXT: [[TMP13:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD7]] ; IC5-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP12]] -; IC5-NEXT: [[TMP19:%.*]] = or <4 x i1> [[TMP17]], [[TMP18]] ; IC5-NEXT: [[TMP20:%.*]] = freeze <4 x i1> [[TMP13]] -; IC5-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP19]], [[TMP20]] +; IC5-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP18]], [[TMP20]] ; IC5-NEXT: [[TMP22:%.*]] = freeze <4 x i1> [[TMP14]] ; IC5-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP21]], [[TMP22]] ; IC5-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP23]]) diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index ca6e5bc2d0dcb..0745f286b2608 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -59,11 +59,8 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]] -; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]