Skip to content

Commit 21378fb

Browse files
authored
[VPlan] Merge fcmp uno feeding AnyOf. (llvm#166823)
Fold `any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ...` -> `any-of (fcmp uno %A, %B), ...`. This pattern is generated to check whether any vector lane is NaN, and combining multiple compares is beneficial on architectures that have dedicated instructions. Alive2 proof: https://alive2.llvm.org/ce/z/vA_aoM (this combine was suggested as part of llvm#161735). PR: llvm#166823
1 parent 7485f34 commit 21378fb

File tree

7 files changed

+48
-39
lines changed

7 files changed

+48
-39
lines changed

llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,10 @@ m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
420420
return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
421421
}
422422

423+
/// Matcher for a VPInstruction with the AnyOf opcode that takes no operand
/// sub-matchers; callers inspect the operands of the matched recipe themselves.
inline VPInstruction_match<VPInstruction::AnyOf> m_AnyOf() {
424+
return m_VPInstruction<VPInstruction::AnyOf>();
425+
}
426+
423427
template <typename Op0_t>
424428
inline VPInstruction_match<VPInstruction::AnyOf, Op0_t>
425429
m_AnyOf(const Op0_t &Op0) {

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
554554
case Instruction::ExtractValue:
555555
case Instruction::Freeze:
556556
case Instruction::Load:
557-
case VPInstruction::AnyOf:
558557
case VPInstruction::BranchOnCond:
559558
case VPInstruction::Broadcast:
560559
case VPInstruction::BuildStructVector:
@@ -594,6 +593,7 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
594593
case Instruction::GetElementPtr:
595594
case Instruction::PHI:
596595
case Instruction::Switch:
596+
case VPInstruction::AnyOf:
597597
case VPInstruction::SLPLoad:
598598
case VPInstruction::SLPStore:
599599
// Cannot determine the number of operands from the opcode.

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,6 +1241,35 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
12411241
}
12421242
}
12431243

1244+
// Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1245+
// any-of (fcmp uno %A, %B), ...
1246+
if (match(Def, m_AnyOf())) {
1247+
SmallVector<VPValue *, 4> NewOps;
1248+
VPRecipeBase *UnpairedCmp = nullptr;
1249+
for (VPValue *Op : Def->operands()) {
1250+
VPValue *X;
1251+
if (Op->getNumUsers() > 1 ||
1252+
!match(Op, m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
1253+
m_Deferred(X)))) {
1254+
NewOps.push_back(Op);
1255+
} else if (!UnpairedCmp) {
1256+
UnpairedCmp = Op->getDefiningRecipe();
1257+
} else {
1258+
NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1259+
UnpairedCmp->getOperand(0), X));
1260+
UnpairedCmp = nullptr;
1261+
}
1262+
}
1263+
1264+
if (UnpairedCmp)
1265+
NewOps.push_back(UnpairedCmp->getVPSingleValue());
1266+
1267+
if (NewOps.size() < Def->getNumOperands()) {
1268+
VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1269+
return Def->replaceAllUsesWith(NewAnyOf);
1270+
}
1271+
}
1272+
12441273
// Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
12451274
// This is useful for fmax/fmin without fast-math flags, where we need to
12461275
// check if any operand is NaN.

llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,8 @@ define float @fmaxnum(ptr %src, i64 %n) {
5959
; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
6060
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
6161
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
62-
; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
63-
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
64-
; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]]
65-
; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP4]]
66-
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]]
62+
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
63+
; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
6764
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
6865
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
6966
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]

llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,8 @@ define float @fminnum(ptr %src, i64 %n) {
5959
; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
6060
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
6161
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
62-
; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
63-
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
64-
; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
65-
; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
66-
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
62+
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
63+
; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
6764
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
6865
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
6966
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]

llvm/test/Transforms/LoopVectorize/fcmp-uno-fold-interleave.ll

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,11 @@ define float @fmaxnum(ptr %src, i64 %n) {
2828
; IC3-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD3]])
2929
; IC3-NEXT: [[TMP5]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD4]])
3030
; IC3-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12
31-
; IC3-NEXT: [[TMP6:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
32-
; IC3-NEXT: [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD3]]
3331
; IC3-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]]
34-
; IC3-NEXT: [[TMP9:%.*]] = freeze <4 x i1> [[TMP6]]
32+
; IC3-NEXT: [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]]
3533
; IC3-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP7]]
36-
; IC3-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP9]], [[TMP10]]
3734
; IC3-NEXT: [[TMP12:%.*]] = freeze <4 x i1> [[TMP8]]
38-
; IC3-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP12]]
35+
; IC3-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]]
3936
; IC3-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
4037
; IC3-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
4138
; IC3-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP15]]
@@ -86,17 +83,11 @@ define float @fmaxnum(ptr %src, i64 %n) {
8683
; IC4-NEXT: [[TMP6]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD5]])
8784
; IC4-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD6]])
8885
; IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
89-
; IC4-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
90-
; IC4-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]]
91-
; IC4-NEXT: [[TMP24:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]]
92-
; IC4-NEXT: [[TMP25:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]]
93-
; IC4-NEXT: [[TMP10:%.*]] = freeze <4 x i1> [[TMP8]]
94-
; IC4-NEXT: [[TMP11:%.*]] = freeze <4 x i1> [[TMP9]]
95-
; IC4-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP10]], [[TMP11]]
86+
; IC4-NEXT: [[TMP24:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
87+
; IC4-NEXT: [[TMP25:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD6]]
9688
; IC4-NEXT: [[TMP26:%.*]] = freeze <4 x i1> [[TMP24]]
97-
; IC4-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP12]], [[TMP26]]
9889
; IC4-NEXT: [[TMP28:%.*]] = freeze <4 x i1> [[TMP25]]
99-
; IC4-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP28]]
90+
; IC4-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP26]], [[TMP28]]
10091
; IC4-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP29]])
10192
; IC4-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
10293
; IC4-NEXT: [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP14]]
@@ -153,18 +144,12 @@ define float @fmaxnum(ptr %src, i64 %n) {
153144
; IC5-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD7]])
154145
; IC5-NEXT: [[TMP9]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI4]], <4 x float> [[WIDE_LOAD8]])
155146
; IC5-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 20
156-
; IC5-NEXT: [[TMP10:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
157-
; IC5-NEXT: [[TMP11:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]]
158-
; IC5-NEXT: [[TMP12:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]]
159-
; IC5-NEXT: [[TMP13:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD7]]
160147
; IC5-NEXT: [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD8]]
161-
; IC5-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP10]]
162-
; IC5-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP11]]
163-
; IC5-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP16]]
148+
; IC5-NEXT: [[TMP12:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD5]]
149+
; IC5-NEXT: [[TMP13:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD7]]
164150
; IC5-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP12]]
165-
; IC5-NEXT: [[TMP19:%.*]] = or <4 x i1> [[TMP17]], [[TMP18]]
166151
; IC5-NEXT: [[TMP20:%.*]] = freeze <4 x i1> [[TMP13]]
167-
; IC5-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP19]], [[TMP20]]
152+
; IC5-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP18]], [[TMP20]]
168153
; IC5-NEXT: [[TMP22:%.*]] = freeze <4 x i1> [[TMP14]]
169154
; IC5-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP21]], [[TMP22]]
170155
; IC5-NEXT: [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP23]])

llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,8 @@ define float @fmaxnum(ptr %src, i64 %n) {
5959
; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
6060
; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
6161
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
62-
; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
63-
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
64-
; CHECK-NEXT: [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
65-
; CHECK-NEXT: [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
66-
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
62+
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
63+
; CHECK-NEXT: [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
6764
; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
6865
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
6966
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]

0 commit comments

Comments
 (0)