Skip to content

Commit 27c65d7

Browse files
fhahngit-crd
authored andcommitted
[VPlan] Merge fcmp uno feeding Or. (llvm#167251)
Fold or (fcmp uno %A, %A), (fcmp uno %B, %B), ... -> or (fcmp uno %A, %B), ... This pattern is generated to check if any vector lane is NaN, and combining multiple compares is beneficial on architectures that have dedicated instructions. Alive2 Proof: https://alive2.llvm.org/ce/z/vA_aoM Combine suggested as part of llvm#161735 PR: llvm#167251
1 parent e843d07 commit 27c65d7

File tree

4 files changed

+21
-20
lines changed

4 files changed

+21
-20
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,6 +1220,17 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
12201220
}
12211221
}
12221222

1223+
// Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1224+
// This is useful for fmax/fmin without fast-math flags, where we need to
1225+
// check if any operand is NaN.
1226+
if (match(Def, m_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
1227+
m_Deferred(X)),
1228+
m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y),
1229+
m_Deferred(Y))))) {
1230+
VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1231+
return Def->replaceAllUsesWith(NewCmp);
1232+
}
1233+
12231234
// Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
12241235
if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
12251236
match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&

llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,10 @@ define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) {
142142
; CHECK-NEXT: [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]])
143143
; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]])
144144
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
145-
; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]]
146-
; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]]
147-
; CHECK-NEXT: [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
148-
; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]]
149-
; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP8]], [[TMP14]]
150-
; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP15]]
151-
; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP12]]
152-
; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP13]]
145+
; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]]
146+
; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]]
147+
; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]]
148+
; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]]
153149
; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]]
154150
; CHECK-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
155151
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -141,14 +141,10 @@ define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) {
141141
; CHECK-NEXT: [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]])
142142
; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]])
143143
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
144-
; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]]
145-
; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]]
146-
; CHECK-NEXT: [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
147-
; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]]
148-
; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP8]], [[TMP14]]
149-
; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP15]]
150-
; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP12]]
151-
; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP13]]
144+
; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]]
145+
; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]]
146+
; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]]
147+
; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]]
152148
; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]]
153149
; CHECK-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
154150
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]

llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -697,10 +697,8 @@ define float @test_fmax_and_fmax(ptr %src.0, ptr %src.1, i64 %n) {
697697
; CHECK-NEXT: [[TMP2]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD]])
698698
; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD2]])
699699
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
700-
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
701-
; CHECK-NEXT: [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
702-
; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP4]], [[TMP7]]
703-
; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP6]]
700+
; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]]
701+
; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP4]]
704702
; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
705703
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
706704
; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP11]]

0 commit comments

Comments
 (0)