Skip to content

Commit c97c686

Browse files
authored
[VPlan] Allow folding not (cmp eq) -> icmp ne with other select users (#154497)
Currently we only allow folding not (cmp eq) -> icmp ne if the not is the only user of the compare. However, a common scenario is that one or more selects also use the compare as their condition. We can still fold the not in that case, provided we also swap the arms of those selects. This helps avoid regressions in #150368.
1 parent e6d095e commit c97c686

10 files changed

+171
-185
lines changed

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,13 +1106,29 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
11061106
return Def->replaceAllUsesWith(A);
11071107

11081108
// Try to fold Not into compares by adjusting the predicate in-place.
1109-
if (isa<VPWidenRecipe>(A) && A->getNumUsers() == 1) {
1110-
auto *WideCmp = cast<VPWidenRecipe>(A);
1111-
if (WideCmp->getOpcode() == Instruction::ICmp ||
1112-
WideCmp->getOpcode() == Instruction::FCmp) {
1109+
if (auto *WideCmp = dyn_cast<VPWidenRecipe>(A)) {
1110+
if ((WideCmp->getOpcode() == Instruction::ICmp ||
1111+
WideCmp->getOpcode() == Instruction::FCmp) &&
1112+
all_of(WideCmp->users(), [&WideCmp](VPUser *U) {
1113+
return match(U, m_CombineOr(m_Not(m_Specific(WideCmp)),
1114+
m_Select(m_Specific(WideCmp),
1115+
m_VPValue(), m_VPValue())));
1116+
})) {
11131117
WideCmp->setPredicate(
11141118
CmpInst::getInversePredicate(WideCmp->getPredicate()));
1115-
Def->replaceAllUsesWith(WideCmp);
1119+
for (VPUser *U : to_vector(WideCmp->users())) {
1120+
auto *R = cast<VPSingleDefRecipe>(U);
1121+
if (match(R, m_Select(m_Specific(WideCmp), m_VPValue(X),
1122+
m_VPValue(Y)))) {
1123+
// select (cmp pred), x, y -> select (cmp inv_pred), y, x
1124+
R->setOperand(1, Y);
1125+
R->setOperand(2, X);
1126+
} else {
1127+
// not (cmp pred) -> cmp inv_pred
1128+
assert(match(R, m_Not(m_Specific(WideCmp))) && "Unexpected user");
1129+
R->replaceAllUsesWith(WideCmp);
1130+
}
1131+
}
11161132
// If WideCmp doesn't have a debug location, use the one from the
11171133
// negation, to preserve the location.
11181134
if (!WideCmp->getDebugLoc() && R.getDebugLoc())

llvm/test/Transforms/LoopVectorize/X86/drop-poison-generating-flags.ll

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ define void @drop_scalar_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
2929
; CHECK: [[VECTOR_BODY]]:
3030
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[INPUT]], i64 -1
3131
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0:![0-9]+]]
32-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
32+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
3333
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4
3434
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
3535
; CHECK: [[MIDDLE_BLOCK]]:
@@ -71,7 +71,7 @@ define void @drop_scalar_gep_nusw(ptr noalias nocapture readonly %input, ptr %ou
7171
; CHECK: [[VECTOR_BODY]]:
7272
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[INPUT]], i64 -1
7373
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]]
74-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
74+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
7575
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4
7676
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
7777
; CHECK: [[MIDDLE_BLOCK]]:
@@ -113,7 +113,7 @@ define void @drop_scalar_gep_nuw(ptr noalias nocapture readonly %input, ptr %out
113113
; CHECK: [[VECTOR_BODY]]:
114114
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[INPUT]], i64 -1
115115
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]]
116-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
116+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
117117
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4
118118
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
119119
; CHECK: [[MIDDLE_BLOCK]]:
@@ -156,7 +156,7 @@ define void @drop_nonpred_scalar_nuw_nsw(ptr noalias nocapture readonly %input,
156156
; CHECK: [[VECTOR_BODY]]:
157157
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr [[INPUT]], i64 -1
158158
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]]
159-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
159+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
160160
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4
161161
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
162162
; CHECK: [[MIDDLE_BLOCK]]:
@@ -198,7 +198,7 @@ define void @preserve_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr
198198
; CHECK: [[VECTOR_BODY]]:
199199
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, ptr [[INPUT]], <4 x i64> <i64 -2, i64 0, i64 2, i64 4>
200200
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP0]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]]
201-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_GATHER]]
201+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_GATHER]], <4 x float> zeroinitializer
202202
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4
203203
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
204204
; CHECK: [[MIDDLE_BLOCK]]:
@@ -243,7 +243,7 @@ define void @drop_vector_nuw_nsw(ptr noalias nocapture readonly %input, ptr %out
243243
; CHECK-NEXT: store <4 x ptr> [[TMP3]], ptr [[PTRS]], align 8
244244
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP3]], i32 0
245245
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP6]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]]
246-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
246+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
247247
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[OUTPUT]], align 4
248248
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
249249
; CHECK: [[MIDDLE_BLOCK]]:
@@ -302,7 +302,7 @@ define void @drop_nonvector_nuw_nsw_avx1(ptr noalias nocapture readonly %input,
302302
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x ptr> [[TMP16]], ptr [[TMP13]], i32 3
303303
; CHECK-NEXT: store <4 x ptr> [[TMP17]], ptr [[TMP5]], align 8
304304
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[TMP10]], i32 4, <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> poison), !invariant.load [[META0]]
305-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> zeroinitializer, <4 x float> [[WIDE_MASKED_LOAD]]
305+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
306306
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[OUTPUT]], i64 0
307307
; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP21]], align 4
308308
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
@@ -619,8 +619,7 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
619619
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
620620
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
621621
; CHECK: [[VECTOR_BODY]]:
622-
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
623-
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
622+
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
624623
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
625624
; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
626625
; CHECK: [[PRED_LOAD_IF]]:
@@ -660,8 +659,8 @@ define void @pr70590_recipe_without_underlying_instr(i64 %n, ptr noalias %dst) {
660659
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i8> [[TMP22]], i8 [[TMP27]], i32 3
661660
; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]]
662661
; CHECK: [[PRED_LOAD_CONTINUE6]]:
663-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = phi <4 x i8> [ [[TMP22]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP28]], %[[PRED_LOAD_IF5]] ]
664-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i8> zeroinitializer, <4 x i8> [[WIDE_LOAD]]
662+
; CHECK-NEXT: [[TMP30:%.*]] = phi <4 x i8> [ [[TMP22]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP28]], %[[PRED_LOAD_IF5]] ]
663+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[TMP30]], <4 x i8> zeroinitializer
665664
; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[DST]], align 4
666665
; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]]
667666
; CHECK: [[MIDDLE_BLOCK]]:
@@ -706,8 +705,7 @@ define void @recipe_without_underlying_instr_lanes_used(i64 %n, ptr noalias %dst
706705
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
707706
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
708707
; CHECK: [[VECTOR_BODY]]:
709-
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
710-
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
708+
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i64> <i64 0, i64 1, i64 2, i64 3>, [[BROADCAST_SPLAT]]
711709
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
712710
; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
713711
; CHECK: [[PRED_LOAD_IF]]:
@@ -747,9 +745,9 @@ define void @recipe_without_underlying_instr_lanes_used(i64 %n, ptr noalias %dst
747745
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i8> [[TMP22]], i8 [[TMP27]], i32 3
748746
; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]]
749747
; CHECK: [[PRED_LOAD_CONTINUE6]]:
750-
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = phi <4 x i8> [ [[TMP22]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP28]], %[[PRED_LOAD_IF5]] ]
751-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i8> zeroinitializer, <4 x i8> [[WIDE_LOAD]]
752-
; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP0]], <4 x i64> zeroinitializer, <4 x i64> poison
748+
; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i8> [ [[TMP22]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP28]], %[[PRED_LOAD_IF5]] ]
749+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[TMP26]], <4 x i8> zeroinitializer
750+
; CHECK-NEXT: [[PREDPHI7:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> poison, <4 x i64> zeroinitializer
753751
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[PREDPHI7]], i32 3
754752
; CHECK-NEXT: store i64 [[TMP12]], ptr [[AUX]], align 8
755753
; CHECK-NEXT: store <4 x i8> [[PREDPHI]], ptr [[DST]], align 4

llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) {
1717
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
1818
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0
1919
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x ptr> [[TMP16]], ptr [[TMP2]], i32 1
20-
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i64> [[VEC_IND]], zeroinitializer
21-
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true)
20+
; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i64> [[VEC_IND]], zeroinitializer
2221
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
2322
; CHECK-NEXT: br i1 [[TMP5]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]]
2423
; CHECK: [[PRED_LOAD_IF]]:
@@ -34,8 +33,8 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) {
3433
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP11]], i32 1
3534
; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE2]]
3635
; CHECK: [[PRED_LOAD_CONTINUE2]]:
37-
; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ]
38-
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x i32> zeroinitializer, <2 x i32> [[TMP13]]
36+
; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], %[[PRED_LOAD_IF1]] ]
37+
; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP15]], <2 x i32> zeroinitializer
3938
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]]
4039
; CHECK-NEXT: store <2 x i32> [[PREDPHI]], ptr [[TMP14]], align 4
4140
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2

0 commit comments

Comments
 (0)