[InstCombine] Lower flag check pattern to use a bitmask-shift #169557
Conversation
The following C code:
bool or_icmp(int type) {
return type == 0 || type == 6 || type == 15;
}
Currently lowers to:
define i1 @or_icmp(i32 signext %type) {
entry:
%cmp = icmp eq i32 %type, 6
%cmp1 = icmp eq i32 %type, 0
%or.cond = or i1 %cmp, %cmp1
%cmp3 = icmp eq i32 %type, 15
%or.cond1 = or i1 %cmp3, %or.cond
ret i1 %or.cond1
}
But more optimally lowers to:
define i1 @or_icmp(i32 signext %type) {
entry:
%srl = lshr i32 32833, %type
%srl.1 = trunc i32 %srl to i1
%cmp = icmp ult i32 %type, 64
%and = and i1 %srl.1, %cmp
ret i1 %and
}
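(For reference, 32833 = 1 << 0 | 1 << 6 | 1 << 15 = 1 + 64 + 32768: each accepted value becomes one set bit in the mask.) A minimal C++ sketch of the same idea, purely illustrative — the function name is made up, and the guard here is 32 because the sketch shifts a 32-bit mask, whereas the IR above guards against 64 since the backend operates at XLen:

#include <cstdint>

// Membership test for {0, 6, 15} via a shifted bitmask.
bool or_icmp_mask(uint32_t type) {
  constexpr uint32_t Mask = (1u << 0) | (1u << 6) | (1u << 15); // 32833
  // Short-circuit keeps the shift amount in range; shifting a 32-bit
  // value by 32 or more would be undefined behavior in C++.
  return type < 32 && ((Mask >> type) & 1u);
}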
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-risc-v
Author: Ryan Buchner (bababuck)
Changes: as in the PR description above.
Full diff: https://github.com/llvm/llvm-project/pull/169557.diff
2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3b250d7d9ad1f..50076dbb4555e 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16736,6 +16736,52 @@ static SDValue combineOrAndToBitfieldInsert(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(RISCVISD::QC_INSB, DL, MVT::i32, Ops);
}
+// or (icmp eq x, imm0), (icmp eq x, imm1) -> czero.eqz (sltiu x, XLen),
+// (bext x, 1 << imm0 | 1 << imm1), provided imm0, imm1 < XLen.
+static SDValue combineOrOfImmCmpToBitExtract(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ using namespace SDPatternMatch;
+
+ auto CollectSetEqImmTree = [](auto &&Self, SmallVector<APInt, 4> &FlagVals,
+ SDNode *N, SDValue &X) -> bool {
+ APInt Imm;
+ if (X ? sd_match(N, m_OneUse(m_SetCC(m_Specific(X), m_ConstInt(Imm),
+ m_SpecificCondCode(ISD::SETEQ))))
+ : sd_match(N, m_OneUse(m_SetCC(m_Value(X), m_ConstInt(Imm),
+ m_SpecificCondCode(ISD::SETEQ))))) {
+ FlagVals.push_back(Imm);
+ return true;
+ }
+ SDValue LHS, RHS;
+ if (sd_match(N, m_OneUse(m_Or(m_Value(LHS), m_Value(RHS))))) {
+ return Self(Self, FlagVals, LHS.getNode(), X) &&
+ Self(Self, FlagVals, RHS.getNode(), X);
+ }
+ return false;
+ };
+
+ SmallVector<APInt, 4> FlagVals;
+ SDValue X;
+ if (!CollectSetEqImmTree(CollectSetEqImmTree, FlagVals, N, X))
+ return SDValue();
+
+ unsigned XLen = Subtarget.getXLen();
+ uint64_t BitMask = 0;
+ for (auto &Imm : FlagVals) {
+ if (Imm.uge(XLen))
+ return SDValue();
+ BitMask |= ((uint64_t)1 << Imm.getZExtValue());
+ }
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue BitExtract =
+ DAG.getNode(ISD::SRL, DL, VT, DAG.getConstant(BitMask, DL, VT), X);
+ SDValue Lt64Check =
+ DAG.getSetCC(DL, VT, X, DAG.getConstant(XLen, DL, VT), ISD::SETULT);
+ return DAG.getNode(ISD::AND, DL, VT, Lt64Check, BitExtract);
+}
+
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const RISCVSubtarget &Subtarget) {
SelectionDAG &DAG = DCI.DAG;
@@ -16748,6 +16794,9 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return V;
if (SDValue V = combineBinOpOfExtractToReduceTree(N, DAG, Subtarget))
return V;
+ if (DCI.isAfterLegalizeDAG())
+ if (SDValue V = combineOrOfImmCmpToBitExtract(N, DAG, Subtarget))
+ return V;
if (DCI.isAfterLegalizeDAG())
if (SDValue V = combineDeMorganOfBoolean(N, DAG))
diff --git a/llvm/test/CodeGen/RISCV/flag_check.ll b/llvm/test/CodeGen/RISCV/flag_check.ll
new file mode 100644
index 0000000000000..86049bf53379c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/flag_check.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define i1 @or_icmp_2(i32 signext %type) {
+; RV32-LABEL: or_icmp_2:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: li a1, 65
+; RV32-NEXT: srl a1, a1, a0
+; RV32-NEXT: sltiu a0, a0, 32
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: or_icmp_2:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 65
+; RV64-NEXT: srl a1, a1, a0
+; RV64-NEXT: sltiu a0, a0, 64
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp eq i32 %type, 0
+ %or.cond = or i1 %cmp, %cmp1
+ ret i1 %or.cond
+}
+
+define i1 @or_icmp_3(i32 signext %type) {
+; RV32-LABEL: or_icmp_3:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: lui a1, 8
+; RV32-NEXT: addi a1, a1, 65
+; RV32-NEXT: srl a1, a1, a0
+; RV32-NEXT: sltiu a0, a0, 32
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: or_icmp_3:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: addi a1, a1, 65
+; RV64-NEXT: srl a1, a1, a0
+; RV64-NEXT: sltiu a0, a0, 64
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp eq i32 %type, 0
+ %or.cond = or i1 %cmp, %cmp1
+ %cmp3 = icmp eq i32 %type, 15
+ %or.cond1 = or i1 %cmp3, %or.cond
+ ret i1 %or.cond1
+}
+
+define i1 @or_icmp_4_tree(i32 signext %type) {
+; RV32-LABEL: or_icmp_4_tree:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: lui a1, 1032
+; RV32-NEXT: addi a1, a1, 65
+; RV32-NEXT: srl a1, a1, a0
+; RV32-NEXT: sltiu a0, a0, 32
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: or_icmp_4_tree:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lui a1, 1032
+; RV64-NEXT: addi a1, a1, 65
+; RV64-NEXT: srl a1, a1, a0
+; RV64-NEXT: sltiu a0, a0, 64
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp eq i32 %type, 0
+ %or.cond = or i1 %cmp, %cmp1
+ %cmp2 = icmp eq i32 %type, 15
+ %cmp3 = icmp eq i32 %type, 22
+ %or.cond1 = or i1 %cmp2, %cmp3
+ %or.cond2 = or i1 %or.cond1, %or.cond
+ ret i1 %or.cond2
+}
+
+define i1 @or_icmp_7(i32 signext %type) {
+; RV32-LABEL: or_icmp_7:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: lui a1, 589860
+; RV32-NEXT: addi a1, a1, 73
+; RV32-NEXT: srl a1, a1, a0
+; RV32-NEXT: sltiu a0, a0, 32
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: or_icmp_7:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lui a1, 147465
+; RV64-NEXT: slli a1, a1, 2
+; RV64-NEXT: addi a1, a1, 73
+; RV64-NEXT: srl a1, a1, a0
+; RV64-NEXT: sltiu a0, a0, 64
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp eq i32 %type, 0
+ %or.cond = or i1 %cmp, %cmp1
+ %cmp2 = icmp eq i32 %type, 17
+ %or.cond1 = or i1 %cmp2, %or.cond
+ %cmp3 = icmp eq i32 %type, 3
+ %or.cond2 = or i1 %cmp3, %or.cond1
+ %cmp4 = icmp eq i32 %type, 31
+ %or.cond3 = or i1 %cmp4, %or.cond2
+ %cmp5 = icmp eq i32 %type, 14
+ %or.cond4 = or i1 %cmp5, %or.cond3
+ %cmp6 = icmp eq i32 %type, 28
+ %or.cond5 = or i1 %cmp6, %or.cond4
+ ret i1 %or.cond5
+}
+
+define i1 @or_icmp_gte_64(i32 signext %type) {
+; CHECK-LABEL: or_icmp_gte_64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi a1, a0, -6
+; CHECK-NEXT: addi a0, a0, -64
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp eq i32 %type, 64
+ %or.cond = or i1 %cmp, %cmp1
+ ret i1 %or.cond
+}
+
+define i1 @or_icmp_multiple_uses(i32 signext %type) {
+; CHECK-LABEL: or_icmp_multiple_uses:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi a1, a0, -6
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: xor a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp eq i32 %type, 0
+ %or.cond = or i1 %cmp, %cmp1
+ %or.cond1 = xor i1 %cmp, %or.cond
+ ret i1 %or.cond1
+}
+
+
+define i1 @or_icmp_not_eq(i32 signext %type) {
+; CHECK-LABEL: or_icmp_not_eq:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi a1, a0, -6
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: snez a0, a0
+; CHECK-NEXT: or a0, a1, a0
+; CHECK-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp ugt i32 %type, 0
+ %or.cond = or i1 %cmp, %cmp1
+ ret i1 %or.cond
+}
+
+define i1 @or_icmp_xlen(i32 signext %type) {
+; RV32-LABEL: or_icmp_xlen:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi a1, a0, -6
+; RV32-NEXT: addi a0, a0, -32
+; RV32-NEXT: seqz a1, a1
+; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: or a0, a1, a0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: or_icmp_xlen:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: li a1, 1
+; RV64-NEXT: slli a1, a1, 32
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: srl a1, a1, a0
+; RV64-NEXT: sltiu a0, a0, 64
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp eq i32 %type, 32
+ %or.cond = or i1 %cmp, %cmp1
+ ret i1 %or.cond
+}
+
+define i1 @or_icmp_i64(i64 signext %type) {
+; RV32-LABEL: or_icmp_i64:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: xori a2, a0, 6
+; RV32-NEXT: or a3, a0, a1
+; RV32-NEXT: xori a0, a0, 15
+; RV32-NEXT: or a2, a2, a1
+; RV32-NEXT: seqz a3, a3
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: seqz a1, a2
+; RV32-NEXT: or a1, a1, a3
+; RV32-NEXT: seqz a0, a0
+; RV32-NEXT: or a0, a0, a1
+; RV32-NEXT: ret
+;
+; RV64-LABEL: or_icmp_i64:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: addi a1, a1, 65
+; RV64-NEXT: srl a1, a1, a0
+; RV64-NEXT: sltiu a0, a0, 64
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: ret
+entry:
+ %cmp = icmp eq i64 %type, 6
+ %cmp1 = icmp eq i64 %type, 0
+ %or.cond = or i1 %cmp, %cmp1
+ %cmp3 = icmp eq i64 %type, 15
+ %or.cond1 = or i1 %cmp3, %or.cond
+ ret i1 %or.cond1
+}
+
+define i1 @or_icmp_specific(i32 signext %type, i32 signext %type1) {
+; CHECK-LABEL: or_icmp_specific:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi a0, a0, -6
+; CHECK-NEXT: addi a1, a1, -32
+; CHECK-NEXT: seqz a0, a0
+; CHECK-NEXT: seqz a1, a1
+; CHECK-NEXT: or a0, a0, a1
+; CHECK-NEXT: ret
+entry:
+ %cmp = icmp eq i32 %type, 6
+ %cmp1 = icmp eq i32 %type1, 32
+ %or.cond = or i1 %cmp, %cmp1
+ ret i1 %or.cond
+}
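A note on the structure of the combine above: the recursive lambda CollectSetEqImmTree walks a tree of one-use ors, collecting the immediate from each eq-compare against a common X. The m_OneUse guards make it bail out when an intermediate value has another user (covered by or_icmp_multiple_uses), and any immediate >= XLen aborts the transform (covered by or_icmp_gte_64, and by or_icmp_xlen on RV32, where XLen is 32).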
Will move to InstCombine
The following C code:
bool or_icmp(int type) {
return type == 0 || type == 6 || type == 15;
}
Currently lowers to:
define i1 @or_icmp(i32 signext %type) {
entry:
%cmp = icmp eq i32 %type, 6
%cmp1 = icmp eq i32 %type, 0
%or.cond = or i1 %cmp, %cmp1
%cmp3 = icmp eq i32 %type, 15
%or.cond1 = or i1 %cmp3, %or.cond
ret i1 %or.cond1
}
But more optimally lowers to:
define i1 @or_icmp(i32 signext %type) {
entry:
%srl = lshr i32 32833, %type
%srl.1 = trunc i32 %srl to i1
%cmp = icmp ult i32 %type, 64
%and = select i1 %cmp, i1 %srl.1, i1 false
ret i1 %and
}
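Note the select in this updated IR: an and i1 of the guard with the shifted bit (as in the earlier version of this description) would propagate poison when %type is out of range, since lshr by an oversized shift amount yields poison, whereas select i1 %cmp, i1 %srl.1, i1 false discards the poison value whenever the range check fails — presumably why the description switched from and to select.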
✅ With the latest revision this PR passed the undef deprecator.
    if (!validImm(LHSAP) || !validImm(RHSAP))
      return nullptr;
    LHSAP = LHSAP.zextOrTrunc(XLen);
    RHSAP = RHSAP.zextOrTrunc(XLen);
I don't know if this is the best way to wrangle the various APInts, which may have differing bit widths: effectively I ensure each value can be represented within XLen and then force them all to a bit width of XLen.
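A minimal sketch of the idiom being described, using the llvm::APInt API (the helper name is illustrative, not from the patch):

#include "llvm/ADT/APInt.h"

// Reject immediates that need more than XLen bits, then normalize the
// rest to a common XLen-bit width so they can be compared and combined.
static bool normalizeToXLen(llvm::APInt &Imm, unsigned XLen) {
  if (Imm.getActiveBits() > XLen)
    return false;              // value not representable in XLen bits
  Imm = Imm.zextOrTrunc(XLen); // only known-zero high bits are dropped
  return true;
}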
    return Imm && (*Imm < XLen);
  };

  // Match (or (icmp eq X, Imm0), (icmp eq X, Imm1))
The transform with just two options is roughly net neutral (at least for the RISC-V backend); the benefit comes once we can combine three or more comparisons. From a code standpoint, though, the implementation seemed cleaner operating on a single or at a time.
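For a concrete comparison drawn from the tests above: or_icmp_2 (two compares) lowers to li + srl + sltiu + and, four instructions, versus the five-instruction addi + seqz + addi + seqz + or chain it replaces (see or_icmp_gte_64); or_icmp_7 (seven compares) still needs only five or six instructions in the mask form, while the compare chain grows by roughly three instructions per additional comparison.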
Does this transform already exist somewhere in LLVM to some degree? Just checking in godbolt: Clang trunk already seems to do a very similar transform on the C snippet in the PR description: https://compiler-explorer.com/z/qTbzc9e33
See SimplifyCFGOpt::simplifyBranchOnICmpChain and simplifySwitchLookup |
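For context, that SimplifyCFG path first rewrites a chain of equality compares against constants into a switch, which switch lowering can later turn into a bit test. An illustrative sketch of the shape it recognizes (not the exact pass output):

bool or_icmp(int type) {
  switch (type) { // the compare chain collapses into one switch
  case 0:
  case 6:
  case 15:
    return true;
  default:
    return false;
  }
}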
Alive Proofs for the test cases:
or_icmp_2: https://alive2.llvm.org/ce/z/9W4PZJ
or_icmp_3: https://alive2.llvm.org/ce/z/zrVHSh
or_icmp_7: https://alive2.llvm.org/ce/z/yBoMer
or_icmp_i64: https://alive2.llvm.org/ce/z/Jas5gE
or_icmp_expand: https://alive2.llvm.org/ce/z/waU32g
or_icmp_expand_trunc_type_shr: https://alive2.llvm.org/ce/z/Catc9V
or_icmp_expand_zext_cmp: https://alive2.llvm.org/ce/z/a7-d3B
or_icmp_i128: https://alive2.llvm.org/ce/z/ng6WJR
or_imcp_expand_128: https://alive2.llvm.org/ce/z/vy39kW