
Commit 4655edd

TerryGuo authored and lialan committed
[EraVM] Optimize overflow intrinsic which has a branching use
For every branch that uses the overflow i1 output of uaddo, usubo, or umulo, the FBB is moved next to the branch in layout and the TBB is moved out of the way into the cold section. The code generator can then use `jump.lt` to jump to the TBB and reach the FBB without a jump; we expect overflow to be the less probable scenario in typical usage. To implement this, we lower the `BRCOND` node to a conditional jump and glue it to the flag-setting add, sub, or mul.

Co-authored-by: Alan Li <[email protected]>

PR: #581, Issue: #491
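To illustrate (a hypothetical example, not taken from the commit), the targeted pattern looks like this in IR, with the overflow successor as TBB and the non-overflow successor as FBB:

    %res = call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %a, i256 %b)
    %sum = extractvalue { i256, i1 } %res, 0
    %of  = extractvalue { i256, i1 } %res, 1
    br i1 %of, label %overflow, label %normal   ; TBB = %overflow, FBB = %normal

After this change the backend can emit the flag-setting arithmetic instruction glued to a single conditional jump and fall through to the non-overflow block. The operand syntax below is illustrative only; the `ADD!` and `jump.of` forms are the ones the diff's own comments name:

    add!    r1, r2, r1     ; flag-setting add (UADDO)
    jump.of @overflow      ; taken only on overflow
    ; fall through to %normal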

File tree

6 files changed (+226, -103 lines)


llvm/lib/Target/EraVM/EraVMISD.def

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ HANDLE_NODETYPE(DELEGATECALL_LONG)
 HANDLE_NODETYPE(MIMICCALL)
 HANDLE_NODETYPE(INVOKE)
 HANDLE_NODETYPE(BR_CC)
+HANDLE_NODETYPE(BRCOND)
 HANDLE_NODETYPE(SELECT_CC)
 HANDLE_NODETYPE(CMP)
 HANDLE_NODETYPE(THROW)

llvm/lib/Target/EraVM/EraVMISelLowering.cpp

Lines changed: 47 additions & 2 deletions
@@ -114,7 +114,6 @@ EraVMTargetLowering::EraVMTargetLowering(const TargetMachine &TM,
   setOperationAction(
       {
          ISD::BRIND,
-         ISD::BRCOND,
          ISD::VASTART,
          ISD::VAARG,
          ISD::VAEND,
@@ -150,7 +149,7 @@ EraVMTargetLowering::EraVMTargetLowering(const TargetMachine &TM,
 
   setOperationAction({ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN,
                       ISD::INTRINSIC_W_CHAIN, ISD::STACKSAVE, ISD::STACKRESTORE,
-                      ISD::TRAP, ISD::BR_JT},
+                      ISD::TRAP, ISD::BR_JT, ISD::BRCOND},
                      MVT::Other, Custom);
 
   for (MVT VT : {MVT::i1, MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::i128}) {
@@ -683,6 +682,7 @@ SDValue EraVMTargetLowering::LowerOperation(SDValue Op,
   case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
   case ISD::BR_JT: return LowerBR_JT(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SRA: return LowerSRA(Op, DAG);
@@ -1105,6 +1105,51 @@ SDValue EraVMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(EraVMISD::SELECT_CC, DL, VTs, Ops);
 }
 
+/// Lower u{add|sub|mul}.with.overflow feeding into a branch
+/// into {ADD|SUB|MUL}! and jump.of or jump.lt.
+SDValue EraVMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond = Op.getOperand(1);
+  SDValue Dest = Op.getOperand(2);
+  SDLoc DL(Op);
+  SDValue MatchedUArithO = matchingOverflowArithmeticOperation(Cond);
+  if (!MatchedUArithO)
+    return SDValue();
+
+  // Create a new node to set the flag for the BRCOND node and glue them
+  // together. We keep the original node by not calling
+  // DAG.ReplaceAllUsesOfValueWith, in order to support cases like:
+  //
+  //   result, of = uaddo a, b
+  //   another_flag = cmp x, y
+  //   c = another_flag ? result : d
+  //   brcond of, L1, L2
+  //   ...
+  //   ... some inst using c
+  //
+  // In the end we have the original node for producing the result and the
+  // new one for setting the flag. If the result of the original node is
+  // unused, it will be optimized out.
+  auto OPC = MatchedUArithO.getOpcode();
+  auto LoweredOPC = OpcodeMap.at(OPC);
+  SDValue FoldedArith = DAG.getNode(
+      LoweredOPC, DL, {MVT::i256, MVT::Other, MVT::Glue},
+      {Chain, MatchedUArithO.getOperand(0), MatchedUArithO.getOperand(1)});
+
+  // For overflow produced by USUBO it is safe to use COND_LT, with COND_GE
+  // as its reversal code; for UADDO and UMULO we must use COND_OF because
+  // COND_GE cannot serve as the reversal code there. Using COND_LT in the
+  // USUBO case allows later passes to optimize by inverting the condition
+  // code, while COND_OF in the UADDO and UMULO cases prevents such
+  // inversion and thus ensures correctness.
+  auto CCVal = (OPC == ISD::USUBO) ? EraVMCC::COND_LT : EraVMCC::COND_OF;
+  auto OFCC = DAG.getConstant(CCVal, DL, MVT::i256);
+  return DAG.getNode(EraVMISD::BRCOND, DL, Op.getValueType(),
+                     FoldedArith.getValue(1), Dest, OFCC,
+                     FoldedArith.getValue(2));
+}
+
 SDValue EraVMTargetLowering::LowerSELECT_CC(SDValue Op,
                                             SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
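The case that the in-code comment guards against can be sketched at the IR level (a hypothetical example, assuming i256 operands): the i1 output drives the branch while the i256 result stays live on another path, so the original value-producing node must survive alongside the new flag-setting node and is only cleaned up when its result is dead.

    %p   = call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %a, i256 %b)
    %sum = extractvalue { i256, i1 } %p, 0      ; value result, still used below
    %of  = extractvalue { i256, i1 } %p, 1      ; flag result, drives the branch
    %cmp = icmp ult i256 %x, %y
    %c   = select i1 %cmp, i256 %sum, i256 %d   ; keeps %sum live past the branch
    br i1 %of, label %L1, label %L2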

llvm/lib/Target/EraVM/EraVMISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ class EraVMTargetLowering : public TargetLowering {
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/EraVM/EraVMInstrInfo.td

Lines changed: 7 additions & 0 deletions
@@ -28,6 +28,8 @@ def SDT_EraVMAddToSP : SDTypeProfile<0, 1, [SDTCisVT<0, i256>]>;
 def SDT_EraVMGetSP : SDTypeProfile<1, 0, []>;
 def SDT_EraVMBrCC : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
                                          SDTCisVT<1, i256>]>;
+def SDT_EraVMBrCOND : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+                                           SDTCisVT<1, i256>]>;
 def SDT_EraVMSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
                                              SDTCisSameAs<1, 2>,
                                              SDTCisVT<3, i256>]>;
@@ -62,6 +64,8 @@ def EraVMmimiccall: SDNode<"EraVMISD::MIMICCALL", SDT_EraVMFarCall,
                           [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
 def EraVMbrcc : SDNode<"EraVMISD::BR_CC", SDT_EraVMBrCC,
                        [SDNPHasChain, SDNPInGlue]>;
+def EraVMbrcond : SDNode<"EraVMISD::BRCOND", SDT_EraVMBrCOND,
+                         [SDNPHasChain, SDNPInGlue]>;
 def EraVMselectcc: SDNode<"EraVMISD::SELECT_CC", SDT_EraVMSelectCC,
                           [SDNPInGlue]>;
 def EraVMcmp : SDNode<"EraVMISD::CMP", SDT_EraVMCmp, [SDNPOutGlue]>;
@@ -1220,6 +1224,9 @@ def : Pat<(EraVMTrap), (PANIC 0)>;
 def : Pat<(EraVMselectcc GRPTR:$rs0, GRPTR:$rs1, imm:$cc),
           (FATPTR_SELrrr GRPTR:$rs0, GRPTR:$rs1, imm:$cc)>;
 
+// For lowering BRCOND
+def : Pat<(EraVMbrcond bb:$unwind, imm:$cc), (JCl bb:$unwind, imm:$cc)>;
+
 // ============================ Overflow ADDs =====================================
 // to register
 def : Pat<(EraVMAdd_v GR256:$rs0, GR256:$rs1),

llvm/lib/Target/EraVM/EraVMPostCodegenPrepare.cpp

Lines changed: 106 additions & 0 deletions
@@ -18,6 +18,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "EraVM.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 
@@ -36,6 +37,21 @@ struct EraVMPostCodegenPrepare : public FunctionPass {
   }
   bool runOnFunction(Function &F) override;
 
+  // This function is an optimization for overflow arithmetic intrinsics.
+  // For every branch that uses the overflow i1 output of such an
+  // intrinsic, it does two things:
+  // 1. Makes sure FBB is adjacent to the branch in layout. This is
+  //    required so that ISEL does not flip the branch condition by
+  //    XORing the result, which it does to create a fallthrough
+  //    optimization opportunity for the MachineBlockPlacement pass.
+  //    In ISEL we match a specific pattern to custom lower the overflow
+  //    handling, so we do not want ISEL doing extra work for us.
+  // 2. Moves TBB out of the way into the cold section. This is needed
+  //    to achieve a good code sequence on the non-overflow path, and is
+  //    done by giving TBB a minimal probability so that the
+  //    MachineBlockPlacement pass rearranges it into the cold section.
+  bool rearrangeOverflowHandlingBranches(Function &F);
+
   StringRef getPassName() const override {
     return ERAVM_POST_CODEGEN_PREPARE_NAME;
   }
@@ -120,6 +136,96 @@ bool EraVMPostCodegenPrepare::runOnFunction(Function &F) {
     if (auto *Cmp = dyn_cast<ICmpInst>(&I))
       Changed |= optimizeICmp(*Cmp);
 
+  Changed |= rearrangeOverflowHandlingBranches(F);
+  return Changed;
+}
+
+static bool isUnsignedArithmeticOverflowInstruction(Instruction &I) {
+  auto *Call = dyn_cast<CallInst>(&I);
+  if (!Call)
+    return false;
+  Intrinsic::ID IntID = Call->getIntrinsicID();
+  if (IntID != Intrinsic::uadd_with_overflow &&
+      IntID != Intrinsic::usub_with_overflow &&
+      IntID != Intrinsic::umul_with_overflow) {
+    return false;
+  }
+  return true;
+}
+
+bool EraVMPostCodegenPrepare::rearrangeOverflowHandlingBranches(Function &F) {
+  bool Changed = false;
+  // Iterate over all basic blocks:
+  auto BBI = F.begin();
+  auto BBE = F.end();
+  while (BBI != BBE) {
+    auto *BB = &*BBI;
+    BBI = std::next(BBI);
+    for (auto &I : *BB) {
+      if (!isUnsignedArithmeticOverflowInstruction(I))
+        continue;
+
+      // We have found an overflow intrinsic; now locate the overflow
+      // branch. We are going to match a structure like this:
+      //   %5 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %4, i32 1)
+      //   %7 = extractvalue { i32, i1 } %5, 1
+      //   br i1 %7, label %8, label %10
+      auto *Call = dyn_cast<CallInst>(&I);
+      for (User *U : Call->users()) {
+        // Check extractvalue: there must be at least one use which is an
+        // extractvalue with index 1.
+        auto *ExtractValue = dyn_cast<ExtractValueInst>(U);
+        if (!ExtractValue ||
+            (!ExtractValue->hasIndices() || ExtractValue->getIndices()[0] != 1))
+          continue;
+
+        // Check that the extracted value is used by a conditional branch
+        // in the same basic block:
+        auto it = llvm::find_if(ExtractValue->users(), [&](User *U) {
+          auto *IteratingBranch = dyn_cast<BranchInst>(U);
+          if (IteratingBranch && IteratingBranch->getParent() == BB &&
+              IteratingBranch->isConditional())
+            return true;
+          return false;
+        });
+        if (it == ExtractValue->user_end())
+          continue;
+
+        // We have found a conditional branch that uses the result of the
+        // extractvalue.
+        auto *Branch = cast<BranchInst>(*it);
+        BasicBlock *TBB = Branch->getSuccessor(0);
+        BasicBlock *FBB = Branch->getSuccessor(1);
+
+        // Now we have the conversion candidate and its TBB and FBB.
+        // Ensure that FBB is next to the current BB in layout. This
+        // creates an opportunity for MachineBlockPlacement to fall
+        // through to FBB, and is necessary for the desired code sequence.
+        FBB->moveAfter(BB);
+
+        // Also give the branch to TBB a very low weight, so that it can
+        // be moved to the cold section: set the first successor's branch
+        // weight to minimal and the second successor's to maximal.
+        LLVMContext &Ctx = TBB->getContext();
+        llvm::MDString *mdName = llvm::MDString::get(Ctx, "branch_weights");
+        llvm::MDTuple *ColdWeights = llvm::MDTuple::get(
+            Ctx, {mdName,
+                  llvm::ConstantAsMetadata::get(
+                      llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1)),
+                  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                      llvm::Type::getInt32Ty(Ctx), UINT32_MAX))});
+        Branch->setMetadata(llvm::LLVMContext::MD_prof, ColdWeights);
+
+        Changed = true;
+      }
+    }
+  }
   return Changed;
 }
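For reference (a minimal sketch derived from the code above, not part of the commit), the constructed metadata is equivalent to annotating the branch as follows in IR, where the first weight belongs to TBB and the second to FBB; 4294967295 is UINT32_MAX:

    br i1 %of, label %overflow, label %normal, !prof !0
    ...
    !0 = !{!"branch_weights", i32 1, i32 4294967295}

The same tuple could presumably also be produced with llvm::MDBuilder::createBranchWeights(1, UINT32_MAX); the pass builds it by hand here.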
