
Commit 4655edd

TerryGuo authored and lialan committed
[EraVM] Optimize overflow intrinsic which has a branching use
For every branch that uses the overflow i1 output of uaddo, usubo, or umulo, the FBB is moved next to the branch in layout and the TBB is moved out of the way into the cold section. The code generator can then use `jump.lt` to jump to the TBB and reach the FBB without a jump; we expect overflow to be the less probable scenario in typical usage. To implement this, we lower the `BRCOND` node to a conditional jump and glue it to the flag-setting add, sub, or mul.

Co-authored-by: Alan Li <[email protected]>

PR: #581, Issue: #491
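To illustrate (a hypothetical example, not taken from the commit), the targeted pattern looks like this in IR, with the overflow successor as TBB and the non-overflow successor as FBB:

    %res = call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %a, i256 %b)
    %sum = extractvalue { i256, i1 } %res, 0
    %of  = extractvalue { i256, i1 } %res, 1
    br i1 %of, label %overflow, label %normal   ; TBB = %overflow, FBB = %normal

After this change the backend can emit the flag-setting arithmetic instruction glued to a single conditional jump and fall through to the non-overflow block. The operand syntax below is illustrative only; the `ADD!` and `jump.of` forms are the ones the diff's own comments name:

    add!    r1, r2, r1     ; flag-setting add (UADDO)
    jump.of @overflow      ; taken only on overflow
    ; fall through to %normal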

File tree

6 files changed (+226, -103 lines)


llvm/lib/Target/EraVM/EraVMISD.def

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ HANDLE_NODETYPE(DELEGATECALL_LONG)
 HANDLE_NODETYPE(MIMICCALL)
 HANDLE_NODETYPE(INVOKE)
 HANDLE_NODETYPE(BR_CC)
+HANDLE_NODETYPE(BRCOND)
 HANDLE_NODETYPE(SELECT_CC)
 HANDLE_NODETYPE(CMP)
 HANDLE_NODETYPE(THROW)

llvm/lib/Target/EraVM/EraVMISelLowering.cpp

Lines changed: 47 additions & 2 deletions
@@ -114,7 +114,6 @@ EraVMTargetLowering::EraVMTargetLowering(const TargetMachine &TM,
   setOperationAction(
       {
          ISD::BRIND,
-         ISD::BRCOND,
          ISD::VASTART,
          ISD::VAARG,
          ISD::VAEND,
@@ -150,7 +149,7 @@ EraVMTargetLowering::EraVMTargetLowering(const TargetMachine &TM,
 
   setOperationAction({ISD::INTRINSIC_VOID, ISD::INTRINSIC_WO_CHAIN,
                       ISD::INTRINSIC_W_CHAIN, ISD::STACKSAVE, ISD::STACKRESTORE,
-                      ISD::TRAP, ISD::BR_JT},
+                      ISD::TRAP, ISD::BR_JT, ISD::BRCOND},
                      MVT::Other, Custom);
 
   for (MVT VT : {MVT::i1, MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::i128}) {
@@ -683,6 +682,7 @@ SDValue EraVMTargetLowering::LowerOperation(SDValue Op,
   case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
   case ISD::BR_CC: return LowerBR_CC(Op, DAG);
   case ISD::BR_JT: return LowerBR_JT(Op, DAG);
+  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
   case ISD::SRA: return LowerSRA(Op, DAG);
@@ -1105,6 +1105,51 @@ SDValue EraVMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(EraVMISD::SELECT_CC, DL, VTs, Ops);
 }
 
+/// Lower u{add|sub|mul}.with.overflow feeding into a branch
+/// into {ADD|SUB|MUL}! and jump.of or jump.lt.
+SDValue EraVMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond = Op.getOperand(1);
+  SDValue Dest = Op.getOperand(2);
+  SDLoc DL(Op);
+  SDValue MatchedUArithO = matchingOverflowArithmeticOperation(Cond);
+  if (!MatchedUArithO)
+    return SDValue();
+
+  // Create a new node to set the flag for the BRCOND node and glue them
+  // together. We keep the original node by not calling
+  // DAG.ReplaceAllUsesOfValueWith, in order to support cases like:
+  //
+  //   result, of = uaddo a, b
+  //   another_flag = cmp x, y
+  //   c = another_flag ? result : d
+  //   brcond of, L1, L2
+  //   ...
+  //   ... some inst using c
+  //
+  // In the end we have the original node for producing the result and the
+  // new one for setting the flag. If the result of the original node is
+  // unused, it will be optimized out.
+  auto OPC = MatchedUArithO.getOpcode();
+  auto LoweredOPC = OpcodeMap.at(OPC);
+  SDValue FoldedArith = DAG.getNode(
+      LoweredOPC, DL, {MVT::i256, MVT::Other, MVT::Glue},
+      {Chain, MatchedUArithO.getOperand(0), MatchedUArithO.getOperand(1)});
+
+  // For overflow produced by USUBO it is safe to use COND_LT, with COND_GE
+  // as its reversal code; for UADDO and UMULO we must use COND_OF because
+  // COND_GE cannot serve as the reversal code there. Using COND_LT in the
+  // USUBO case allows later passes to optimize by inverting the condition
+  // code, while COND_OF in the UADDO and UMULO cases prevents such
+  // inversion and thus ensures correctness.
+  auto CCVal = (OPC == ISD::USUBO) ? EraVMCC::COND_LT : EraVMCC::COND_OF;
+  auto OFCC = DAG.getConstant(CCVal, DL, MVT::i256);
+  return DAG.getNode(EraVMISD::BRCOND, DL, Op.getValueType(),
+                     FoldedArith.getValue(1), Dest, OFCC,
+                     FoldedArith.getValue(2));
+}
+
 SDValue EraVMTargetLowering::LowerSELECT_CC(SDValue Op,
                                             SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(0);
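The case that the in-code comment guards against can be sketched at the IR level (a hypothetical example, assuming i256 operands): the i1 output drives the branch while the i256 result stays live on another path, so the original value-producing node must survive alongside the new flag-setting node and is only cleaned up when its result is dead.

    %p   = call { i256, i1 } @llvm.uadd.with.overflow.i256(i256 %a, i256 %b)
    %sum = extractvalue { i256, i1 } %p, 0      ; value result, still used below
    %of  = extractvalue { i256, i1 } %p, 1      ; flag result, drives the branch
    %cmp = icmp ult i256 %x, %y
    %c   = select i1 %cmp, i256 %sum, i256 %d   ; keeps %sum live past the branch
    br i1 %of, label %L1, label %L2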

llvm/lib/Target/EraVM/EraVMISelLowering.h

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ class EraVMTargetLowering : public TargetLowering {
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;

llvm/lib/Target/EraVM/EraVMInstrInfo.td

Lines changed: 7 additions & 0 deletions
@@ -28,6 +28,8 @@ def SDT_EraVMAddToSP : SDTypeProfile<0, 1, [SDTCisVT<0, i256>]>;
 def SDT_EraVMGetSP : SDTypeProfile<1, 0, []>;
 def SDT_EraVMBrCC : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
                                          SDTCisVT<1, i256>]>;
+def SDT_EraVMBrCOND : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+                                           SDTCisVT<1, i256>]>;
 def SDT_EraVMSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
                                              SDTCisSameAs<1, 2>,
                                              SDTCisVT<3, i256>]>;
@@ -62,6 +64,8 @@ def EraVMmimiccall: SDNode<"EraVMISD::MIMICCALL", SDT_EraVMFarCall,
                           [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
 def EraVMbrcc : SDNode<"EraVMISD::BR_CC", SDT_EraVMBrCC,
                        [SDNPHasChain, SDNPInGlue]>;
+def EraVMbrcond : SDNode<"EraVMISD::BRCOND", SDT_EraVMBrCOND,
+                         [SDNPHasChain, SDNPInGlue]>;
 def EraVMselectcc: SDNode<"EraVMISD::SELECT_CC", SDT_EraVMSelectCC,
                           [SDNPInGlue]>;
 def EraVMcmp : SDNode<"EraVMISD::CMP", SDT_EraVMCmp, [SDNPOutGlue]>;
@@ -1220,6 +1224,9 @@ def : Pat<(EraVMTrap), (PANIC 0)>;
 def : Pat<(EraVMselectcc GRPTR:$rs0, GRPTR:$rs1, imm:$cc),
           (FATPTR_SELrrr GRPTR:$rs0, GRPTR:$rs1, imm:$cc)>;
 
+// For lowering BRCOND
+def : Pat<(EraVMbrcond bb:$unwind, imm:$cc), (JCl bb:$unwind, imm:$cc)>;
+
 // ============================ Overflow ADDs =====================================
 // to register
 def : Pat<(EraVMAdd_v GR256:$rs0, GR256:$rs1),

llvm/lib/Target/EraVM/EraVMPostCodegenPrepare.cpp

Lines changed: 106 additions & 0 deletions
@@ -18,6 +18,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "EraVM.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 
@@ -36,6 +37,21 @@ struct EraVMPostCodegenPrepare : public FunctionPass {
   }
   bool runOnFunction(Function &F) override;
 
+  // This function is an optimization for overflow arithmetic intrinsics.
+  // For every branch that uses the overflow i1 output of such an
+  // intrinsic, it does two things:
+  // 1. Makes sure FBB is adjacent to the branch in layout. This is
+  //    required so that ISEL does not flip the branch condition by
+  //    XORing the result, which it does to create a fallthrough
+  //    optimization opportunity for the MachineBlockPlacement pass.
+  //    In ISEL we match a specific pattern to custom lower the overflow
+  //    handling, so we do not want ISEL doing extra work for us.
+  // 2. Moves TBB out of the way into the cold section. This is needed
+  //    to achieve a good code sequence on the non-overflow path, and is
+  //    done by giving TBB a minimal probability so that the
+  //    MachineBlockPlacement pass rearranges it into the cold section.
+  bool rearrangeOverflowHandlingBranches(Function &F);
+
   StringRef getPassName() const override {
     return ERAVM_POST_CODEGEN_PREPARE_NAME;
   }
@@ -120,6 +136,96 @@ bool EraVMPostCodegenPrepare::runOnFunction(Function &F) {
     if (auto *Cmp = dyn_cast<ICmpInst>(&I))
       Changed |= optimizeICmp(*Cmp);
 
+  Changed |= rearrangeOverflowHandlingBranches(F);
+  return Changed;
+}
+
+static bool isUnsignedArithmeticOverflowInstruction(Instruction &I) {
+  auto *Call = dyn_cast<CallInst>(&I);
+  if (!Call)
+    return false;
+  Intrinsic::ID IntID = Call->getIntrinsicID();
+  if (IntID != Intrinsic::uadd_with_overflow &&
+      IntID != Intrinsic::usub_with_overflow &&
+      IntID != Intrinsic::umul_with_overflow) {
+    return false;
+  }
+  return true;
+}
+
+bool EraVMPostCodegenPrepare::rearrangeOverflowHandlingBranches(Function &F) {
+  bool Changed = false;
+  // Iterate over all basic blocks:
+  auto BBI = F.begin();
+  auto BBE = F.end();
+  while (BBI != BBE) {
+    auto *BB = &*BBI;
+    BBI = std::next(BBI);
+    for (auto &I : *BB) {
+      if (!isUnsignedArithmeticOverflowInstruction(I))
+        continue;
+
+      // We have found an overflow intrinsic; now locate the overflow
+      // branch. We are going to match a structure like this:
+      //   %5 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %4, i32 1)
+      //   %7 = extractvalue { i32, i1 } %5, 1
+      //   br i1 %7, label %8, label %10
+      auto *Call = dyn_cast<CallInst>(&I);
+      for (User *U : Call->users()) {
+        // Check extractvalue: there must be at least one use which is an
+        // extractvalue with index 1.
+        auto *ExtractValue = dyn_cast<ExtractValueInst>(U);
+        if (!ExtractValue ||
+            (!ExtractValue->hasIndices() || ExtractValue->getIndices()[0] != 1))
+          continue;
+
+        // Check that the extracted value is used by a conditional branch
+        // in the same basic block:
+        auto it = llvm::find_if(ExtractValue->users(), [&](User *U) {
+          auto *IteratingBranch = dyn_cast<BranchInst>(U);
+          if (IteratingBranch && IteratingBranch->getParent() == BB &&
+              IteratingBranch->isConditional())
+            return true;
+          return false;
+        });
+        if (it == ExtractValue->user_end())
+          continue;
+
+        // We have found a conditional branch that uses the result of the
+        // extractvalue.
+        auto *Branch = cast<BranchInst>(*it);
+        BasicBlock *TBB = Branch->getSuccessor(0);
+        BasicBlock *FBB = Branch->getSuccessor(1);
+
+        // Now we have the conversion candidate and its TBB and FBB.
+        // Ensure that FBB is next to the current BB in layout. This
+        // creates an opportunity for MachineBlockPlacement to fall
+        // through to FBB, and is necessary for the desired code sequence.
+        FBB->moveAfter(BB);
+
+        // Also give the branch to TBB a very low weight, so that it can
+        // be moved to the cold section: set the first successor's branch
+        // weight to minimal and the second successor's to maximal.
+        LLVMContext &Ctx = TBB->getContext();
+        llvm::MDString *mdName = llvm::MDString::get(Ctx, "branch_weights");
+        llvm::MDTuple *ColdWeights = llvm::MDTuple::get(
+            Ctx, {mdName,
+                  llvm::ConstantAsMetadata::get(
+                      llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1)),
+                  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                      llvm::Type::getInt32Ty(Ctx), UINT32_MAX))});
+        Branch->setMetadata(llvm::LLVMContext::MD_prof, ColdWeights);
+
+        Changed = true;
+      }
+    }
+  }
   return Changed;
 }
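For reference (a minimal sketch derived from the code above, not part of the commit), the constructed metadata is equivalent to annotating the branch as follows in IR, where the first weight belongs to TBB and the second to FBB; 4294967295 is UINT32_MAX:

    br i1 %of, label %overflow, label %normal, !prof !0
    ...
    !0 = !{!"branch_weights", i32 1, i32 4294967295}

The same tuple could presumably also be produced with llvm::MDBuilder::createBranchWeights(1, UINT32_MAX); the pass builds it by hand here.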
